Example #1
 def testAtomicityOfNonEmptyDirectoryRenames(self):
     for _ in range(100):
         parent = self._createTempDir(purpose='parent')
         child = os.path.join(parent, 'child')
         # Use processes (as opposed to threads) to prevent GIL from ordering things artificially
         pool = multiprocessing.Pool(processes=cpu_count())
         try:
             numTasks = cpu_count() * 10
             grandChildIds = pool.map_async(func=partial(
                 _testAtomicityOfNonEmptyDirectoryRenamesTask, parent,
                 child),
                                            iterable=list(range(numTasks)))
             grandChildIds = grandChildIds.get()
         finally:
             pool.close()
             pool.join()
         self.assertEqual(len(grandChildIds), numTasks)
         # Assert that we only had one winner
         grandChildIds = [n for n in grandChildIds if n is not None]
         self.assertEqual(len(grandChildIds), 1)
         # Assert that the winner's grandChild wasn't silently overwritten by a loser
         expectedGrandChildId = grandChildIds[0]
         actualGrandChild = os.path.join(child, 'grandChild')
         actualGrandChildId = os.stat(actualGrandChild).st_ino
         self.assertEqual(actualGrandChildId, expectedGrandChildId)
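
For context, the _testAtomicityOfNonEmptyDirectoryRenamesTask worker referenced above is not part of this excerpt. A minimal sketch of what such a task might look like follows; the body is an assumption: each process stages its own tree and races to rename it into place, and only the winner returns its grandchild's inode.

import os
import tempfile

def _testAtomicityOfNonEmptyDirectoryRenamesTask(parent, child, taskId):
    # Stage a private directory containing a 'grandChild' file.
    staging = tempfile.mkdtemp(dir=parent, prefix='staging-%d-' % taskId)
    grandChild = os.path.join(staging, 'grandChild')
    open(grandChild, 'w').close()
    grandChildId = os.stat(grandChild).st_ino
    try:
        # On POSIX this rename is atomic and fails if 'child' already exists
        # and is non-empty, so at most one process can win the race.
        os.rename(staging, child)
    except OSError:
        return None  # lost the race
    return grandChildId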
Example #2
def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
    # TODO: Only add options for the system the user is specifying?
    parser.add_argument("--batchSystem", dest="batchSystem", default=DEFAULT_BATCH_SYSTEM, choices=BATCH_SYSTEMS,
                        help=f"The type of batch system to run the job(s) with, currently can be one "
                             f"of {', '.join(BATCH_SYSTEMS)}. default={DEFAULT_BATCH_SYSTEM}")
    parser.add_argument("--disableHotDeployment", dest="disableAutoDeployment", action='store_true', default=None,
                        help="Hot-deployment was renamed to auto-deployment.  Option now redirects to "
                             "--disableAutoDeployment.  Left in for backwards compatibility.")
    parser.add_argument("--disableAutoDeployment", dest="disableAutoDeployment", action='store_true', default=None,
                        help="Should auto-deployment of the user script be deactivated? If True, the user "
                             "script/package should be present at the same location on all workers.  Default = False.")
    parser.add_argument("--maxLocalJobs", default=cpu_count(),
                        help=f"For batch systems that support a local queue for housekeeping jobs "
                             f"(Mesos, GridEngine, htcondor, lsf, slurm, torque).  Specifies the maximum "
                             f"number of these housekeeping jobs to run on the local system.  The default "
                             f"(equal to the number of cores) is a maximum of {cpu_count()} concurrent "
                             f"local housekeeping jobs.")
    parser.add_argument("--manualMemArgs", default=False, action='store_true', dest="manualMemArgs",
                        help="Do not add the default arguments: 'hv=MEMORY' & 'h_vmem=MEMORY' to the qsub "
                             "call, and instead rely on TOIL_GRIDGENGINE_ARGS to supply alternative arguments.  "
                             "Requires that TOIL_GRIDGENGINE_ARGS be set.")
    parser.add_argument("--runCwlInternalJobsOnWorkers", dest="runCwlInternalJobsOnWorkers", action='store_true',
                        default=None,
                        help="Whether to run CWL internal jobs (e.g. CWLScatter) on the worker nodes "
                             "instead of the primary node. If false (default), then all such jobs are run on "
                             "the primary node. Setting this to true can speed up the pipeline for very large "
                             "workflows with many sub-workflows and/or scatters, provided that the worker "
                             "pool is large enough.")

    add_parasol_options(parser)
    add_single_machine_options(parser)
    add_mesos_options(parser)
    add_slurm_options(parser)
    add_kubernetes_options(parser)
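
A hedged usage sketch follows. It assumes add_all_batchsystem_options and the BATCH_SYSTEMS/DEFAULT_BATCH_SYSTEM constants are importable from the module shown above, and simply demonstrates that the combined option group attaches to an ordinary ArgumentParser.

from argparse import ArgumentParser

parser = ArgumentParser()
add_all_batchsystem_options(parser)
# Note: --maxLocalJobs sets no type= in the excerpt above, so a command-line
# value arrives as a string while the default stays an int from cpu_count().
opts = parser.parse_args(["--batchSystem", DEFAULT_BATCH_SYSTEM, "--maxLocalJobs", "4"])
print(opts.batchSystem, opts.maxLocalJobs)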
Example #3
    def _applianceCluster(self, mounts=None, numCores=None):
        """
        A context manager for creating and tearing down an appliance cluster.

        :param dict|None mounts: Dictionary mapping host paths to container paths. Both the leader
               and the worker container will be started with one -v argument per dictionary entry,
               as in -v KEY:VALUE.

               Beware that if KEY is a path to a directory, its entire content will be deleted
               when the cluster is torn down.

        :param int numCores: The number of cores to be offered by the Mesos agent process running
               in the worker container.

        :rtype: (ApplianceTestSupport.Appliance, ApplianceTestSupport.Appliance)

        :return: A tuple of the form `(leader, worker)` containing the Appliance instances
                 representing the respective appliance containers
        """
        if numCores is None:
            numCores = cpu_count()
        # The last container to stop (and the first to start) should clean the mounts.
        with self.LeaderThread(self, mounts, cleanMounts=True) as leader:
            with self.WorkerThread(self, mounts, numCores) as worker:
                yield leader, worker
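
Because the method yields a (leader, worker) pair, it is presumably wrapped with contextlib.contextmanager in the enclosing class. A hedged usage sketch from inside a test method might look like this; the mount mapping is made up purely for illustration.

mounts = {self._createTempDir(purpose='shared'): '/data'}
with self._applianceCluster(mounts=mounts, numCores=2) as (leader, worker):
    # Exercise the leader and worker Appliance objects here; on exit both
    # containers are torn down and the mounted host directory is cleaned up.
    pass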
Example #4
def setDefaultOptions(config):
    """
    Set default options for builtin batch systems. This is required if a Config
    object is not constructed from an Options object.
    """
    config.batchSystem = "singleMachine"
    config.disableAutoDeployment = False
    config.environment = {}
    config.statePollingWait = None  # if not set, will default to seconds in getWaitDuration()
    config.maxLocalJobs = cpu_count()
    config.manualMemArgs = False

    # parasol
    config.parasolCommand = 'parasol'
    config.parasolMaxBatches = 10000

    # single machine
    config.scale = 1
    config.linkImports = False
    config.moveExports = False

    # mesos
    config.mesosMasterAddress = '%s:5050' % getPublicIP()

    # Kubernetes
    config.kubernetesHostPath = None
Example #5
File: options.py Project: mr-c/toil
def set_batchsystem_config_defaults(config) -> None:
    """
    Set default options for builtin batch systems. This is required if a Config
    object is not constructed from an Options object.
    """
    config.batchSystem = "single_machine"
    config.disableAutoDeployment = False
    config.environment = {}
    config.statePollingWait = None
    config.maxLocalJobs = cpu_count()
    config.manualMemArgs = False
    config.coalesceStatusCalls = False

    # parasol
    config.parasolCommand = 'parasol'
    config.parasolMaxBatches = 10000

    # single machine
    config.scale = 1
    config.linkImports = False
    config.moveExports = False

    # mesos
    config.mesosMasterAddress = f'{getPublicIP()}:5050'

    # SLURM
    config.allocate_mem = True

    # Kubernetes
    config.kubernetesHostPath = None
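
A hedged usage sketch: since the function only assigns attributes onto whatever object it is given, a bare argparse.Namespace can stand in for Toil's Config when experimenting (the import path for the function itself is an assumption).

from argparse import Namespace

config = Namespace()
set_batchsystem_config_defaults(config)
assert config.batchSystem == "single_machine"
assert config.maxLocalJobs == cpu_count()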
Example #6
    def testLastProcessStanding(self):
        for it in range(10):
            log.info('Iteration %d', it)

            scope = self._createTempDir()
            arena_name = 'thunderdome'
            # Use processes (as opposed to threads) to prevent GIL from ordering things artificially
            pool = multiprocessing.Pool(processes=cpu_count())
            try:
                numTasks = 100
                results = pool.map_async(func=partial(
                    _testLastProcessStandingTask, scope, arena_name),
                                         iterable=list(range(numTasks)))
                results = results.get()
            finally:
                pool.close()
                pool.join()

            self.assertEqual(len(results), numTasks)
            for item in results:
                # Make sure all workers say they succeeded
                self.assertEqual(item, True)
            for filename in os.listdir(scope):
                assert not filename.startswith(
                    'precious'), f"File {filename} still exists"
Example #7
def addOptions(addOptionFn, config):
    addOptionFn(
        "--batchSystem",
        dest="batchSystem",
        default=defaultBatchSystem(),
        choices=uniqueNames(),
        help=
        ("The type of batch system to run the job(s) with, currently can be one "
         "of %s'. default=%s" %
         (', '.join(uniqueNames()), defaultBatchSystem())))
    addOptionFn(
        "--disableHotDeployment",
        dest="disableAutoDeployment",
        action='store_true',
        default=None,
        help=
        ("Hot-deployment was renamed to auto-deployment.  Option now redirects to "
         "--disableAutoDeployment.  Left in for backwards compatibility."))
    addOptionFn(
        "--disableAutoDeployment",
        dest="disableAutoDeployment",
        action='store_true',
        default=None,
        help=
        ("Should auto-deployment of the user script be deactivated? If True, the user "
         "script/package should be present at the same location on all workers. "
         "default=false"))
    localCores = cpu_count()
    addOptionFn("--maxLocalJobs",
                default=localCores,
                help="For batch systems that support a local queue for "
                "housekeeping jobs (Mesos, GridEngine, htcondor, lsf, slurm, "
                "torque), the maximum number of these housekeeping jobs to "
                "run on the local system. "
                "The default (equal to the number of cores) is a maximum of "
                "{} concurrent local housekeeping jobs.".format(localCores))
    addOptionFn(
        "--manualMemArgs",
        default=False,
        action='store_true',
        dest="manualMemArgs",
        help=
        "Do not add the default arguments: 'hv=MEMORY' & 'h_vmem=MEMORY' to "
        "the qsub call, and instead rely on TOIL_GRIDGENGINE_ARGS to supply "
        "alternative arguments.  Requires that TOIL_GRIDGENGINE_ARGS be set.")
    addOptionFn(
        "--runCwlInternalJobsOnWorkers",
        dest="runCwlInternalJobsOnWorkers",
        action='store_true',
        default=None,
        help=
        ("Whether to run CWL internal jobs (e.g. CWLScatter) on the worker nodes "
         "instead of the primary node. If false (default), then all such jobs are run on "
         "the primary node. Setting this to true can speed up the pipeline for very large "
         "workflows with many sub-workflows and/or scatters, provided that the worker "
         "pool is large enough."))

    for o in _options:
        o(addOptionFn, config)
Example #8
    class AbstractBatchSystemJobTest(with_metaclass(ABCMeta, ToilTest)):
        """
        An abstract base class for batch system tests that use a full Toil workflow rather
        than using the batch system directly.
        """

        cpuCount = cpu_count()
        allocatedCores = sorted({1, 2, cpuCount})
        sleepTime = 5

        @abstractmethod
        def getBatchSystemName(self):
            """
            :rtype: (str, AbstractBatchSystem)
            """
            raise NotImplementedError

        def getOptions(self, tempDir):
            """
            Configures options for Toil workflow and makes job store.
            :param str tempDir: path to test directory
            :return: Toil options object
            """
            options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
            options.logLevel = "DEBUG"
            options.batchSystem = self.batchSystemName
            options.workDir = tempDir
            options.maxCores = self.cpuCount
            return options

        def setUp(self):
            self.batchSystemName = self.getBatchSystemName()
            super(hidden.AbstractBatchSystemJobTest, self).setUp()

        def tearDown(self):
            super(hidden.AbstractBatchSystemJobTest, self).tearDown()

        @slow
        def testJobConcurrency(self):
            """
            Tests that the batch system is allocating core resources properly for concurrent tasks.
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                options = self.getOptions(tempDir)

                counterPath = os.path.join(tempDir, 'counter')
                resetCounters(counterPath)
                value, maxValue = getCounters(counterPath)
                assert (value, maxValue) == (0, 0)

                root = Job()
                for _ in range(self.cpuCount):
                    root.addFollowOn(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime,
                                                cores=coresPerJob, memory='1M', disk='1Mi'))
                Job.Runner.startToil(root, options)
                _, maxValue = getCounters(counterPath)
                self.assertEqual(maxValue, old_div(self.cpuCount, coresPerJob))
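
The test leans on resetCounters/getCounters helpers that are not shown in this excerpt. A plausible sketch is below, assuming the counter file stores a "current,max" pair that measureConcurrency updates under a lock.

def resetCounters(counterPath):
    # Start both the current and the maximum observed concurrency at zero.
    with open(counterPath, 'w') as f:
        f.write('0,0')

def getCounters(counterPath):
    with open(counterPath) as f:
        current, maxValue = (int(v) for v in f.read().split(','))
    return current, maxValue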
Example #9
def set_batchsystem_config_defaults(config) -> None:
    """
    Set default and environment-based options for builtin batch systems. This
    is required if a Config object is not constructed from an Options object.
    """

    # Do the global options across batch systems
    config.batchSystem = "single_machine"
    config.disableAutoDeployment = False
    config.maxLocalJobs = cpu_count()
    config.manualMemArgs = False
    config.coalesceStatusCalls = False
    config.statePollingWait: Optional[Union[
        float,
        int]] = None  # Number of seconds to wait before querying job state

    OptionType = TypeVar('OptionType')

    def set_option(option_name: str,
                   parsing_function: Optional[Callable[[Any],
                                                       OptionType]] = None,
                   check_function: Optional[Callable[[OptionType],
                                                     None]] = None,
                   default: Optional[OptionType] = None,
                   env: Optional[List[str]] = None,
                   old_names: Optional[List[str]] = None) -> None:
        """
        Function to set a batch-system-defined option to its default value, or
        one from the environment.
        """

        # TODO: deduplicate with Config

        option_value = default

        if env is not None:
            for env_var in env:
                # Try all the environment variables
                if option_value != default:
                    break
                option_value = os.environ.get(env_var, default)

        if option_value is not None or not hasattr(config, option_name):
            if parsing_function is not None:
                # Parse whatever it is (string, argparse-made list, etc.)
                option_value = parsing_function(option_value)
            if check_function is not None:
                try:
                    check_function(option_value)
                except AssertionError:
                    raise RuntimeError(
                        f"The {option_name} option has an invalid value: {option_value}"
                    )
            setattr(config, option_name, option_value)

    # Set up defaults from all the batch systems
    set_batchsystem_options(None, set_option)
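
A hypothetical illustration of how one batch system's hook, reached through set_batchsystem_options above, might call the set_option callback it receives. The option name and environment variable here are made-up examples, not real Toil settings.

def add_example_batchsystem_defaults(set_option):
    def positive(value):
        assert value > 0, "example_max_jobs must be positive"

    # Fall back to 16 unless EXAMPLE_MAX_JOBS is set in the environment,
    # parse it as an int, and reject non-positive values.
    set_option("example_max_jobs",
               parsing_function=int,
               check_function=positive,
               default=16,
               env=["EXAMPLE_MAX_JOBS"])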
Example #10
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "cpu",
                                 typeFn=int,
                                 default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter",
                                           float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, fa_path_fa_id in fa_id_map.items():
        fa_path = fa_path_fa_id[0]
        fa_id = fa_path_fa_id[1]
        minigraph_map_job = top_job.addChildJobFn(
            minigraph_map_one,
            config,
            event,
            fa_path,
            fa_id,
            gfa_id,
            keep_gaf or not paf_per_genome,
            paf_per_genome,
            # todo: estimate RAM
            cores=mg_cores,
            disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config,
                                           gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
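
The function returns Toil promises rather than concrete values. A hedged sketch of a downstream job function that consumes them is shown below; the function name and wiring are illustrative, not part of the source.

def report_mapping(job, paf_id, gaf_id_map):
    # By the time this job runs, Toil has resolved the promises returned by
    # minigraph_map_all into real file-store IDs (gaf_id_map may be None).
    job.fileStore.logToMaster('PAF file ID: %s' % paf_id)
    if gaf_id_map:
        job.fileStore.logToMaster('Compressed GAFs: %s' % gaf_id_map)

Such a job would typically be chained with addFollowOnJobFn, passing the two returned values straight through as arguments.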
Example #11
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
Example #12
    def _startMesos(self, numCores=None):
        if numCores is None:
            numCores = cpu_count()
        shutil.rmtree('/tmp/mesos', ignore_errors=True)
        self.master = self.MesosMasterThread(numCores)
        self.master.start()
        self.agent = self.MesosAgentThread(numCores)
        self.agent.start()

        # Bad Things will happen if the master is not yet ready when Toil tries to use it.
        self.wait_for_master()

        log.info('Mesos is ready! Running test.')
Example #13
 def __init__(self, fastaID, minLength, dnabrnnOpts):
     memory = 4 * 1024 * 1024 * 1024
     disk = 2 * (fastaID.size)
     # todo: clean up
     cores = cpu_count()
     RoundedJob.__init__(self,
                         memory=memory,
                         disk=disk,
                         cores=cores,
                         preemptable=True)
     self.fastaID = fastaID
     self.minLength = minLength
     self.dnabrnnOpts = dnabrnnOpts
Example #14
 def __init__(self, blastOptions, seqFileID):
     disk = 3 * seqFileID.size
     memory = 3 * seqFileID.size
     if blastOptions.gpuLastz:
         # gpu jobs get the whole node
         cores = cpu_count()
     else:
         cores = None
     super(RunSelfBlast, self).__init__(memory=memory,
                                        disk=disk,
                                        cores=cores,
                                        preemptable=True)
     self.blastOptions = blastOptions
     self.seqFileID = seqFileID
Example #15
 def _startParasol(self, numCores=None, memory=None):
     if numCores is None:
         numCores = cpu_count()
     if memory is None:
         memory = physicalMemory()
     self.numCores = numCores
     self.memory = memory
     self.leader = self.ParasolLeaderThread()
     self.leader.start()
     self.worker = self.ParasolWorkerThread()
     self.worker.start()
     while self.leader.popen is None or self.worker.popen is None:
         log.info('Waiting for leader and worker processes')
         time.sleep(.1)
Example #16
 def __init__(self, repeatMaskOptions, queryID, targetIDs):
     targetsSize = sum(targetID.size for targetID in targetIDs)
     memory = 4 * 1024 * 1024 * 1024
     disk = 2 * (queryID.size + targetsSize)
     if repeatMaskOptions.gpuLastz:
         # gpu jobs get the whole node (same hack as used in blast phase)
         cores = cpu_count()
     else:
         cores = None
     RoundedJob.__init__(self,
                         memory=memory,
                         disk=disk,
                         cores=cores,
                         preemptable=True)
     self.repeatMaskOptions = repeatMaskOptions
     self.queryID = queryID
     self.targetIDs = targetIDs
Example #17
 def _startMesos(self, numCores=None):
     if numCores is None:
         numCores = cpu_count()
     shutil.rmtree('/tmp/mesos', ignore_errors=True)
     self.master = self.MesosMasterThread(numCores)
     self.master.start()
     self.agent = self.MesosAgentThread(numCores)
     self.agent.start()
     
     # Wait for the master to come up.
     # Bad Things will happen if the master is not yet ready when Toil tries to use it.
     for attempt in retry(predicate=lambda e: True):
         with attempt:
             log.info('Checking if Mesos is ready...')
             with closing(urlopen('http://127.0.0.1:5050/version')) as content:
                 content.read()
     
     log.info('Mesos is ready! Running test.')
Example #18
    def testNewJobsCanHandleOtherJobDeaths(self):
        """
        Create 2 non-local files and then create 2 jobs. The first job registers a deferred job
        to delete the second non-local file, deletes the first non-local file and then kills
        itself.  The second job waits for the first file to be deleted, then sleeps for a few
        seconds and then spawns a child. the child of the second does nothing. However starting
        it should handle the untimely demise of the first job and run the registered deferred
        function that deletes the first file.  We assert the absence of the two files at the
        end of the run.
        """

        # Check to make sure we can run two jobs in parallel
        cpus = cpu_count()
        assert cpus >= 2, "Not enough CPUs to run two tasks at once"

        # There can be no retries
        self.options.retryCount = 0
        workdir = self._createTempDir(purpose='nonLocalDir')
        nonLocalFile1 = os.path.join(workdir, str(uuid4()))
        nonLocalFile2 = os.path.join(workdir, str(uuid4()))
        open(nonLocalFile1, 'w').close()
        open(nonLocalFile2, 'w').close()
        assert os.path.exists(nonLocalFile1)
        assert os.path.exists(nonLocalFile2)
        files = [nonLocalFile1, nonLocalFile2]
        root = Job()
        # A and B here must run in parallel for this to work
        A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A,
                          files=files,
                          cores=1)
        B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B,
                          files=files,
                          cores=1)
        C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C,
                          files=files,
                          expectedResult=False,
                          cores=1)
        root.addChild(A)
        root.addChild(B)
        B.addChild(C)
        try:
            Job.Runner.startToil(root, self.options)
        except FailedJobsException:
            pass
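
The A/B/C job functions are defined elsewhere. Purely to illustrate the scenario the docstring describes, a sketch of job A might look like the following, using Toil's Job.defer for the deferred deletion; the exact bodies used by the real test are assumptions here.

import os
import signal

def _testNewJobsCanHandleOtherJobDeaths_A(job, files):
    # Register deferred cleanup of the second file, remove the first file,
    # then die abruptly so another job must run the deferred function.
    job.defer(os.remove, files[1])
    os.remove(files[0])
    os.kill(os.getpid(), signal.SIGKILL)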
Example #19
 def __init__(self, blastOptions, seqFileID1, seqFileID2):
     if hasattr(seqFileID1, "size") and hasattr(seqFileID2, "size"):
         disk = 2 * (seqFileID1.size + seqFileID2.size)
         memory = 2 * (seqFileID1.size + seqFileID2.size)
     else:
         disk = None
         memory = None
     if blastOptions.gpuLastz:
         # gpu jobs get the whole node
         cores = cpu_count()
     else:
         cores = None
     super(RunBlast, self).__init__(memory=memory,
                                    disk=disk,
                                    cores=cores,
                                    preemptable=True)
     self.blastOptions = blastOptions
     self.seqFileID1 = seqFileID1
     self.seqFileID2 = seqFileID2
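
The gpuLastz branch above recurs in several of these job classes (see also RunSelfBlast and the repeat-masking job). A small hedged helper capturing the pattern could look like this; the helper name is an assumption and is not part of Cactus, and the cpu_count import path is one plausible source matching the calls above.

from toil.lib.threading import cpu_count  # assumed import

def whole_node_cores_if(use_gpu):
    """Reserve every local core for GPU lastz jobs; otherwise let the caller's default apply."""
    return cpu_count() if use_gpu else None

With such a helper, the repeated branches would collapse to cores = whole_node_cores_if(blastOptions.gpuLastz).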
Example #20
 def __init__(self,
              fastaID,
              dnabrnnOpts,
              cpu,
              minLength=None,
              mergeLength=None,
              action=None):
     memory = 4 * 1024 * 1024 * 1024
     disk = 2 * (fastaID.size)
     cores = min(cpu_count(), cpu)
     RoundedJob.__init__(self,
                         memory=memory,
                         disk=disk,
                         cores=cores,
                         preemptable=True)
     self.fastaID = fastaID
     self.minLength = minLength
     self.mergeLength = mergeLength
     self.action = action
     self.dnabrnnOpts = dnabrnnOpts
Example #21
 def _sendFrameworkMessage(self, driver):
     message = None
     while True:
         # The psutil documentation recommends that we ignore the value returned by the first
         # invocation of cpu_percent(). However, we do want to send a sign of life early after
         # starting (e.g. to unblock the provisioner waiting for an instance to come up) so
         # we call it once and discard the value.
         if message is None:
             message = Expando(address=self.address)
             psutil.cpu_percent()
         else:
             message.nodeInfo = dict(coresUsed=float(psutil.cpu_percent()) * .01,
                                     memoryUsed=float(psutil.virtual_memory().percent) * .01,
                                     coresTotal=cpu_count(),
                                     memoryTotal=psutil.virtual_memory().total,
                                     workers=len(self.runningTasks))
         log.debug("Send framework message: %s", message)
         driver.sendFrameworkMessage(encode_data(repr(message).encode('utf-8')))
         # Prevent workers launched together from repeatedly hitting the leader at the same time
         time.sleep(random.randint(45, 75))
Example #22
 def __init__(self,
              fastaID,
              dnabrnnOpts,
              cpu,
              minLength=None,
              action=None,
              inputBedID=None,
              eventName=None):
     memory = 4 * 1024 * 1024 * 1024
     disk = 2 * (fastaID.size)
     cores = min(cpu_count(), cpu)
     RoundedJob.__init__(self,
                         memory=memory,
                         disk=disk,
                         cores=cores,
                         preemptable=True)
     self.fastaID = fastaID
     self.minLength = minLength
     self.action = action
     self.dnabrnnOpts = dnabrnnOpts
     self.inputBedID = inputBedID  #todo: moved to fileMasking --> remove from here
     self.eventName = eventName
Example #23
    def testGlobalMutexOrdering(self):
        for it in range(10):
            log.info('Iteration %d', it)

            scope = self._createTempDir()
            mutex = 'mutex'
            # Use processes (as opposed to threads) to prevent GIL from ordering things artificially
            pool = multiprocessing.Pool(processes=cpu_count())
            try:
                numTasks = 100
                results = pool.map_async(func=partial(
                    _testGlobalMutexOrderingTask, scope, mutex),
                                         iterable=list(range(numTasks)))
                results = results.get()
            finally:
                pool.close()
                pool.join()

            self.assertEqual(len(results), numTasks)
            for item in results:
                # Make sure all workers say they succeeded
                self.assertEqual(item, True)
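
A minimal sketch of the per-process task the pool maps over, assuming Toil's global_mutex context manager from toil.lib.threading; the real task also records entry and exit order to prove mutual exclusion, which is omitted here, and the exact signature should be treated as an assumption.

from toil.lib.threading import global_mutex  # assumed import

def _testGlobalMutexOrderingTask(scope, mutex, taskId):
    with global_mutex(scope, mutex):
        # Critical section: only one process at a time should be in here.
        pass
    return True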
Example #24
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="+",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Override some CAF settings whose defaults are not suited to star trees"
    )
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #25
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="*",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outHal",
                        type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument(
        "--usePafSecondaries",
        action="store_true",
        help=
        "use the secondary alignments from the PAF input.  They are ignored by default."
    )
    parser.add_argument("--singleCopySpecies",
                        type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument(
        "--barMaskFilter",
        type=int,
        default=None,
        help=
        "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)"
    )
    parser.add_argument(
        "--outVG",
        action="store_true",
        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument(
        "--outGFA",
        action="store_true",
        help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument(
        "--batch",
        action="store_true",
        help=
        "Launch batch of alignments.  Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit"
    )
    parser.add_argument(
        "--stagger",
        type=int,
        help=
        "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)",
        default=0)
    parser.add_argument(
        "--acyclic",
        type=str,
        help=
        "Ensure that given genome is cyclic by deleting all paralogy edges in postprocessing"
    )

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError(
                "S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(
            options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else
                 os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore),
                                  options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-graphmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if options.eventNameAsID is not None:
        options.eventNameAsID = False if not bool(
            options.eventNameAsID) or options.eventNameAsID == '0' else True
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(
        int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(
                Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][
                    1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(
                        vgID,
                        makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(
                        gfaID,
                        makeURL(
                            os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #26
def add_all_batchsystem_options(
        parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
    # Do the global cross-batch-system arguments
    parser.add_argument(
        "--batchSystem",
        dest="batchSystem",
        default=DEFAULT_BATCH_SYSTEM,
        choices=BATCH_SYSTEMS,
        help=
        f"The type of batch system to run the job(s) with, currently can be one "
        f"of {', '.join(BATCH_SYSTEMS)}. default={DEFAULT_BATCH_SYSTEM}",
    )
    parser.add_argument(
        "--disableHotDeployment",
        dest="disableAutoDeployment",
        action="store_true",
        default=None,
        help=
        "Hot-deployment was renamed to auto-deployment.  Option now redirects to "
        "--disableAutoDeployment.  Left in for backwards compatibility.",
    )
    parser.add_argument(
        "--disableAutoDeployment",
        dest="disableAutoDeployment",
        action="store_true",
        default=None,
        help=
        "Should auto-deployment of the user script be deactivated? If True, the user "
        "script/package should be present at the same location on all workers.  Default = False.",
    )
    parser.add_argument(
        "--maxLocalJobs",
        default=cpu_count(),
        help=
        f"For batch systems that support a local queue for housekeeping jobs "
        f"(Mesos, GridEngine, htcondor, lsf, slurm, torque).  Specifies the maximum "
        f"number of these housekeeping jobs to run on the local system.  The default "
        f"(equal to the number of cores) is a maximum of {cpu_count()} concurrent "
        f"local housekeeping jobs.",
    )
    parser.add_argument(
        "--manualMemArgs",
        default=False,
        action="store_true",
        dest="manualMemArgs",
        help=
        "Do not add the default arguments: 'hv=MEMORY' & 'h_vmem=MEMORY' to the qsub "
        "call, and instead rely on TOIL_GRIDGENGINE_ARGS to supply alternative arguments.  "
        "Requires that TOIL_GRIDGENGINE_ARGS be set.",
    )
    parser.add_argument(
        "--runCwlInternalJobsOnWorkers",
        dest="runCwlInternalJobsOnWorkers",
        action="store_true",
        default=None,
        help=
        "Whether to run CWL internal jobs (e.g. CWLScatter) on the worker nodes "
        "instead of the primary node. If false (default), then all such jobs are run on "
        "the primary node. Setting this to true can speed up the pipeline for very large "
        "workflows with many sub-workflows and/or scatters, provided that the worker "
        "pool is large enough.",
    )
    parser.add_argument(
        "--coalesceStatusCalls",
        dest="coalesceStatusCalls",
        action="store_true",
        default=None,
        help=
        ("Coalese status calls to prevent the batch system from being overloaded. "
         "Currently only supported for LSF. "
         "default=false"),
    )
    parser.add_argument(
        "--statePollingWait",
        dest="statePollingWait",
        type=int,
        default=None,
        help=
        "Time, in seconds, to wait before doing a scheduler query for job state.  "
        "Return cached results if within the waiting period. Only works for grid "
        "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
    )

    for factory in BATCH_SYSTEM_FACTORY_REGISTRY.values():
        # All the batch systems are responsible for adding their own options
        # with the add_options class method.
        try:
            batch_system_type = factory()
        except ImportError:
            # Skip anything we can't import
            continue
        # Ask the batch system to create its options in the parser
        logger.debug('Add options for %s', batch_system_type)
        batch_system_type.add_options(parser)
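
For reference, the add_options class method that the loop above calls on each batch system could be as simple as the hedged sketch below; the class and flag are illustrative only, not a real Toil batch system.

class ExampleBatchSystem:
    @classmethod
    def add_options(cls, parser):
        # Each batch system contributes only the flags it understands.
        parser.add_argument("--exampleQueue", dest="exampleQueue", default=None,
                            help="Queue to submit example jobs to (illustrative only).")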
Example #27
class SingleMachineBatchSystem(BatchSystemSupport):
    """
    The interface for running jobs on a single machine, runs all the jobs you
    give it as they come in, but in parallel.

    Uses a single "daddy" thread to manage a fleet of child processes.

    Communication with the daddy thread happens via two queues: one queue of
    jobs waiting to be run (the input queue), and one queue of jobs that are
    finished/stopped and need to be returned by getUpdatedBatchJob (the output
    queue).

    When the batch system is shut down, the daddy thread is stopped.

    If running in debug-worker mode, jobs are run immediately as they are sent
    to the batch system, in the sending thread, and the daddy thread is not
    run. But the queues are still used.
    """
    @classmethod
    def supportsAutoDeployment(cls):
        return False

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    numCores = cpu_count()

    minCores = 0.1
    """
    The minimal fractional CPU. Tasks with a smaller core requirement will be rounded up to this
    value.
    """
    physicalMemory = toil.physicalMemory()

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        self.config = config
        # Limit to the smaller of the user-imposed limit and what we actually
        # have on this machine for each resource.
        #
        # If we don't have up to the limit of the resource (and the resource
        # isn't the unlimited sentinel), warn.
        if maxCores > self.numCores:
            if maxCores != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough cores! User limited to %i but we only have %i.',
                    maxCores, self.numCores)
            maxCores = self.numCores
        if maxMemory > self.physicalMemory:
            if maxMemory != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough memory! User limited to %i bytes but we only have %i bytes.',
                    maxMemory, self.physicalMemory)
            maxMemory = self.physicalMemory

        workdir = Toil.getLocalWorkflowDir(
            config.workflowID, config.workDir
        )  # config.workDir may be None; this sets a real directory
        self.physicalDisk = toil.physicalDisk(workdir)
        if maxDisk > self.physicalDisk:
            if maxDisk != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough disk space! User limited to %i bytes but we only have %i bytes.',
                    maxDisk, self.physicalDisk)
            maxDisk = self.physicalDisk

        super(SingleMachineBatchSystem, self).__init__(config, maxCores,
                                                       maxMemory, maxDisk)
        assert self.maxCores >= self.minCores
        assert self.maxMemory >= 1

        # The scale allows the user to apply a factor to each task's cores requirement, thereby
        # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
        # (scale > 1).
        self.scale = config.scale

        if config.badWorker > 0 and config.debugWorker:
            # We can't throw SIGUSR1 at the worker because it is also going to
            # be the leader and/or test harness.
            raise RuntimeError(
                "Cannot use badWorker and debugWorker together; "
                "worker would have to kill the leader")

        self.debugWorker = config.debugWorker

        # A counter to generate job IDs and a lock to guard it
        self.jobIndex = 0
        self.jobIndexLock = Lock()

        # A dictionary mapping IDs of submitted jobs to the command line
        self.jobs: Dict[str, toil.job.JobDescription] = {}

        # A queue of jobs waiting to be executed. Consumed by the daddy thread.
        self.inputQueue = Queue()

        # A queue of finished jobs. Produced by the daddy thread.
        self.outputQueue = Queue()

        # A dictionary mapping IDs of currently running jobs to their Info objects
        self.runningJobs: Dict[str, Info] = {}

        # These next two are only used outside debug-worker mode

        # A dict mapping PIDs to Popen objects for running jobs.
        # Jobs that don't fork are executed one at a time in the main thread.
        self.children: Dict[int, subprocess.Popen] = {}
        # A dict mapping child PIDs to the Job IDs they are supposed to be running.
        self.childToJob: Dict[int, str] = {}

        # A pool representing available CPU in units of minCores
        self.coreFractions = ResourcePool(int(self.maxCores / self.minCores),
                                          'cores')
        # A pool representing available memory in bytes
        self.memory = ResourcePool(self.maxMemory, 'memory')
        # A pool representing the available space in bytes
        self.disk = ResourcePool(self.maxDisk, 'disk')

        # If we can't schedule something, we fill this in with a reason why
        self.schedulingStatusMessage = None

        # We use this event to signal shutdown
        self.shuttingDown = Event()

        # A thread in charge of managing all our child processes.
        # Also takes care of resource accounting.
        self.daddyThread = None
        # If it breaks it will fill this in
        self.daddyException: Optional[Exception] = None

        if self.debugWorker:
            log.debug('Started batch system %s in worker debug mode.',
                      id(self))
        else:
            self.daddyThread = Thread(target=self.daddy, daemon=True)
            self.daddyThread.start()
            log.debug('Started batch system %s in normal mode.', id(self))

    def daddy(self):
        """
        Be the "daddy" thread.

        Our job is to look at jobs from the input queue.

        If a job fits in the available resources, we allocate resources for it
        and kick off a child process.

        We also check on our children.

        When a child finishes, we reap it, release its resources, and put its
        information in the output queue.
        """

        try:
            log.debug('Started daddy thread for batch system %s.', id(self))

            while not self.shuttingDown.is_set():
                # Main loop

                while not self.shuttingDown.is_set():
                    # Try to start as many jobs as we can try to start
                    try:
                        # Grab something from the input queue if available.
                        args = self.inputQueue.get_nowait()
                        jobCommand, jobID, jobCores, jobMemory, jobDisk, environment = args

                        coreFractions = int(jobCores / self.minCores)

                        # Try to start the child
                        result = self._startChild(jobCommand, jobID,
                                                  coreFractions, jobMemory,
                                                  jobDisk, environment)

                        if result is None:
                            # We did not get the resources to run this job.
                            # Requeue last, so we can look at the next job.
                            # TODO: Have some kind of condition the job can wait on,
                            # but without threads (queues for jobs needing
                            # cores/memory/disk individually)?
                            self.inputQueue.put(args)
                            break

                        # Otherwise it's a PID if it succeeded, or False if it couldn't
                        # start. But we don't care either way here.

                    except Empty:
                        # Nothing to run. Stop looking in the queue.
                        break

                # Now check on our children.
                for done_pid in self._pollForDoneChildrenIn(self.children):
                    # A child has actually finished.
                    # Clean up after it.
                    self._handleChild(done_pid)

                # Then loop again: start and collect more jobs.
                # TODO: It would be good to be able to wait on a new job or a finished child, whichever comes first.
                # For now we just sleep and loop.
                time.sleep(0.01)

            # When we get here, we are shutting down.
            log.debug(
                'Daddy thread cleaning up %d remaining children for batch system %s...',
                len(self.children), id(self))

            self._stop_and_wait(self.children.values())

            log.debug(
                'Daddy thread for batch system %s finishing because no children should now exist',
                id(self))

            # Then exit the thread.
            return
        except Exception as e:
            log.critical(
                'Unhandled exception in daddy thread for batch system %s: %s',
                id(self), traceback.format_exc())
            # Pass the exception back to the main thread so it can stop the next person who calls into us.
            self.daddyException = e
            raise
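
The docstring above is the whole algorithm: drain the input queue, start whatever fits, requeue the first job that does not fit, reap any finished children, then sleep and repeat. A minimal standalone sketch of that loop shape (outside Toil, with hypothetical try_to_start and poll_done_children callbacks standing in for _startChild and _pollForDoneChildrenIn):

import time
from queue import Empty, Queue
from threading import Event

def scheduler_loop(input_queue: Queue, try_to_start, poll_done_children, shutting_down: Event):
    # try_to_start(job) returns None when resources are unavailable (hypothetical stand-in for _startChild)
    # poll_done_children() yields finished child handles (hypothetical stand-in for _pollForDoneChildrenIn)
    while not shutting_down.is_set():
        while not shutting_down.is_set():
            try:
                job = input_queue.get_nowait()
            except Empty:
                break  # nothing queued right now; go check on the children
            if try_to_start(job) is None:
                input_queue.put(job)  # could not fit it; requeue and stop trying for now
                break
        for child in poll_done_children():
            pass  # reap the child, release its resources, report its result
        time.sleep(0.01)  # there is no single primitive to wait on "new job or finished child"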

    def _checkOnDaddy(self):
        if self.daddyException is not None:
            # The daddy thread broke and we cannot do our job
            log.critical(
                'Propagating unhandled exception in daddy thread to main thread'
            )
            exc = self.daddyException
            self.daddyException = None
            if isinstance(exc, Exception):
                raise exc
            else:
                raise TypeError(
                    f'Daddy thread failed with non-exception: {exc}')

    def _stop_now(self, popens: Sequence[subprocess.Popen]) -> List[int]:
        """
        Stop the given child processes and all their children. Does not reap them.

        Returns a list of PGIDs killed, where processes may exist that have not
        yet received their kill signals.
        """

        # We will potentially need to poll these PGIDs to ensure that all
        # processes in them are gone.
        pgids = []

        for popen in popens:
            # Kill all the children

            if popen.returncode is None:
                # Process is not known to be dead. Try and grab its group.
                try:
                    pgid = os.getpgid(popen.pid)
                except OSError:
                    # It just died. Assume the pgid was its PID.
                    pgid = popen.pid
            else:
                # It is dead. Try its PID as a PGID and hope we didn't re-use it.
                pgid = popen.pid

            if pgid != os.getpgrp():
                # The child process really is in its own group, and not ours.

                # Kill the group, which hopefully hasn't been reused
                log.debug(
                    'Send shutdown kill to process group %s known to batch system %s',
                    pgid, id(self))
                try:
                    os.killpg(pgid, signal.SIGKILL)
                    pgids.append(pgid)
                except ProcessLookupError:
                    # It is dead already
                    pass
                except PermissionError:
                    # It isn't ours actually. Ours is dead.
                    pass
            else:
                # Kill the subprocess again through popen in case it somehow
                # never managed to make the group.
                popen.kill()

        return pgids

    def _stop_and_wait(self, popens: Sequence[subprocess.Popen]) -> None:
        """
        Stop the given child processes and all their children. Blocks until the
        processes are gone.
        """

        pgids = self._stop_now(popens)

        for popen in popens:
            # Wait on all the children
            popen.wait()

            log.debug(
                'Process %s known to batch system %s is stopped; it returned %s',
                popen.pid, id(self), popen.returncode)

        for pgid in pgids:
            try:
                while True:
                    # Send a kill to the group again, to see if anything in it
                    # is still alive. Our first kill might not have been
                    # delivered yet.
                    os.killpg(pgid, signal.SIGKILL)
                    # If that worked it is still alive, so wait for the kernel
                    # to stop fooling around and kill it.
                    log.warning(
                        'Sent redundant shutdown kill to surviving process group %s known to batch system %s',
                        pgid, id(self))
                    time.sleep(0.1)
            except ProcessLookupError:
                # The group is actually gone now.
                pass
            except PermissionError:
                # The group is not only gone but reused
                pass
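
Both _stop_now and _stop_and_wait lean on each job having been launched in its own session, so its PID doubles as its process-group ID and a single killpg reaches the job and all of its children. A self-contained illustration of that launch-and-group-kill pattern (plain standard-library POSIX code, not Toil's):

import os
import signal
import subprocess

# Start a shell (and whatever it spawns) in a new session, so pid == pgid.
child = subprocess.Popen("sleep 300 & sleep 300 & wait", shell=True, start_new_session=True)

pgid = os.getpgid(child.pid)  # equals child.pid because of start_new_session=True
try:
    os.killpg(pgid, signal.SIGKILL)  # signal the shell and both sleeps at once
except ProcessLookupError:
    pass  # the whole group already exited
child.wait()  # reap the direct child so it does not linger as a zombie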

    def _pollForDoneChildrenIn(self, pid_to_popen):
        """
        See if any children represented in the given dict from PID to Popen
        object have finished.

        Return a collection of their PIDs.

        Guarantees that each child's exit code will be gettable via wait() on
        the child's Popen object (i.e. does not reap the child, unless via
        Popen).
        """

        # We keep our found PIDs in a set so we can work around waitid showing
        # us the same one repeatedly.
        ready = set()

        # Find the waitid function
        waitid = getattr(os, 'waitid', None)

        if callable(waitid):
            # waitid exists (not Mac)

            while True:
                # Poll for any child to have exit, but don't reap it. Leave reaping
                # to the Popen.
                # TODO: What if someone else in Toil wants to do this syscall?
                # TODO: Is this one-notification-per-done-child with WNOHANG? Or
                # can we miss some? Or do we see the same one repeatedly until it
                # is reaped?
                try:
                    siginfo = waitid(os.P_ALL, -1,
                                     os.WEXITED | os.WNOWAIT | os.WNOHANG)
                except ChildProcessError:
                    # This happens when there is nothing to wait on right now,
                    # instead of the weird C behavior of overwriting a field in
                    # a pointed-to struct.
                    siginfo = None
                if siginfo is not None and siginfo.si_pid in pid_to_popen and siginfo.si_pid not in ready:
                    # Something new finished
                    ready.add(siginfo.si_pid)
                else:
                    # Nothing we own that we haven't seen before has finished.
                    return ready
        else:
            # On Mac there's no waitid and no way to wait and not reap.
            # Fall back on polling all the Popen objects.
            # To make this vaguely efficient we have to return done children in
            # batches.
            for pid, popen in pid_to_popen.items():
                if popen.poll() is not None:
                    # Process is done
                    ready.add(pid)
                    log.debug('Child %d has stopped', pid)

            # Return all the done processes we found
            return ready
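
The waitid branch is what lets this thread notice a finished child without reaping it: WEXITED | WNOWAIT | WNOHANG peeks at the exit notification but leaves the exit status for Popen.wait() to collect. A standalone sketch of that polling idiom, assuming a platform where os.waitid exists (Linux; it is missing on macOS):

import os

def poll_done_children(pid_to_popen):
    """Return the PIDs of our children that have exited, without reaping them."""
    ready = set()
    while True:
        try:
            # WNOWAIT leaves the child reapable later; WNOHANG means never block.
            siginfo = os.waitid(os.P_ALL, -1, os.WEXITED | os.WNOWAIT | os.WNOHANG)
        except ChildProcessError:
            break  # no child processes exist at all
        if siginfo is None:
            break  # children exist, but none of them have exited
        if siginfo.si_pid in pid_to_popen and siginfo.si_pid not in ready:
            ready.add(siginfo.si_pid)
        else:
            break  # nothing new that belongs to us
    return ready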

    def _runDebugJob(self, jobCommand, jobID, environment):
        """
        Run the jobCommand right now, in the current thread.
        May only be called in debug-worker mode.
        Assumes resources are available.
        """
        assert self.debugWorker
        # TODO: It is not possible to kill running jobs in forkless mode,
        # because they are run immediately in the main thread.
        info = Info(time.time(), None, None, killIntended=False)
        self.runningJobs[jobID] = info

        if jobCommand.startswith("_toil_worker "):
            # We can actually run in this thread
            jobName, jobStoreLocator, jobStoreID = jobCommand.split()[
                1:4]  # Parse command
            jobStore = Toil.resumeJobStore(jobStoreLocator)
            toil_worker.workerScript(
                jobStore,
                jobStore.config,
                jobName,
                jobStoreID,
                redirectOutputToLogFile=not self.debugWorker
            )  # Call the worker
        else:
            # Run synchronously. If starting or running the command fails, let the exception stop us.
            subprocess.check_call(jobCommand,
                                  shell=True,
                                  env=dict(os.environ, **environment))

        self.runningJobs.pop(jobID)
        if not info.killIntended:
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=0,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))
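
Both this debug path and the normal child-launching path build the job's environment the same way: a copy of os.environ with the per-job variables layered on top. A tiny standalone illustration of that merge and synchronous run (the variable name here is made up):

import os
import subprocess

job_environment = {"TOIL_EXAMPLE_VAR": "hello"}   # hypothetical per-job overrides
merged = dict(os.environ, **job_environment)      # per-job values win over inherited ones

# Run synchronously; a non-zero exit status raises CalledProcessError.
subprocess.check_call("echo $TOIL_EXAMPLE_VAR", shell=True, env=merged)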

    def getSchedulingStatusMessage(self):
        # Implement the abstractBatchSystem's scheduling status message API
        return self.schedulingStatusMessage

    def _setSchedulingStatusMessage(self, message):
        """
        If we can't run a job, we record a short message about why not. If the
        leader wants to know what is up with us (for example, to diagnose a
        deadlock), it can ask us for the message.
        """

        self.schedulingStatusMessage = message

    def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk,
                    environment):
        """
        Start a child process for the given job.

        Allocate its required resources and save it in our bookkeeping structures.

        If the job is started, returns its PID.
        If the job fails to start, reports it as failed and returns False.
        If the job cannot get the resources it needs to start, returns None.
        """

        # We fill this in if we manage to actually start the child.
        popen = None

        # This is when we started working on the job.
        startTime = time.time()

        # See if we can fit the job in our resource pools right now.
        if self.coreFractions.acquireNow(coreFractions):
            # We got some cores
            if self.memory.acquireNow(jobMemory):
                # We got some memory
                if self.disk.acquireNow(jobDisk):
                    # We got the final resource, disk.
                    # Actually run the job.
                    # When it finishes we will release what it was using.
                    # So it is important to not lose track of the child process.

                    try:
                        # Launch the job.
                        # Make sure it is in its own session (and thus its own
                        # process group) so that, if the user signals the
                        # workflow, Toil will be responsible for killing the
                        # job. This also makes sure that we can signal the job
                        # and all its children together. We assume that the
                        # process group ID will equal the PID of the process we
                        # are starting.
                        popen = subprocess.Popen(jobCommand,
                                                 shell=True,
                                                 env=dict(
                                                     os.environ,
                                                     **environment),
                                                 start_new_session=True)
                    except Exception:
                        # If the job can't start, make sure we release resources now
                        self.coreFractions.release(coreFractions)
                        self.memory.release(jobMemory)
                        self.disk.release(jobDisk)

                        log.error('Could not start job %s: %s', jobID,
                                  traceback.format_exc())

                        # Report as failed.
                        self.outputQueue.put(
                            UpdatedBatchJobInfo(
                                jobID=jobID,
                                exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
                                wallTime=0,
                                exitReason=None))

                        # Complain it broke.
                        return False
                    else:
                        # If the job did start, record it
                        self.children[popen.pid] = popen
                        # Make sure we can look it up by PID later
                        self.childToJob[popen.pid] = jobID
                        # Record that the job is running, and the resources it is using
                        info = Info(startTime,
                                    popen, (coreFractions, jobMemory, jobDisk),
                                    killIntended=False)
                        self.runningJobs[jobID] = info

                        log.debug('Launched job %s as child %d', jobID,
                                  popen.pid)

                        # Report success starting the job
                        # Note that if a PID were somehow 0 it would look like False
                        assert popen.pid != 0
                        return popen.pid
                else:
                    # We can't get disk, so free cores and memory
                    self.coreFractions.release(coreFractions)
                    self.memory.release(jobMemory)
                    self._setSchedulingStatusMessage(
                        'Not enough disk to run job %s' % jobID)
            else:
                # Free cores, since we can't get memory
                self.coreFractions.release(coreFractions)
                self._setSchedulingStatusMessage(
                    'Not enough memory to run job %s' % jobID)
        else:
            self._setSchedulingStatusMessage('Not enough cores to run job %s' %
                                             jobID)

        # If we get here, we didn't succeed or fail starting the job.
        # We didn't manage to get the resources.
        # Report that.
        return None
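
The nesting above makes resource acquisition all-or-nothing: every acquireNow that succeeded has to be released again as soon as a later one fails. A minimal sketch of the same reserve-or-roll-back pattern, using a hypothetical counting pool rather than Toil's ResourcePool:

from threading import Lock

class CountingPool:
    """A hypothetical non-blocking resource counter standing in for Toil's ResourcePool."""
    def __init__(self, capacity):
        self.available = capacity
        self._lock = Lock()

    def acquire_now(self, amount) -> bool:
        with self._lock:
            if self.available >= amount:
                self.available -= amount
                return True
            return False

    def release(self, amount) -> None:
        with self._lock:
            self.available += amount

def reserve_all(cores, memory, disk, need_cores, need_memory, need_disk) -> bool:
    """Reserve cores, memory, and disk together, or none of them."""
    if not cores.acquire_now(need_cores):
        return False
    if not memory.acquire_now(need_memory):
        cores.release(need_cores)
        return False
    if not disk.acquire_now(need_disk):
        cores.release(need_cores)
        memory.release(need_memory)
        return False
    return True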

    def _handleChild(self, pid: int) -> None:
        """
        Handle a child process PID that has finished.
        The PID must be for a child job we started.
        Not thread safe to run at the same time as we are making more children.

        Remove the child from our bookkeeping structures and free its resources.
        """

        # Look up the child
        popen = self.children[pid]
        jobID = self.childToJob[pid]
        info = self.runningJobs[jobID]

        # Unpack the job resources
        (coreFractions, jobMemory, jobDisk) = info.resources

        # Clean up our records of the job.
        self.runningJobs.pop(jobID)
        self.childToJob.pop(pid)
        self.children.pop(pid)

        if popen.returncode is None or not callable(getattr(
                os, 'waitid', None)):
            # It isn't reaped yet, or we have to reap all children to see if they're done.
            # Before we reap it (if possible), kill its PID as a PGID to make sure
            # it isn't leaving children behind.
            # TODO: This is a PGID re-use risk on Mac because the process is
            # reaped already and the PGID may have been reused.
            try:
                os.killpg(pid, signal.SIGKILL)
            except ProcessLookupError:
                # It is dead already
                pass
            except PermissionError:
                # It isn't ours actually. Ours is dead.
                pass

        # See how the child did, and reap it.
        statusCode = popen.wait()
        if statusCode != 0 and not info.killIntended:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        if not info.killIntended:
            # Report if the job failed and we didn't kill it.
            # If we killed it then it shouldn't show up in the queue.
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=statusCode,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))

        # Make absolutely sure all processes in the group have received their
        # kill signals and been cleaned up.
        # TODO: this opens a PGID reuse risk; we reaped the process and its
        # PGID may have been re-used. But it probably hasn't been and we
        # definitely want to make sure all its children died before saying the
        # job is done. Some might not be dead yet if we don't do this.
        # TODO: can we safely do this before reaping? Or would we sit forever
        # signaling a dead but unreaped process?
        try:
            while True:
                # Send a kill to the group again, to see if anything in it
                # is still alive. Our first kill might not have been
                # delivered yet.
                os.killpg(pid, signal.SIGKILL)
                # If that worked it is still alive, so wait for the kernel
                # to stop fooling around and kill it.
                log.warning(
                    'Sent redundant job completion kill to surviving process group %s known to batch system %s',
                    pid, id(self))
                time.sleep(0.1)
        except ProcessLookupError:
            # It is dead already
            pass
        except PermissionError:
            # It isn't ours actually. Ours is dead.
            pass

        # Free up the job's resources.
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self.disk.release(jobDisk)

        log.debug('Child %d for job %s is done', pid, jobID)

    def issueBatchJob(self, jobDesc):
        """Adds the command and resources to a queue to be run."""

        self._checkOnDaddy()

        # Round cores to minCores and apply scale.
        # Make sure to give minCores even if asked for 0 cores, or negative or something.
        cores = max(
            math.ceil(jobDesc.cores * self.scale / self.minCores) *
            self.minCores, self.minCores)

        # Don't do our own assertions about job size vs. our configured size.
        # The abstract batch system can handle it.
        self.checkResourceRequest(jobDesc.memory,
                                  cores,
                                  jobDesc.disk,
                                  job_name=jobDesc.jobName,
                                  detail=f'Scale is set to {self.scale}.')
        log.debug(
            f"Issuing the command: {jobDesc.command} with "
            f"memory: {jobDesc.memory}, cores: {cores}, disk: {jobDesc.disk}")
        with self.jobIndexLock:
            jobID = self.jobIndex
            self.jobIndex += 1
        self.jobs[jobID] = jobDesc.command

        if self.debugWorker:
            # Run immediately, blocking for return.
            # Ignore resource requirements; we run one job at a time
            self._runDebugJob(jobDesc.command, jobID, self.environment.copy())
        else:
            # Queue the job for later
            self.inputQueue.put(
                (jobDesc.command, jobID, cores, jobDesc.memory, jobDesc.disk,
                 self.environment.copy()))

        return jobID
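
The core-rounding expression above bumps the scaled request up to a whole number of minCores units and never hands out less than one unit, even for a zero or negative request. A small standalone helper showing the same arithmetic (round_cores is a made-up name):

import math

def round_cores(requested_cores, scale, min_cores):
    """Round a scaled core request up to a multiple of min_cores, with min_cores as the floor."""
    return max(math.ceil(requested_cores * scale / min_cores) * min_cores, min_cores)

# With min_cores = 1: a 2.3-core request becomes 3, and a 0-core request still gets 1.
assert round_cores(2.3, scale=1.0, min_cores=1) == 3
assert round_cores(0, scale=1.0, min_cores=1) == 1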

    def killBatchJobs(self, jobIDs: Sequence[str]) -> None:
        """Kills jobs by ID."""

        self._checkOnDaddy()

        log.debug('Killing jobs: {}'.format(jobIDs))

        # Collect the popen handles for the jobs we have to stop
        popens: List[subprocess.Popen] = []

        for jobID in jobIDs:
            if jobID in self.runningJobs:
                info = self.runningJobs[jobID]
                info.killIntended = True
                if info.popen is not None:
                    popens.append(info.popen)
                else:
                    # No popen if running in forkless mode currently
                    assert self.debugWorker
                    log.critical("Can't kill job: %s in debug mode" % jobID)

        # Stop them all in a batch. Don't reap, because we need the daddy
        # thread to reap them to mark the jobs as not running anymore.
        self._stop_now(popens)

        for jobID in jobIDs:
            while jobID in self.runningJobs:
                # Wait for the daddy thread to collect them.
                time.sleep(0.01)

    def getIssuedBatchJobIDs(self):
        """Just returns all the jobs that have been run, but not yet returned as updated."""

        self._checkOnDaddy()

        return list(self.jobs.keys())

    def getRunningBatchJobIDs(self):

        self._checkOnDaddy()

        now = time.time()
        return {
            jobID: now - info.time
            for jobID, info in list(self.runningJobs.items())
        }

    def shutdown(self):
        """
        Cleanly terminate and join daddy thread.
        """

        if self.daddyThread is not None:
            # Tell the daddy thread to stop.
            self.shuttingDown.set()
            # Wait for it to stop.
            self.daddyThread.join()

        BatchSystemSupport.workerCleanup(self.workerCleanupInfo)

    def getUpdatedBatchJob(self, maxWait):
        """Returns a tuple of a no-longer-running job, the return value of its process, and its runtime, or None."""

        self._checkOnDaddy()

        try:
            item = self.outputQueue.get(timeout=maxWait)
        except Empty:
            return None
        self.jobs.pop(item.jobID)
        log.debug("Ran jobID: %s with exit value: %i", item.jobID,
                  item.exitStatus)
        return item

    @classmethod
    def setOptions(cls, setOption):
        setOption("scale", default=1)
Exemplo n.º 28
0
 def testAvailableCores(self):
     self.assertTrue(cpu_count() >= numCores)
Exemplo n.º 29
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("blastOutput", nargs="+", help = "Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowd) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment.  Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output.  If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play"
                        "nicely with reference-based alignments.  This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in paf format, rather than lastz cigars.")    

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Exemplo n.º 30
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None and options.batchSystem != 'singleMachine':
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))