예제 #1
0
def test_adapter_none_found():
    """
    Verify that the factory raises for an adapter key it does not know.

    Asking ScriptAdapterFactory.get_adapter for a key that is expected not
    to be registered must raise an Exception.
    """
    missing_key = 'empty-adapter'
    with pytest.raises(Exception):
        ScriptAdapterFactory.get_adapter(missing_key)
def test_local_adapter_in_factory():
    """
    Check that LocalScriptAdapter is registered in ScriptAdapterFactory.

    The adapter must be stored under its own key in the factory mapping,
    its key must be reported by get_valid_adapters(), and get_adapter() must
    hand back the LocalScriptAdapter class for that key.
    """
    key = LocalScriptAdapter.key
    # Present in the raw factory mapping.
    assert ScriptAdapterFactory.factories[key] == LocalScriptAdapter
    # Advertised as a valid adapter key.
    assert key in ScriptAdapterFactory.get_valid_adapters()
    # Retrievable through the public lookup.
    assert ScriptAdapterFactory.get_adapter(key) == LocalScriptAdapter
예제 #3
0
def test_flux_adapter_in_factory():
    """
    Check that FluxScriptAdapter is registered in ScriptAdapterFactory.

    The adapter must be stored under its own key in the factory mapping,
    its key must be reported by get_valid_adapters(), and get_adapter() must
    hand back the FluxScriptAdapter class for that key.
    """
    key = FluxScriptAdapter.key
    # Present in the raw factories mapping.
    assert ScriptAdapterFactory.factories[key] == FluxScriptAdapter
    # Advertised as a valid adapter key.
    assert key in ScriptAdapterFactory.get_valid_adapters()
    # Retrievable by key through the public lookup.
    assert ScriptAdapterFactory.get_adapter(key) == FluxScriptAdapter
예제 #4
0
    def generate_scripts(self):
        """
        Write the execution scripts for every step in the ExecutionGraph.

        The configured adapter is instantiated once, ``_check_tmp_dir`` is
        called, and then every record except the SOURCE node is asked to set
        up its workspace and generate its own script with that adapter.

        :raises ValueError: If no adapter has been set on the graph.
        """
        # Without adapter settings there is nothing to generate scripts with.
        if not self._adapter:
            msg = ("Adapter not found. Specify a ScriptAdapter using "
                   "set_adapter.")
            LOGGER.error(msg)
            raise ValueError(msg)

        LOGGER.info("Generating scripts...")
        # The factory returns the adapter class; the stored settings are
        # passed to it as keyword arguments.
        adapter_cls = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter_cls(**self._adapter)

        self._check_tmp_dir()
        for key, record in self.values.items():
            if key != SOURCE:
                # Each record is responsible for writing its own script.
                record.setup_workspace()
                record.generate_script(adapter, self._tmp_dir)
예제 #5
0
def test_get_valid_adapters():
    """
    Check that get_valid_adapters() reports the factory's registered keys.

    The keys of the internal ``factories`` mapping must compare equal to the
    value returned by ScriptAdapterFactory.get_valid_adapters().
    """
    registered = ScriptAdapterFactory.factories.keys()
    reported = ScriptAdapterFactory.get_valid_adapters()
    assert registered == reported
예제 #6
0
    def cancel_study(self):
        """
        Cancel the study.

        Collects the most recent job identifier of every in-progress step,
        asks the configured adapter to cancel those jobs, and flags the study
        as canceled regardless of the outcome.

        :returns: The code returned by the adapter's ``cancel_jobs`` call.
        """
        # Only the latest job identifier of each in-progress step is used.
        joblist = [self.values[step].jobid[-1] for step in self.in_progress]

        # Grab the adapter from the ScriptAdapterFactory.
        adapter = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter(**self._adapter)

        # Cancel our jobs.
        retcode = adapter.cancel_jobs(joblist)
        self.is_canceled = True

        if retcode == CancelCode.OK:
            logger.info("Successfully requested to cancel all jobs.")
        elif retcode == CancelCode.ERROR:
            logger.error("Failed to cancel jobs.")
        else:
            # Use lazy logging arguments. The previous str.format call fed a
            # positional argument to a named placeholder and raised KeyError
            # instead of logging the unexpected code.
            logger.error("Unknown Error (Code = %s)", retcode)

        return retcode
예제 #7
0
    def generate_scripts(self):
        """
        Generate the scripts for all steps in the ExecutionGraph.

        The configured adapter is instantiated once and used to write a
        command script (and a restart script, as returned by the adapter) for
        every record except the SOURCE node. The resulting script paths and
        the scheduling flag are stored on each record.

        :raises ValueError: If no adapter has been set on the graph.
        """
        # An adapter must be specified.
        if not self._adapter:
            msg = "Adapter not found. Specify a ScriptAdapter using " \
                  "set_adapter."
            logger.error(msg)
            raise ValueError(msg)

        # The adapter depends only on the stored settings, so build it (and
        # announce the work) once instead of once per step.
        logger.info("Generating scripts...")
        adapter = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter(**self._adapter)

        for key, record in self.values.items():
            if key == SOURCE:
                continue

            to_be_scheduled, cmd_script, restart_script = \
                adapter.write_script(record.workspace, record.step)
            logger.info("Step -- %s\nScript: %s\nRestart: %s\nScheduled?: %s",
                        record.step.name, cmd_script, restart_script,
                        to_be_scheduled)
            # Remember what was written so execution can find it later.
            record.to_be_scheduled = to_be_scheduled
            record.script = cmd_script
            record.restart_script = restart_script
예제 #8
0
    def _execute(self, adapter, script):
        """
        Submit ``script`` for this record and report the submission result.

        When the record is not flagged to be scheduled it is marked as
        running and a freshly built "local" adapter is used in place of the
        one passed in.

        :param adapter: Adapter used for submission of scheduled records.
        :param script: The script to submit.
        :returns: Tuple of (submission code, job identifier) taken from the
            record returned by the adapter's ``submit`` call.
        """
        submitter = adapter
        if not self.to_be_scheduled:
            # Unscheduled steps run through the local adapter.
            self.mark_running()
            submitter = ScriptAdapterFactory.get_adapter("local")()

        outcome = submitter.submit(self.step, script, self.workspace.value)
        return outcome.submission_code, outcome.job_identifier
예제 #9
0
    def _execute(self, adapter, script):
        """
        Submit ``script`` for this record and report the submission result.

        When the record is not flagged to be scheduled it is marked as
        running and a freshly built "local" adapter is used in place of the
        one passed in.

        :param adapter: Adapter used for submission of scheduled records.
        :param script: The script to submit.
        :returns: Tuple of (return code, job identifier) as returned by the
            adapter's ``submit`` call.
        """
        submitter = adapter
        if not self.to_be_scheduled:
            # Unscheduled steps run through the local adapter.
            self.mark_running()
            submitter = ScriptAdapterFactory.get_adapter("local")()

        retcode, jobid = submitter.submit(
            self.step, script, self.workspace.value)
        return retcode, jobid
예제 #10
0
    def check_study_status(self):
        """
        Query the adapter for the status of every in-progress step.

        The latest job identifier of each in-progress step is sent to the
        configured adapter's ``check_jobs``; the statuses it reports are then
        keyed back by step name.

        :returns: Tuple of (return code from the adapter, dictionary mapping
            step name to the status reported for its job).
        """
        # Pair each step's most recent job id with the step name so results
        # can be translated back afterwards.
        pairs = [
            (self.values[step].jobid[-1], step) for step in self.in_progress
        ]
        joblist = [jobid for jobid, _ in pairs]
        jobmap = dict(pairs)

        # Build the adapter from the stored settings and ask for statuses.
        adapter_cls = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter_cls(**self._adapter)
        retcode, job_status = adapter.check_jobs(joblist)

        # Translate job identifiers back into step names.
        step_status = {}
        for jobid, status in job_status.items():
            step_status[jobmap[jobid]] = status

        # The log line depends on the return code; the result does not.
        if retcode == JobStatusCode.OK:
            LOGGER.info("Jobs found for user '%s'.", getpass.getuser())
        elif retcode == JobStatusCode.NOJOBS:
            LOGGER.info("No jobs found.")
        else:
            msg = "Unknown Error (Code = {})".format(retcode)
            LOGGER.error(msg)

        return retcode, step_status
예제 #11
0
    def cancel_study(self):
        """
        Request cancellation of every in-progress job in the study.

        The latest job identifier of each in-progress step is handed to the
        configured adapter's ``cancel_jobs`` and the study is flagged as
        canceled.

        :returns: The ``cancel_status`` of the record returned by the adapter.
        """
        # Most recent job id for each step that is still in progress.
        joblist = [self.values[step].jobid[-1] for step in self.in_progress]

        # Build the adapter from the stored settings.
        adapter_cls = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter_cls(**self._adapter)

        crecord = adapter.cancel_jobs(joblist)
        self.is_canceled = True

        status = crecord.cancel_status
        if status == CancelCode.OK:
            LOGGER.info("Successfully requested to cancel all jobs.")
        elif status == CancelCode.ERROR:
            LOGGER.error("Failed to cancel jobs. (Code = %s)",
                         crecord.return_code)
        else:
            LOGGER.error("Unknown Error (Code = %s)", crecord.return_code)

        return status
예제 #12
0
    def set_adapter(self, adapter):
        """
        Set the adapter used to interface for scheduling tasks.

        :param adapter: Dictionary of adapter settings; its "type" entry must
            be a key known to ScriptAdapterFactory. A falsy value (None, empty
            dict) clears the stored adapter.
        :raises TypeError: If ``adapter`` is not a dictionary, or its "type"
            is not one of ScriptAdapterFactory.get_valid_adapters().
        :raises KeyError: If the dictionary has no "type" entry.
        """
        if not adapter:
            # If we have no adapter specified, assume sequential execution.
            self._adapter = None
            return

        if not isinstance(adapter, dict):
            msg = "Adapter settings must be contained in a dictionary."
            LOGGER.error(msg)
            raise TypeError(msg)

        # Check to see that the adapter type is something the
        # ScriptAdapterFactory knows how to build.
        adapter_type = adapter["type"]
        if adapter_type not in ScriptAdapterFactory.get_valid_adapters():
            # Report the offending type name rather than dumping the whole
            # settings dictionary into the message.
            msg = "'{}' adapter must be specfied in ScriptAdapterFactory." \
                  .format(adapter_type)
            LOGGER.error(msg)
            raise TypeError(msg)

        self._adapter = adapter
0
    def execute_ready_steps(self):
        """
        Execute any steps whose dependencies are satisfied.

        The 'execute_ready_steps' method is the core of how the ExecutionGraph
        manages execution. This method does the following:
            - Checks the status of existing jobs that are executing.
                - Updates the state if changed.
            - Finds steps that are initialized and determines what can be run:
                - Scans a step's dependencies and stages it if all are met.
                - Executes any steps whose dependencies are met.

        :returns: The value of ``_check_study_completion()`` after this round
            of execution.
        :raises RuntimeError: If the job status check reports an error.
        """
        # TODO: We may want to move this to a singleton somewhere
        # so we can guarantee that all steps use the same adapter.
        adapter = ScriptAdapterFactory.get_adapter(self._adapter["type"])
        adapter = adapter(**self._adapter)

        if not self.dry_run:
            LOGGER.debug("Checking status check...")
            retcode, job_status = self.check_study_status()
        else:
            LOGGER.debug("DRYRUN: Skipping status check...")
            retcode = JobStatusCode.OK
            job_status = {}

        LOGGER.debug("Checked status (retcode %s)-- %s", retcode, job_status)

        # For now, if we can't check the status something is wrong.
        # Don't modify the DAG.
        if retcode == JobStatusCode.ERROR:
            msg = "Job status check failed -- Aborting."
            LOGGER.error(msg)
            raise RuntimeError(msg)
        elif retcode == JobStatusCode.OK:
            # For the status of each currently in progress job, check its
            # state.
            cleanup_steps = set()  # Steps that are in progress showing failed.

            for name, status in job_status.items():
                LOGGER.debug("Checking job '%s' with status %s.", name, status)
                record = self.values[name]

                if status == State.FINISHED:
                    # Mark the step complete and notate its end time.
                    record.mark_end(State.FINISHED)
                    LOGGER.info(
                        "Step '%s' marked as finished. Adding to "
                        "complete set.", name)
                    self.completed_steps.add(name)
                    self.in_progress.remove(name)

                elif status == State.RUNNING:
                    # When we detect that a step is running, mark it.
                    # The step name must be passed for the '%s' placeholder.
                    LOGGER.info("Step '%s' found to be running.", name)
                    record.mark_running()

                elif status == State.TIMEDOUT:
                    # Execute the restart script.
                    # If a restart script doesn't exist, re-run the command.
                    # If we're under the restart limit, attempt a restart.
                    if record.can_restart:
                        if record.mark_restart():
                            LOGGER.info(
                                "Step '%s' timed out. Restarting (%s of %s).",
                                name, record.restarts, record.restart_limit)
                            self._execute_record(record, adapter, restart=True)
                        else:
                            LOGGER.info(
                                "'%s' has been restarted %s of %s "
                                "times. Marking step and all "
                                "descendents as failed.", name,
                                record.restarts, record.restart_limit)
                            self.in_progress.remove(name)
                            cleanup_steps.update(self.bfs_subtree(name)[0])
                    # Otherwise, we can't restart so mark the step timed out.
                    else:
                        LOGGER.info(
                            "'%s' timed out, but cannot be restarted."
                            " Marked as TIMEDOUT.", name)
                        # Mark that the step ended due to TIMEOUT.
                        record.mark_end(State.TIMEDOUT)
                        # Remove from in progress since it no longer is.
                        self.in_progress.remove(name)
                        # Add the subtree to the clean up steps
                        cleanup_steps.update(self.bfs_subtree(name)[0])
                        # Remove the current step, clean up is used to mark
                        # steps definitively as failed.
                        cleanup_steps.remove(name)
                        # Add the current step to failed.
                        self.failed_steps.add(name)

                elif status == State.HWFAILURE:
                    # TODO: Need to make sure that we do this a finite number
                    # of times.
                    # Resubmit the cmd.
                    LOGGER.warning(
                        "Hardware failure detected. Attempting to "
                        "resubmit step '%s'.", name)
                    # We can just let the logic below handle submission with
                    # everything else.
                    self.ready_steps.append(name)

                elif status == State.FAILED:
                    LOGGER.warning(
                        "Job failure reported. Aborting %s -- flagging all "
                        "dependent jobs as failed.", name)
                    self.in_progress.remove(name)
                    record.mark_end(State.FAILED)
                    cleanup_steps.update(self.bfs_subtree(name)[0])

                elif status == State.UNKNOWN:
                    record.mark_end(State.UNKNOWN)
                    LOGGER.info(
                        "Step '%s' found in UNKNOWN state. Step was found "
                        "in '%s' state previously, marking as UNKNOWN. "
                        "Adding to failed steps.", name, record.status)
                    cleanup_steps.update(self.bfs_subtree(name)[0])
                    self.in_progress.remove(name)

                elif status == State.CANCELLED:
                    LOGGER.info("Step '%s' was cancelled.", name)
                    self.in_progress.remove(name)
                    record.mark_end(State.CANCELLED)

            # Let's handle all the failed steps in one go.
            for node in cleanup_steps:
                self.failed_steps.add(node)
                self.values[node].mark_end(State.FAILED)

        # Now that we've checked the statuses of existing jobs, check which
        # initialized steps have had their dependencies met.
        for key in self.values.keys():
            # The record is looked up by key rather than iterating
            # values.items(); this lookup style is kept from the original.
            record = self.values[key]

            # A completed step by definition has had its dependencies met.
            # Skip it.
            if key in self.completed_steps:
                LOGGER.debug("'%s' in completed set, skipping.", key)
                continue

            LOGGER.debug("Checking %s -- %s", key, record.jobid)
            # If the record is only INITIALIZED, we have encountered a step
            # that needs consideration.
            if record.status == State.INITIALIZED:
                LOGGER.debug(
                    "'%s' found to be initialized. Checking "
                    "dependencies. ", key)

                LOGGER.debug("Unfulfilled dependencies: %s",
                             self._dependencies[key])

                # Materialize the completed dependencies once. A lazy filter
                # object would be exhausted by the set difference below and
                # then show up empty/opaque in the debug log.
                s_completed = {
                    dep for dep in self._dependencies[key]
                    if dep in self.completed_steps
                }
                self._dependencies[key] = \
                    self._dependencies[key] - s_completed
                LOGGER.debug(
                    "Completed dependencies: %s\n"
                    "Remaining dependencies: %s", s_completed,
                    self._dependencies[key])

                # If the gating dependencies set is empty, we can execute.
                if not self._dependencies[key]:
                    if key not in self.ready_steps:
                        LOGGER.debug("All dependencies completed. Staging.")
                        self.ready_steps.append(key)
                    else:
                        LOGGER.debug("Already staged. Passing.")
                        continue

        # We now have a collection of ready steps. Execute.
        # If we don't have a submission limit, go ahead and submit all.
        if self._submission_throttle == 0:
            LOGGER.info("Launching all ready steps...")
            _available = len(self.ready_steps)
        # Else, we have a limit -- adhere to it.
        else:
            # Compute the number of available slots we have for execution.
            _available = self._submission_throttle - len(self.in_progress)
            # Available slots should never be negative, but on the off chance
            # we are in a slot deficit, then we will just say none are free.
            _available = max(0, _available)
            # Now, we need to take the min of the length of the queue and the
            # computed number of slots. We could have free slots, but have less
            # in the queue.
            _available = min(_available, len(self.ready_steps))
            LOGGER.info("Found %d available slots...", _available)

        for i in range(0, _available):
            # Pop the record and execute using the helper method.
            _record = self.values[self.ready_steps.popleft()]

            # If we get to this point and we've cancelled, cancel the record.
            if self.is_canceled:
                LOGGER.info("Cancelling '%s' -- continuing.", _record.name)
                _record.mark_end(State.CANCELLED)
                self.cancelled_steps.add(_record.name)
                continue

            LOGGER.debug("Launching job %d -- %s", i, _record.name)
            self._execute_record(_record, adapter)

        # check the status of the study upon finishing this round of execution
        completion_status = self._check_study_completion()
        return completion_status
예제 #14
0
    def _execute_record(self, name, record, restart=False):
        """
        Execute a StepRecord.

        Submission is retried until it succeeds or ``_submission_attempts``
        tries have been made. On success the record is marked pending (and
        immediately finished when it was not scheduled); on failure the step
        and everything returned by ``bfs_subtree`` for it are marked failed.

        :param name: The name of the step to be executed.
        :param record: An instance of a _StepRecord class.
        :param restart: True if the record needs restarting, False otherwise.
        """
        num_restarts = 0    # Submission attempts made so far.
        retcode = None      # Execution return code.

        # If we want to schedule the execution of the record, grab the
        # scheduler adapter from the ScriptAdapterFactory.
        if record.to_be_scheduled:
            adapter = \
                ScriptAdapterFactory.get_adapter(self._adapter["type"])
        else:
            # Otherwise, just use the local adapter.
            adapter = \
                ScriptAdapterFactory.get_adapter("local")

        # Pass the adapter the settings we've stored.
        adapter = adapter(**self._adapter)

        # While our submission needs to be submitted, keep trying:
        # 1. If the JobStatus is not OK.
        # 2. num_restarts is less than self._submission_attempts
        while retcode != SubmissionCode.OK and \
                num_restarts < self._submission_attempts:
            logger.info("Attempting submission of '%s' (attempt %d of %d)...",
                        name, num_restarts + 1, self._submission_attempts)

            # A restart submits the restart script; otherwise the cmd script.
            script = record.restart_script if restart else record.script
            retcode, jobid = adapter.submit(
                record.step,
                script,
                record.workspace)

            # Increment the number of attempts we've made.
            num_restarts += 1

        if retcode == SubmissionCode.OK:
            logger.info("'%s' submitted with identifier '%s'", name, jobid)
            record.status = State.PENDING
            record.jobid.append(jobid)
            self.in_progress.add(name)

            # Executed locally, so if we executed OK -- Finished.
            if record.to_be_scheduled is False:
                self.completed_steps.add(name)
                self.in_progress.remove(name)
                # Use the same 'status' attribute as every other assignment
                # in this method; writing to 'state' left the record PENDING.
                record.status = State.FINISHED
        else:
            # Find the subtree, because anything dependent on this step now
            # failed.
            logger.warning("'%s' failed to properly submit properly. "
                           "Step failed.", name)
            path, _ = self.bfs_subtree(name)
            for node in path:
                self.failed_steps.add(node)
                self.values[node].status = State.FAILED