Example #1
def main(args):
    level = (logging.WARNING, logging.INFO, logging.DEBUG)[min(args.vb, 2)]
    g_log.setLevel(level)

    g_log.info("Here we go")

    if args.start_proxy:
        cfg = 'db: test\ncollection: kblog\n'
        with open(ProxyArgs.conf, "w") as f:
            f.write(cfg)
        g_log.info("Starting log proxy with config:\n{}".format(cfg))
        pid = os.fork()
        if pid == 0:
            ProxyArgs.vb = args.vb
            proxy.main(ProxyArgs)
        else:
            time.sleep(2)
            reset_handlers()
            g_log.info("Sending log message")
            kblog = get_logger("test")
            kblog.info("{}{}{}".format(args.event, proxy.EVENT_MSG_SEP,
                                       args.message))
            time.sleep(1)
            g_log.debug("Killing {:d}".format(pid))
            os.kill(pid, signal.SIGKILL)
            g_log.info("Waiting for {:d} to stop".format(pid))
            os.waitpid(pid, 0)

    return 0
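
A hedged client-side sketch of the flow above, assuming the log proxy is already running. The import paths are assumptions, and the event and message strings are placeholders:

from biokbase.narrative.common import kblogging            # assumed import path
from biokbase.narrative.common import log_proxy as proxy   # assumed import path

# get_logger and EVENT_MSG_SEP are taken from the example above;
# "my_event" and "my message" are placeholders.
kblog = kblogging.get_logger("test")
kblog.info("{}{}{}".format("my_event", proxy.EVENT_MSG_SEP, "my message"))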
Example #2
    def test_buffering(self):
        # create logger and send messages
        kblog = kblogging.get_logger("test")
        kblog.info("hello")
        kblog.info("world")

        self.start_receiver()

        # wait for the poll
        time.sleep(self.poll_sec * 4)

        # stop the receiver (this variant does not inspect the received data)
        self.stop_receiver(kblog)
Example #3
    def test_buffering(self):
        # create logger and send messages
        kblog = kblogging.get_logger("test")
        kblog.info("hello")
        kblog.info("world")

        self.start_receiver()

        # wait for the poll
        time.sleep(self.poll_sec * 4)

        # fetch whatever the receiver got
        data = self.recv.get_data()

        self.stop_receiver(kblog)

        # check that receiver got the (buffered) messages
        self.assertEqual(data, "helloworld")
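
The fixed time.sleep(self.poll_sec * 4) above simply waits long enough for at least one poll cycle. As one hedged alternative (not what the test above does), a deadline loop can retry until the expected payload arrives, assuming get_data() returns everything received so far without consuming it:

import time

def wait_for_data(recv, expected, timeout=10.0, interval=0.1):
    # Poll recv.get_data() until it matches `expected` or `timeout` elapses;
    # `recv` is assumed to expose the same get_data() used in the test above.
    deadline = time.time() + timeout
    data = recv.get_data()
    while data != expected and time.time() < deadline:
        time.sleep(interval)
        data = recv.get_data()
    return data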
Example #4
    def __init__(self, name, debug=False):
        """Create a Python logging.Logger with the given name, under the existing
        IPython logging framework.

        :param name: Name of logger
        :type name: str
        :param debug: Whether to set debug as the log level
        :type debug: bool
        """
        self._name = name
        # use the IPython application singleton's 'log' trait
        # self._log = Application.instance().log
        self._log = kblogging.get_logger(name)
        if debug:
            self._log.setLevel(logging.DEBUG)
        else:
            self._log.setLevel(logging.INFO)
        self._is_debug = debug
        self._start_time = None
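
The snippet above is an __init__ from a wrapper class whose name is not shown; in this hedged sketch the class name KBaseLogAdapter is hypothetical, and only the kblogging.get_logger call and the level logic come from the snippet:

import logging
from biokbase.narrative.common import kblogging  # assumed import path

class KBaseLogAdapter:  # hypothetical name; the enclosing class is not shown
    def __init__(self, name, debug=False):
        self._name = name
        self._log = kblogging.get_logger(name)
        self._log.setLevel(logging.DEBUG if debug else logging.INFO)
        self._is_debug = debug
        self._start_time = None

log = KBaseLogAdapter("narrative.jobs", debug=True)  # DEBUG-level wrapper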
Example #5
class JobComm:
    """
    The main JobComm channel. This is the kernel-side of the connection, and routes
    requests for job information from various app cells (or the front end in general)
    to the right function.

    This has a handle on the JobManager, which does the work of fetching job information
    and statuses.

    The JobComm officially exposes the channel for other things to use. Anything that
    needs to send messages about Jobs to the front end should use JobComm.send_comm_message.

    It also maintains the lookup loop thread. This is a threading.Timer that, after
    some interval, will look up the status of all running jobs. If there are no jobs to
    look up, this cancels itself.

    Allowed messages:
    * all_status - return job state for all jobs in this Narrative.
    * job_status - return the job state for a single job (requires a job_id)
    * job_info - return basic job info for a single job (requires a job_id)
    * start_update_loop - starts a looping thread that returns all job info
        for running jobs
    * stop_update_loop - stops the automatic update loop
    * start_job_update - tells the update loop to include a job when updating (requires a job_id)
    * stop_job_update - tells the update loop to stop including a job when updating (requires a job_id)
    * cancel_job - cancels a running job, if it hasn't otherwise terminated (requires a job_id)
    * job_logs - sends job logs back over the comm channel (requires a job_id and first line)
    * job_logs_latest - sends the most recent job logs over the comm channel (requires a job_id)
    """

    # An instance of this class. It's meant to be a singleton, so this just gets created and
    # returned once.
    __instance = None

    # The kernel job comm channel that talks to the front end.
    _comm = None

    # The JobManager that actually manages things.
    _jm = None

    _msg_map = None
    _running_lookup_loop = False
    _lookup_timer = None
    _log = kblogging.get_logger(__name__)

    def __new__(cls):
        if JobComm.__instance is None:
            JobComm.__instance = object.__new__(cls)
        return JobComm.__instance

    def __init__(self):
        if self._comm is None:
            self._comm = Comm(target_name="KBaseJobs", data={})
            self._comm.on_msg(self._handle_comm_message)
        if self._jm is None:
            self._jm = jobmanager.JobManager()
        if self._msg_map is None:
            self._msg_map = {
                "all_status": self._lookup_all_job_states,
                "job_status": self._lookup_job_state,
                "job_info": self._lookup_job_info,
                "start_update_loop": self.start_job_status_loop,
                "stop_update_loop": self.stop_job_status_loop,
                "start_job_update": self._modify_job_update,
                "stop_job_update": self._modify_job_update,
                "cancel_job": self._cancel_job,
                "job_logs": self._get_job_logs,
                "job_logs_latest": self._get_job_logs
            }

    def _verify_job_id(self, req: JobRequest) -> None:
        if req.job_id is None:
            self.send_error_message("job_does_not_exist", req)
            raise ValueError(
                f"Job id required to process {req.request} request")

    def start_job_status_loop(self, *args, **kwargs) -> None:
        """
        Starts the job status lookup loop. This runs every 10 seconds.
        This has the bare *args and **kwargs to handle the case where this comes in as a job
        channel request (gets a JobRequest arg), or has the "init_jobs" kwarg.

        If init_jobs=True, this attempts to reinitialize the JobManager's list of known jobs
        from the workspace.
        """
        self._running_lookup_loop = True
        if kwargs.get("init_jobs", False):
            try:
                self._jm.initialize_jobs()
            except Exception as e:
                error = {
                    "error": "Unable to get initial jobs list",
                    "message": getattr(e, "message", "Unknown reason"),
                    "code": getattr(e, "code", -1),
                    "source": getattr(e, "source", "jobmanager"),
                    "name": getattr(e, "name",
                                    type(e).__name__),
                    "service": "execution_engine2"
                }
                self.send_comm_message("job_init_err", error)
        if self._lookup_timer is None:
            self._lookup_job_status_loop()

    def stop_job_status_loop(self, *args, **kwargs) -> None:
        """
        Stops the job status lookup loop if it's running. Otherwise, this effectively
        does nothing.
        """
        if self._lookup_timer:
            self._lookup_timer.cancel()
            self._lookup_timer = None
        self._running_lookup_loop = False

    def _lookup_job_status_loop(self) -> None:
        """
        Run a loop that will look up job info. After running, this spawns a Timer thread on a 10
        second loop to run itself again.
        """
        job_statuses = self._lookup_all_job_states(None)
        if len(job_statuses) == 0 or not self._running_lookup_loop:
            self.stop_job_status_loop()
        else:
            self._lookup_timer = threading.Timer(10,
                                                 self._lookup_job_status_loop)
            self._lookup_timer.start()

    def _lookup_all_job_states(self, req: JobRequest) -> dict:
        """
        Fetches status of all jobs in the current workspace and sends them to the front end.
        req can be None, as it's not used.
        """
        job_statuses = self._jm.lookup_all_job_states(ignore_refresh_flag=True)
        self.send_comm_message("job_status_all", job_statuses)
        return job_statuses

    def _lookup_job_info(self, req: JobRequest) -> dict:
        """
        Looks up job info. This is just some high-level generic information about the running
        job, including the app id, name, and job parameters.
        :param req: a JobRequest with the job_id of interest
        :returns: a dict with the following keys:
            - app_id - str - module/name,
            - app_name - str - name of the app as it shows up in the Narrative interface
            - job_id - str - just re-reporting the id string
            - job_params - dict - the params that were passed to that particular job
        """
        self._verify_job_id(req)
        try:
            job_info = self._jm.lookup_job_info(req.job_id)
            self.send_comm_message("job_info", job_info)
            return job_info
        except ValueError:
            self.send_error_message("job_does_not_exist", req)
            raise

    def lookup_job_state(self, job_id: str) -> dict:
        """
        This differs from _lookup_job_state (the underscored version) in that
        it takes a plain job_id string rather than a JobRequest. Otherwise it behaves
        the same: it builds a JobRequest and forwards it to the request version.

        Therefore, it sends the job message to the browser over the right channel,
        and also returns the job state (or raises a ValueError if not found).
        """
        req = JobRequest({
            "content": {
                "data": {
                    "request_type": "job_status",
                    "job_id": job_id
                }
            }
        })
        return self._lookup_job_state(req)

    def _lookup_job_state(self, req: JobRequest) -> dict:
        """
        Look up job state.
        """
        self._verify_job_id(req)
        try:
            job_state = self._jm.get_job_state(req.job_id)
            self.send_comm_message("job_status", job_state)
            return job_state
        except ValueError as e:
            # kblogging.log_event(self._log, "lookup_job_state_error", {"err": str(e)})
            self.send_error_message("job_does_not_exist", req)
            raise

    def _modify_job_update(self, req: JobRequest) -> None:
        """
        Adjusts the number of listeners for a job's updates.
        If this is a request to start a job update, then this starts the update loop that
        returns update messages across the job channel.
        If this is a request to stop a job update, then this sends that request to the
        JobManager, which might have the side effect of shutting down the update loop if there's
        no longer anything requesting job status.

        If the given job_id in the request doesn't exist in the current Narrative, or is None,
        this raises a ValueError.
        """
        self._verify_job_id(req)
        update_adjust = 1 if req.request == "start_job_update" else -1
        self._jm.modify_job_refresh(req.job_id, update_adjust)
        if update_adjust == 1:
            self.start_job_status_loop()

    def _cancel_job(self, req: JobRequest) -> None:
        """
        This cancels a running job. If the job has already been canceled, then nothing is
        done.
        If the job doesn't exist (or the job id in the request is None), this raises a ValueError.
        If there's an error while attempting to cancel, this raises a NarrativeError.
        In the end, after a successful cancel, this finishes up by fetching and returning the
        job state with the new status.
        """
        self._verify_job_id(req)
        try:
            self._jm.cancel_job(req.job_id)
        except ValueError:
            self.send_error_message("job_does_not_exist", req)
            raise
        except NarrativeException as e:
            self.send_error_message(
                "job_comm_error", req, {
                    "error": "Unable to cancel job",
                    "message": getattr(e, "message", "Unknown reason"),
                    "code": getattr(e, "code", -1),
                    "name": getattr(e, "name",
                                    type(e).__name__)
                })
            raise
        self._lookup_job_state(req)

    def _get_job_logs(self, req: JobRequest) -> None:
        """
        This returns a set of job logs based on the info in the request.
        """
        self._verify_job_id(req)
        first_line = req.rq_data.get("first_line", 0)
        num_lines = req.rq_data.get("num_lines", None)
        latest_only = req.request == "job_logs_latest"
        try:
            first_line, max_lines, logs = self._jm.get_job_logs(
                req.job_id,
                num_lines=num_lines,
                first_line=first_line,
                latest_only=latest_only)
            self.send_comm_message(
                "job_logs", {
                    "job_id": req.job_id,
                    "first": first_line,
                    "max_lines": max_lines,
                    "lines": logs,
                    "latest": latest_only
                })
        except ValueError:
            self.send_error_message("job_does_not_exist", req)
            raise
        except NarrativeException as e:
            self.send_error_message(
                "job_comm_error", req, {
                    "error": "Unable to retrieve job logs",
                    "message": getattr(e, "message", "Unknown reason"),
                    "code": getattr(e, "code", -1),
                    "name": getattr(e, "name",
                                    type(e).__name__)
                })
            raise

    def _handle_comm_message(self, msg: dict) -> None:
        """
        Handles comm messages that come in from the other end of the KBaseJobs channel.
        Messages get translated into a JobRequest object, which is then passed to the
        right handler, based on the request.

        A handler dictionary is created on JobComm creation.

        Any unknown request is returned over the channel as a job_comm_error, and a
        ValueError is raised.
        """
        request = JobRequest(msg)
        kblogging.log_event(self._log, "handle_comm_message",
                            {"msg": request.request})
        if request.request in self._msg_map:
            self._msg_map[request.request](request)
        else:
            self.send_comm_message("job_comm_error", {
                "message": "Unknown message",
                "request_type": request.request
            })
            raise ValueError(f"Unknown KBaseJobs message '{request.request}'")

    def send_comm_message(self, msg_type: str, content: dict) -> None:
        """
        Sends an ipykernel.Comm message to the KBaseJobs channel with the given msg_type
        and content. These just get encoded into the message itself.
        """
        msg = {"msg_type": msg_type, "content": content}
        self._comm.send(msg)

    def send_error_message(self,
                           err_type: str,
                           req: JobRequest,
                           content: dict = None) -> None:
        """
        Sends a comm message over the KBaseJobs channel as an error. This will have msg_type as
        whatever the error type is, and include the original request in the message content as
        "source".

        This sends a packet that looks like:
        {
            job_id: (string, if relevant),
            source: the original message that spawned the error,
            other fields about the error, dependent on the content.
        }
        """
        error_content = {"job_id": req.job_id, "source": req.request}
        if content is not None:
            error_content.update(content)
        self.send_comm_message(err_type, error_content)
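
A hedged usage sketch of the singleton above; the methods are taken from the class itself, and the job id is a placeholder:

# JobComm is a singleton, so repeated construction returns the same instance.
jc = JobComm()
jc.start_job_status_loop(init_jobs=True)   # also reinitializes the known-jobs list
try:
    state = jc.lookup_job_state("some_job_id")   # placeholder job id
except ValueError:
    pass  # unknown job: a job_does_not_exist error also goes over the channel
jc.stop_job_status_loop()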
Example #6
class JobManager(object):
    """
    The KBase Job Manager class. This handles all jobs and makes their status available.
    It sends the results to the KBaseJobs channel that the front end
    listens to.
    """

    __instance = None

    # keys: job_id, values: {"refresh": bool, "job": Job object}
    _running_jobs = dict()
    # keys: cell_id, values: set(job_1_id, job_2_id, job_3_id)
    _jobs_by_cell_id = dict()

    _log = kblogging.get_logger(__name__)

    def __new__(cls):
        if JobManager.__instance is None:
            JobManager.__instance = object.__new__(cls)
        return JobManager.__instance

    @staticmethod
    def _reorder_parents_children(states: dict) -> dict:
        ordering = []
        for job_id, state in states.items():
            if state.get("batch_job"):
                ordering.append(job_id)
            else:
                ordering.insert(0, job_id)
        states = {job_id: states[job_id] for job_id in ordering}

        return states

    def _check_job_list(self, input_ids: List[str] = []) -> Tuple[List[str], List[str]]:
        """
        Deduplicates the input job list, maintaining insertion order
        Any jobs not present in self._running_jobs are added to an error list

        :param input_ids: a list of putative job IDs
        :return: tuple of (job_ids, error_ids) - job_ids contains the valid IDs,
            and error_ids contains jobs that the narrative backend does not know about
        """
        if not isinstance(input_ids, list):
            raise JobRequestException(f"{JOBS_TYPE_ERR}: {input_ids}")

        job_ids = []
        error_ids = []
        for input_id in input_ids:
            if input_id and input_id not in job_ids + error_ids:
                if input_id in self._running_jobs:
                    job_ids.append(input_id)
                else:
                    error_ids.append(input_id)

        if not len(job_ids) + len(error_ids):
            raise JobRequestException(JOBS_MISSING_ERR, input_ids)

        return job_ids, error_ids

    def register_new_job(self, job: Job, refresh: bool = None) -> None:
        """
        Registers a new Job with the manager and stores the job locally.
        This should only be invoked when a new Job gets started.

        Parameters:
        -----------
        job : biokbase.narrative.jobs.job.Job object
            The new Job that was started.
        """
        kblogging.log_event(self._log, "register_new_job", {"job_id": job.job_id})

        if refresh is None:
            refresh = not job.was_terminal()
        self._running_jobs[job.job_id] = {"job": job, "refresh": refresh}

        # add the new job to the _jobs_by_cell_id mapping if there is a cell_id present
        if job.cell_id:
            if job.cell_id not in self._jobs_by_cell_id:
                self._jobs_by_cell_id[job.cell_id] = set()

            self._jobs_by_cell_id[job.cell_id].add(job.job_id)
            if job.batch_id:
                self._jobs_by_cell_id[job.cell_id].add(job.batch_id)

    def initialize_jobs(self, cell_ids: List[str] = None) -> None:
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        It performs the following steps:
        1. gets the current workspace ID from app_util.system_variable('workspace_id')
        2. gets the list of jobs with that ws id from ee2 (also gets tag, cell_id, run_id)
        3. initializes the Job objects and adds them to the running jobs list
        4. starts the status lookup loop.
        """
        ws_id = system_variable("workspace_id")
        job_states = dict()
        kblogging.log_event(self._log, "JobManager.initialize_jobs", {"ws_id": ws_id})
        try:
            job_states = clients.get("execution_engine2").check_workspace_jobs(
                {
                    "workspace_id": ws_id,
                    "return_list": 0,  # do not remove
                    "exclude_fields": JOB_INIT_EXCLUDED_JOB_STATE_FIELDS,
                }
            )
        except Exception as e:
            kblogging.log_event(self._log, "init_error", {"err": str(e)})
            new_e = transform_job_exception(e, "Unable to initialize jobs")
            raise new_e

        self._running_jobs = dict()
        job_states = self._reorder_parents_children(job_states)
        for job_state in job_states.values():
            child_jobs = None
            if job_state.get("batch_job"):
                child_jobs = [
                    self.get_job(child_id)
                    for child_id in job_state.get("child_jobs", [])
                ]

            job = Job(job_state, children=child_jobs)

            # Set to refresh when job is not in terminal state
            # and when job is present in cells (if given)
            # and when it is not part of a batch
            refresh = not job.was_terminal() and not job.batch_id
            if cell_ids is not None:
                refresh = refresh and job.in_cells(cell_ids)

            self.register_new_job(job, refresh)

    def _create_jobs(self, job_ids) -> dict:
        """
        TODO: error handling
        Given a list of job IDs, creates job objects for them and populates the _running_jobs dictionary
        """
        job_ids = [job_id for job_id in job_ids if job_id not in self._running_jobs]
        if not len(job_ids):
            return {}

        job_states = clients.get("execution_engine2").check_jobs(
            {
                "job_ids": job_ids,
                "exclude_fields": JOB_INIT_EXCLUDED_JOB_STATE_FIELDS,
                "return_list": 0,
            }
        )
        for job_state in job_states.values():
            # do not set new jobs to be automatically refreshed - if the front end wants them
            # refreshed, it'll make a request.
            self.register_new_job(job=Job(job_state), refresh=False)

        return job_states

    def get_job(self, job_id):
        """
        Returns a Job with the given job_id.
        Raises a JobRequestException if not found.
        """
        if job_id not in self._running_jobs:
            raise JobRequestException(JOB_NOT_REG_ERR, job_id)
        return self._running_jobs[job_id]["job"]

    def _construct_job_output_state_set(
        self, job_ids: List[str], states: dict = None
    ) -> dict:
        """
        Builds a set of job states for the list of job ids.
        :param job_ids: list of job IDs (may be empty)
        :param states: dict, where each value is a job state from EE2
        """
        if not isinstance(job_ids, list):
            raise JobRequestException("job_ids must be a list")

        if not len(job_ids):
            return {}

        output_states = dict()
        jobs_to_lookup = list()

        # Fetch from cache of terminated jobs, where available.
        # These are already post-processed and ready to return.
        for job_id in job_ids:
            job = self.get_job(job_id)
            if job.was_terminal():
                output_states[job_id] = job.output_state()
            elif states and job_id in states:
                state = states[job_id]
                output_states[job_id] = job.output_state(state)
            else:
                jobs_to_lookup.append(job_id)

        fetched_states = dict()
        error_message = None  # set if the EE2 lookup below fails
        # Get the rest of the states directly from EE2.
        if len(jobs_to_lookup):
            try:
                fetched_states = clients.get("execution_engine2").check_jobs(
                    {
                        "job_ids": jobs_to_lookup,
                        "exclude_fields": EXCLUDED_JOB_STATE_FIELDS,
                        "return_list": 0,
                    }
                )
            except Exception as e:
                error_message = str(e)
                kblogging.log_event(
                    self._log,
                    "_construct_job_output_state_set",
                    {"exception": error_message},
                )

            # fill in the output states for the missing jobs
            # if the job fetch failed, add an error message to the output
            # and return the cached job state
            for job_id in jobs_to_lookup:
                job = self.get_job(job_id)
                if job_id in fetched_states:
                    output_states[job_id] = job.output_state(fetched_states[job_id])
                else:
                    # fetch the current state without updating it
                    output_states[job_id] = job.output_state({})
                    # add an error field with the message from the failed lookup
                    if error_message:
                        output_states[job_id]["error"] = error_message

        return output_states

    def get_job_states(self, job_ids: List[str]) -> dict:
        job_ids, error_ids = self._check_job_list(job_ids)
        output_states = self._construct_job_output_state_set(job_ids)
        return self.add_errors_to_results(output_states, error_ids)

    def get_all_job_states(self, ignore_refresh_flag=False) -> dict:
        """
        Fetches states for all running jobs.
        If ignore_refresh_flag is True, then returns states for all jobs this
        JobManager knows about (i.e. all jobs associated with the workspace).

        This returns them all as a dictionary, keyed on the job id.
        :param ignore_refresh_flag: boolean - if True, ignore the usual refresh state of the job.
            Even if the job is stopped, or completed, fetch and return its state from the service.
        """
        jobs_to_lookup = list()

        # grab the list of running job ids, so we don't run into update-while-iterating problems.
        for job_id in self._running_jobs.keys():
            if self._running_jobs[job_id]["refresh"] or ignore_refresh_flag:
                jobs_to_lookup.append(job_id)
        if len(jobs_to_lookup) > 0:
            return self._construct_job_output_state_set(jobs_to_lookup)
        return dict()

    def _get_job_ids_by_cell_id(self, cell_id_list: List[str] = None) -> tuple:
        """
        Finds jobs with a cell_id in cell_id_list
        Mappings of job ID to cell ID are added when new jobs are registered
        Returns a list of job IDs and a mapping of cell IDs to the list of
        job IDs associated with the cell.
        """
        if not cell_id_list:
            raise JobRequestException(CELLS_NOT_PROVIDED_ERR)

        cell_to_job_mapping = {
            cell_id: self._jobs_by_cell_id.get(cell_id, set())
            for cell_id in cell_id_list
        }
        # union of all the job_ids in the cell_to_job_mapping
        job_id_list = set().union(*cell_to_job_mapping.values())
        return (job_id_list, cell_to_job_mapping)

    def get_job_states_by_cell_id(self, cell_id_list: List[str] = None) -> dict:
        """
        Fetch job states for jobs with a cell_id in cell_id_list
        Returns a dictionary of job states keyed by job ID and a mapping of
        cell IDs to the list of job IDs associated with the cell.
        """
        (jobs_to_lookup, cell_to_job_mapping) = self._get_job_ids_by_cell_id(
            cell_id_list
        )
        job_states = {}
        if len(jobs_to_lookup) > 0:
            job_states = self._construct_job_output_state_set(list(jobs_to_lookup))

        return {"jobs": job_states, "mapping": cell_to_job_mapping}

    def get_job_info(self, job_ids: List[str]) -> dict:
        """
        Assembles job info for each job as packets of this shape:
        {
            app_id: module/name,
            app_name: string - the app's human-readable name,
            job_id: string,
            job_params: dictionary,
            batch_id: string,
        }
        The packet is replaced with the generic "job not found" message if the job_id doesn't exist.
        """
        job_ids, error_ids = self._check_job_list(job_ids)

        infos = dict()
        for job_id in job_ids:
            job = self.get_job(job_id)
            infos[job_id] = {
                "app_id": job.app_id,
                "app_name": job.app_name,
                "batch_id": job.batch_id,
                "job_id": job_id,
                "job_params": job.params,
            }
        return self.add_errors_to_results(infos, error_ids)

    def get_job_logs(
        self,
        job_id: str,
        first_line: int = 0,
        num_lines: int = None,
        latest: bool = False,
    ) -> dict:
        """
        :param job_id: str - the job id from the execution engine
        :param first_line: int - the first log line to request. 0-indexed. If < 0,
            this will be set to 0
        :param num_lines: int - the maximum number of lines to return.
            if < 0, will be reset to 0.
            if None, then will not be considered, and just return all the lines.
        :param latest: bool - if True, will only return the most recent num_lines
            of logs. This overrides the first_line parameter if set to True. If the call made is
            get_job_logs(id, first_line=0, num_lines=5, latest=True), and there are 100
            log lines available, then lines 96-100 will be returned.
        :returns: dict with keys:
            job_id:     string
            batch_id:   string | None
            first:      int - the first line returned
            latest:     bool - whether the latest lines were returned
            max_lines:  int - the number of logs lines currently available for that job
            lines:      list - the lines themselves, fresh from the server. These are all tiny dicts with keys
                "line" - the log line string
                "is_error" - either 0 or 1

            If there is an error when retrieving logs (e.g. the job
            has yet to start or it is a batch job and does not generate
            logs), the return structure will be:
                job_id:     string
                batch_id:   string | None
                error:      string - error message
        """
        job = self.get_job(job_id)

        if first_line < 0:
            first_line = 0
        if num_lines is not None and num_lines < 0:
            num_lines = 0

        try:
            if latest:
                (max_lines, logs) = job.log()
                if num_lines is None or max_lines <= num_lines:
                    first_line = 0
                else:
                    first_line = max_lines - num_lines
                    logs = logs[first_line:]
            else:
                (max_lines, logs) = job.log(first_line=first_line, num_lines=num_lines)

            return {
                "job_id": job.job_id,
                "batch_id": job.batch_id,
                "first": first_line,
                "latest": True if latest else False,
                "max_lines": max_lines,
                "lines": logs,
            }
        except Exception as e:
            return {
                "job_id": job.job_id,
                "batch_id": job.batch_id,
                "error": getattr(e, "message", str(e)),
            }

    def get_job_logs_for_list(
        self,
        job_id_list: List[str],
        first_line: int = 0,
        num_lines: int = None,
        latest: bool = False,
    ) -> dict:
        """
        Fetch the logs for a list of jobs. Note that the parameters supplied are applied to all jobs.
        """
        job_ids, error_ids = self._check_job_list(job_id_list)

        output = {}
        for job_id in job_ids:
            output[job_id] = self.get_job_logs(job_id, first_line, num_lines, latest)

        return self.add_errors_to_results(output, error_ids)

    def cancel_jobs(self, job_id_list: List[str]) -> dict:
        """
        Cancel a list of running jobs, placing them in a canceled state
        Does NOT delete the jobs.
        If the job_ids are not present or are not found in the Narrative,
        a JobRequestException is raised.

        Results are returned as a dict of job status objects keyed by job id

        :param job_id_list: list of strs
        :return job_states: dict with keys job IDs and values job state objects

        """
        job_ids, error_ids = self._check_job_list(job_id_list)
        error_states = dict()
        for job_id in job_ids:
            if not self.get_job(job_id).was_terminal():
                error = self._cancel_job(job_id)
                if error:
                    error_states[job_id] = error.message

        job_states = self._construct_job_output_state_set(job_ids)
        for job_id in error_states:
            job_states[job_id]["error"] = error_states[job_id]

        return self.add_errors_to_results(job_states, error_ids)

    def _cancel_job(self, job_id: str):
        # Stop updating the job status while we try to cancel.
        # Set the job to a special state of 'canceling' while we're doing the cancel
        is_refreshing = self._running_jobs[job_id].get("refresh", False)
        self._running_jobs[job_id]["refresh"] = False
        self._running_jobs[job_id]["canceling"] = True
        error = None
        try:
            clients.get("execution_engine2").cancel_job({"job_id": job_id})
        except Exception as e:
            error = transform_job_exception(e, "Unable to cancel job")
        self._running_jobs[job_id]["refresh"] = is_refreshing
        del self._running_jobs[job_id]["canceling"]
        return error

    def retry_jobs(self, job_id_list: List[str]) -> dict:
        """
        Returns
        [
            {
                "job_id": job_id,
                "job": {"state": {"job_id": job_id, "status": status, ...} ...},
                "retry_id": retry_id,
                "retry": {"state": {"job_id": retry_id, "status": status, ...} ...}
            },
            {
                "job": {"state": {"job_id": job_id, "status": status, ...} ...},
                "error": "..."
            }
            ...
            {
                "job": {"state": {"job_id": job_id, "status": DOES_NOT_EXIST}},
                "error": f"Cannot find job with ID {job_id}",
            }
        ]
        where each inner "state" dictionary is a job state from ee2, nested within
        the job state structure returned by job.output_state()
        """
        job_ids, error_ids = self._check_job_list(job_id_list)
        try:
            retry_results = clients.get("execution_engine2").retry_jobs(
                {"job_ids": job_ids}
            )
        except Exception as e:
            raise transform_job_exception(e, "Unable to retry job(s)")
        # for each retry result, refresh the state of the retried and new jobs
        orig_ids = [result["job_id"] for result in retry_results]
        retry_ids = [
            result["retry_id"] for result in retry_results if "retry_id" in result
        ]
        orig_states = self._construct_job_output_state_set(orig_ids)
        retry_states = self._construct_job_output_state_set(
            retry_ids, self._create_jobs(retry_ids)  # add to self._running_jobs index
        )
        job_states = {**orig_states, **retry_states}

        results_by_job_id = {}
        # fill in the job state details
        for result in retry_results:
            job_id = result["job_id"]
            results_by_job_id[job_id] = {"job_id": job_id, "job": job_states[job_id]}
            if "retry_id" in result:
                retry_id = result["retry_id"]
                results_by_job_id[job_id]["retry_id"] = retry_id
                results_by_job_id[job_id]["retry"] = job_states[retry_id]
            if "error" in result:
                results_by_job_id[job_id]["error"] = result["error"]
        return self.add_errors_to_results(results_by_job_id, error_ids)

    def add_errors_to_results(self, results: dict, error_ids: List[str]) -> dict:
        """
        Add the generic "not found" error for each job_id in error_ids
        """
        for error_id in error_ids:
            results[error_id] = {
                "job_id": error_id,
                "error": f"Cannot find job with ID {error_id}",
            }
        return results

    def modify_job_refresh(self, job_ids: List[str], update_refresh: bool) -> None:
        """
        Sets the "refresh" flag for the given jobs, which controls whether the
        status lookup loop includes them when fetching updates.
        Jobs that are not present in the _running_jobs dictionary are ignored.
        """
        job_ids, _ = self._check_job_list(job_ids)

        for job_id in job_ids:
            self._running_jobs[job_id]["refresh"] = update_refresh

    def update_batch_job(self, batch_id: str) -> List[str]:
        """
        Update a batch job and create child jobs if necessary
        """
        batch_job = self.get_job(batch_id)
        if not batch_job.batch_job:
            raise JobRequestException(JOB_NOT_BATCH_ERR, batch_id)

        child_ids = batch_job.child_jobs

        reg_child_jobs = []
        unreg_child_ids = []
        for job_id in child_ids:
            if job_id in self._running_jobs:
                reg_child_jobs.append(self.get_job(job_id))
            else:
                unreg_child_ids.append(job_id)

        unreg_child_jobs = []
        if unreg_child_ids:
            unreg_child_jobs = Job.from_job_ids(unreg_child_ids)
            for job in unreg_child_jobs:
                self.register_new_job(
                    job=job,
                    refresh=not job.was_terminal(),
                )

        batch_job.update_children(reg_child_jobs + unreg_child_jobs)

        return [batch_id] + child_ids

    def list_jobs(self):
        """
        List all job ids, their info, and status in a quick HTML format.
        """
        try:
            all_states = self.get_all_job_states(ignore_refresh_flag=True)
            state_list = [copy.deepcopy(s["jobState"]) for s in all_states.values()]

            if not len(state_list):
                return "No running jobs!"

            state_list = sorted(state_list, key=lambda s: s.get("created", 0))
            for state in state_list:
                job = self.get_job(state["job_id"])
                state["created"] = datetime.fromtimestamp(
                    state["created"] / 1000.0
                ).strftime("%Y-%m-%d %H:%M:%S")
                state["run_time"] = "Not started"
                state["user"] = job.user
                state["app_id"] = job.app_id
                state["batch_id"] = job.batch_id
                exec_start = state.get("running", None)

                if state.get("finished"):
                    finished_time = datetime.fromtimestamp(
                        state.get("finished") / 1000.0
                    )
                    state["finish_time"] = finished_time.strftime("%Y-%m-%d %H:%M:%S")
                    if exec_start:
                        exec_start_time = datetime.fromtimestamp(exec_start / 1000.0)
                        delta = finished_time - exec_start_time
                        delta = delta - timedelta(microseconds=delta.microseconds)
                        state["run_time"] = str(delta)
                elif exec_start:
                    exec_start_time = datetime.fromtimestamp(
                        exec_start / 1000.0
                    ).replace(tzinfo=timezone.utc)
                    delta = datetime.now(timezone.utc) - exec_start_time
                    delta = delta - timedelta(microseconds=delta.microseconds)
                    state["run_time"] = str(delta)

            tmpl = """
            <table class="table table-bordered table-striped table-condensed">
                <tr>
                    <th>Id</th>
                    <th>Name</th>
                    <th>Submitted</th>
                    <th>Batch ID</th>
                    <th>Submitted By</th>
                    <th>Status</th>
                    <th>Run Time</th>
                    <th>Complete Time</th>
                </tr>
                {% for j in jobs %}
                <tr>
                    <td class="job_id">{{ j.job_id|e }}</td>
                    <td class="app_id">{{ j.app_id|e }}</td>
                    <td class="created">{{ j.created|e }}</td>
                    <td class="batch_id">{{ j.batch_id|e }}</td>
                    <td class="user">{{ j.user|e }}</td>
                    <td class="status">{{ j.status|e }}</td>
                    <td class="run_time">{{ j.run_time|e }}</td>
                    <td class="finish_time">{% if j.finish_time %}{{ j.finish_time|e }}{% else %}Incomplete{% endif %}</td>
                </tr>
                {% endfor %}
            </table>
            """
            return HTML(Template(tmpl).render(jobs=state_list))

        except Exception as e:
            kblogging.log_event(self._log, "list_jobs.error", {"err": str(e)})
            raise
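
A hedged usage sketch of the manager above; the methods come from the class itself, the job id is a placeholder, and initialize_jobs() assumes the Narrative is linked to a workspace:

jm = JobManager()                      # singleton: always the same instance
jm.initialize_jobs()                   # load jobs for the current workspace
states = jm.get_all_job_states(ignore_refresh_flag=True)
logs = jm.get_job_logs("some_job_id", num_lines=5, latest=True)  # placeholder id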
Example #7
class AppManager(object):
    """
    The main class for managing how KBase apps get run. This contains functions
    for showing app descriptions, their usage (how to invoke various
    parameters), and, ultimately, for running the app.

    A typical flow might be like this.
    am = AppManager()
    am.available_apps()
        # show the set of apps with a brief description of each.
    am.app_usage(app_id)
        # show how to use an app and set its parameters.
    job = am.run_app(app_id, input1=value1, input2=value2, ...)
        # run an app with given inputs.
    """
    __instance = None

    __MAX_TOKEN_NAME_LEN = 30

    spec_manager = specmanager.SpecManager()
    _log = kblogging.get_logger(__name__)
    _comm = None
    viewer_count = 1

    def __new__(cls):
        if AppManager.__instance is None:
            AppManager.__instance = object.__new__(cls)
            AppManager.__instance._comm = None
        return AppManager.__instance

    def reload(self):
        """
        Reloads all app specs into memory from the App Catalog.
        Any calls to app_usage, app_description, or available_apps
        should be rerun after the update.
        """
        self.spec_manager.reload()

    def app_usage(self, app_id, tag='release'):
        """
        This shows the list of inputs and outputs for a given app with a given
        tag. By default, this is rendered as pretty HTML, but the output can be
        wrapped in str() to get a plain formatted string.

        If the app_id is unknown, isn't found with the given release tag,
        or the tag itself is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev
            (default=release)
        """
        return self.spec_manager.app_usage(app_id, tag)

    def app_description(self, app_id, tag='release'):
        """
        Returns the app description in a printable HTML format.

        If the app_id is unknown, isn't found with the given release tag,
        or the tag itself is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev
            (default=release)
        """
        return self.spec_manager.app_description(app_id, tag)

    def available_apps(self, tag="release"):
        """
        Lists the set of available apps for a given tag in a simple table.
        If the tag is not found, a ValueError will be raised.

        Parameters:
        -----------
        tag : Which version of the list of apps to view - either release, beta,
            or dev (default=release)

        """
        return self.spec_manager.available_apps(tag)

    def run_app_batch(self,
                      app_id,
                      params,
                      tag="release",
                      version=None,
                      cell_id=None,
                      run_id=None,
                      dry_run=False):
        try:
            if params is None:
                params = list()
            return self._run_app_batch_internal(app_id, params, tag, version,
                                                cell_id, run_id, dry_run)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc()
            e_trace = e_trace.replace('<', '&lt;').replace('>', '&gt;')
            e_code = getattr(e, 'code', -1)
            e_source = getattr(e, 'source', 'appmanager')
            self._send_comm_message(
                'run_status', {
                    'event': 'error',
                    'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                    'cell_id': cell_id,
                    'run_id': run_id,
                    'error_message': e_message,
                    'error_type': e_type,
                    'error_stacktrace': e_trace,
                    'error_code': e_code,
                    'error_source': e_source
                })
            print("Error while trying to start your app (run_app_batch)!\n" +
                  "-----------------------------------------------------\n" +
                  str(e) + "\n" +
                  "-----------------------------------------------------\n" +
                  e_trace)
            return

    def _run_app_batch_internal(self, app_id, params, tag, version, cell_id,
                                run_id, dry_run):
        batch_method = "kb_BatchApp.run_batch"
        batch_app_id = "kb_BatchApp/run_batch"
        batch_method_ver = "dev"
        batch_method_tag = "dev"
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        # A list of lists of UPAs, used for each subjob.
        batch_ws_upas = list()
        # The list of actual input values, post-mapping.
        batch_run_inputs = list()

        spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                               for i in range(len(spec_params)))
        for param_set in params:
            batch_ws_upas.append(
                extract_ws_refs(app_id, tag, spec_params, param_set))
            batch_run_inputs.append(
                self._map_inputs(spec['behavior']['kb_service_input_mapping'],
                                 param_set, spec_params_map))

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        job_meta = {
            'tag': batch_method_tag,
            'batch_app': app_id,
            'batch_tag': tag,
            'batch_size': len(params),
        }
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # Now put these all together in a way that can be sent to the batch processing app.
        batch_params = [{
            "module_name": service_name,
            "method_name": service_method,
            "service_ver": service_ver,
            "wsid": ws_id,
            "meta": job_meta,
            "batch_params": [{
                "params": batch_run_inputs[i],
                "source_ws_objects": batch_ws_upas[i]
            } for i in range(len(batch_run_inputs))],
        }]

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(),
                                           token_name=token_name)

        job_meta['token_id'] = agent_token['id']
        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': batch_method,
            'service_ver': batch_method_ver,
            'params': batch_params,
            'app_id': batch_app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        # if len(ws_input_refs) > 0:
        #     job_runner_inputs['source_ws_objects'] = ws_input_refs

        # if we're doing a dry run, just return the inputs that we made.
        if dry_run:
            return job_runner_inputs

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': batch_method_tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_batch_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2",
                token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_batch_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      batch_app_id,
                      batch_params,
                      system_variable('user_id'),
                      tag=batch_method_tag,
                      app_version=batch_method_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'],
                      meta=job_meta)

        self._send_comm_message(
            'run_status', {
                'event': 'launched_job',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'job_id': job_id
            })
        self.register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job

    def run_app(self,
                app_id,
                params,
                tag="release",
                version=None,
                cell_id=None,
                run_id=None,
                dry_run=False):
        """
        Attempts to run the app, returns a Job with the running app info.
        If this is given a cell_id, then returns None. If not, it returns the
        generated Job object.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - the dictionary of parameters to be used with the app.
                 They can be found by using the app_usage function. If any
                 non-optional parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.

        Example:
        --------
        run_app('MegaHit/run_megahit',
                {
                    'read_library_name' : 'My_PE_Library',
                    'output_contigset_name' : 'My_Contig_Assembly'
                },
                version='>=1.0.0'
        )
        """

        try:
            if params is None:
                params = dict()
            return self._run_app_internal(app_id, params, tag, version,
                                          cell_id, run_id, dry_run)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc()
            e_trace = e_trace.replace('<', '&lt;').replace('>', '&gt;')
            e_code = getattr(e, 'code', -1)
            e_source = getattr(e, 'source', 'appmanager')
            self._send_comm_message(
                'run_status', {
                    'event': 'error',
                    'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                    'cell_id': cell_id,
                    'run_id': run_id,
                    'error_message': e_message,
                    'error_type': e_type,
                    'error_stacktrace': e_trace,
                    'error_code': e_code,
                    'error_source': e_source
                })
            print("Error while trying to start your app (run_app)!\n" +
                  "-----------------------------------------------\n" +
                  str(e) + "\n" +
                  "-----------------------------------------------\n" +
                  e_trace)
            return

    def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id,
                          dry_run):
        """
        Attempts to run the app, returns a Job with the running app info.
        Should *hopefully* also inject that app into the Narrative's metadata.
        Probably need some kind of JavaScript-foo to get that to work.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - a dictionary of parameters to be used with the app. They can
                 be found by using the app_usage function. If any non-optional
                 parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.
        """
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                               for i in range(len(spec_params)))
        ws_input_refs = extract_ws_refs(app_id, tag, spec_params, params)
        input_vals = self._map_inputs(
            spec['behavior']['kb_service_input_mapping'], params,
            spec_params_map)

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + '.' + service_method
        job_meta = {'tag': tag}
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': function_name,
            'service_ver': service_ver,
            'params': input_vals,
            'app_id': app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        if len(ws_input_refs) > 0:
            job_runner_inputs['source_ws_objects'] = ws_input_refs
        if dry_run:
            return job_runner_inputs

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(),
                                           token_name=token_name)
        job_runner_inputs['meta']['token_id'] = agent_token['id']

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2",
                token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      app_id,
                      input_vals,
                      system_variable('user_id'),
                      tag=tag,
                      app_version=service_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'])

        self._send_comm_message(
            'run_status', {
                'event': 'launched_job',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'job_id': job_id
            })
        self.register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job
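
    # Editor's note: a hedged usage sketch, not part of the original module.
    # As the code above shows, dry_run=True makes run_app return the
    # assembled job-runner input payload instead of submitting it, which is
    # useful for inspecting the parameter mapping. App id and params below
    # are illustrative only.
    #
    #   am = AppManager()
    #   payload = am.run_app('MegaHit/run_megahit',
    #                        {'read_library_name': 'My_PE_Library',
    #                         'output_contigset_name': 'My_Contigs'},
    #                        dry_run=True)
    #   # payload['method'] holds the backend 'Module.method' name and
    #   # payload['params'] the mapped input list built by _map_inputs.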

    def run_local_app(self,
                      app_id,
                      params,
                      tag="release",
                      version=None,
                      cell_id=None,
                      run_id=None,
                      widget_state=None):
        """
        Attempts to run a local app. These do not return a Job object, but just
        the result of the app. In most cases, this will be a Javascript display
        of the result, but could be anything.

        If the app_spec looks like it makes a service call, then this raises a
        ValueError. Otherwise, it validates each parameter in params against
        the app spec, executes it, and returns the result.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'view_expression_profile'
        params - the dictionary of parameters for the app. Should be key-value
                 pairs where the keys are strings. If any non-optional
                 parameters are missing, an informative string will be printed.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.

        Example:
        run_local_app('NarrativeViewers/view_expression_profile',
                      {
                          "input_expression_matrix": "MyMatrix",
                          "input_gene_ids": "1234"
                      },
                      version='0.0.1')
        """
        try:
            if params is None:
                params = dict()
            return self._run_local_app_internal(app_id, params, widget_state,
                                                tag, version, cell_id, run_id)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc()
            e_trace = e_trace.replace('<', '&lt;').replace('>', '&gt;')
            self._send_comm_message(
                'run_status', {
                    'event': 'error',
                    'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                    'cell_id': cell_id,
                    'run_id': run_id,
                    'error_message': e_message,
                    'error_type': e_type,
                    'error_stacktrace': e_trace
                })
            # raise
            print("Error while trying to start your app (run_local_app)!\n" +
                  "-------------------------------------\n" + str(e))

    def _run_local_app_internal(self, app_id, params, widget_state, tag,
                                version, cell_id, run_id):
        self._send_comm_message(
            'run_status', {
                'event': 'validating_app',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id
            })

        spec = self._get_validated_app_spec(app_id,
                                            tag,
                                            False,
                                            version=version)

        # Here, we just deal with two behaviors:
        # 1. None of the above - it's a viewer.
        # 2. ***TODO*** python_class / python_function.
        #    Import and exec the python code.

        # for now, just map the inputs to outputs.
        # First, validate.
        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)
        (params, ws_refs) = validate_parameters(app_id, tag, spec_params,
                                                params)

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'username': system_variable('user_id'),
            'ws': system_variable('workspace')
        }
        kblogging.log_event(self._log, "run_local_app", log_info)

        self._send_comm_message(
            'run_status', {
                'event': 'success',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id
            })

        (output_widget, widget_params) = map_outputs_from_state([], params,
                                                                spec)

        # All a local app does is route the inputs to outputs through the
        # spec's mapping, and then feed that into the specified output widget.
        wm = WidgetManager()
        if widget_state is not None:
            return wm.show_advanced_viewer_widget(output_widget,
                                                  widget_params,
                                                  widget_state,
                                                  cell_id=cell_id,
                                                  tag=tag)
        else:
            return wm.show_output_widget(output_widget,
                                         widget_params,
                                         cell_id=cell_id,
                                         tag=tag)

    def run_local_app_advanced(self,
                               app_id,
                               params,
                               widget_state,
                               tag="release",
                               version=None,
                               cell_id=None,
                               run_id=None):
        return self.run_local_app(app_id,
                                  params,
                                  widget_state=widget_state,
                                  tag=tag,
                                  version=version,
                                  cell_id=cell_id,
                                  run_id=run_id)

    def run_dynamic_service(self,
                            app_id,
                            params,
                            tag="release",
                            version=None,
                            cell_id=None,
                            run_id=None):
        """
        Attempts to run a local app. These do not return a Job object, but just
        the result of the app. In most cases, this will be a Javascript display
        of the result, but could be anything.

        If the app_spec looks like it makes a service call, then this raises a ValueError.
        Otherwise, it validates each parameter in **kwargs against the app spec, executes it, and
        returns the result.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'view_expression_profile'
        params - the dictionary of parameters for the app. Should be key-value
                 pairs where they keys are strings. If any non-optional
                 parameters are missing, an informative string will be printed.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.
        **kwargs - these are the set of parameters to be used with the app.
                   They can be found by using the app_usage function. If any
                   non-optional apps are missing, a ValueError will be raised.

        Example:
        run_local_app('NarrativeViewers/view_expression_profile', version='0.0.1',
                      input_expression_matrix="MyMatrix", input_gene_ids="1234")
        """
        try:
            if params is None:
                params = dict()
            return self._run_dynamic_service_internal(app_id, params, tag,
                                                      version, cell_id, run_id)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace(
                '>', '&gt;')

            if cell_id:
                self.send_cell_message(
                    'result', cell_id, run_id, {
                        'error': {
                            'message': e_message,
                            'type': e_type,
                            'stacktrace': e_trace
                        }
                    })
            else:
                print("Error while trying to start your app (run_local_app)!" +
                      "\n-------------------------------------\n" + str(e))

    def _run_dynamic_service_internal(self, app_id, params, tag, version,
                                      cell_id, run_id):
        spec = self._get_validated_app_spec(app_id,
                                            tag,
                                            False,
                                            version=version)

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'username': system_variable('user_id'),
            'ws': system_variable('workspace')
        }
        kblogging.log_event(self._log, "run_dynamic_service", log_info)

        # Silly to keep this here, but: we do not validate the incoming
        # parameters. If they are provided by the UI (we have a cell_id),
        # they are constructed according to the spec, so they are trusted.
        # Otherwise, they are the product of direct code cell entry, a mode
        # we do not "support", so we can let it fail hard.
        # In the future, when code cell interaction is supported for users,
        # we will need to provide robust validation and error reporting, but
        # that may end up being (should be) provided by the SDK execution
        # infrastructure anyway.

        input_vals = params
        function_name = spec['behavior']['kb_service_name'] + '.' + spec[
            'behavior']['kb_service_method']
        result = clients.get("service").sync_call(function_name,
                                                  input_vals,
                                                  service_version=tag)[0]
        # If this is a UI call (a cell_id is defined), we send a result
        # message; otherwise we return the raw result for display in a code
        # cell. This is how we "support" code cells for internal usage.
        if cell_id:
            self.send_cell_message('result', cell_id, run_id,
                                   {'result': result})
        else:
            return result

    def send_cell_message(self, message_id, cell_id, run_id, message):
        address = {
            'cell_id': cell_id,
            'run_id': run_id,
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z'
        }

        self._send_comm_message(message_id, {
            'address': address,
            'message': message
        })
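
    # Editor's sketch of the resulting comm payload (field values are
    # hypothetical; the structure mirrors the method above):
    #
    #   {
    #       'address': {
    #           'cell_id': 'abc-123',          # hypothetical
    #           'run_id': 'run-1',             # hypothetical
    #           'event_at': '2021-01-01T00:00:00.000000Z'
    #       },
    #       'message': {'result': ...}
    #   }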

    def _get_validated_app_spec(self, app_id, tag, is_long, version=None):
        if version is not None and tag != "release":
            if re.match(r'\d+\.\d+\.\d+', version) is not None:
                raise ValueError(
                    "Semantic versions only apply to released app modules. " +
                    "You can use a Git commit hash instead to specify a " +
                    "version.")
        self.spec_manager.check_app(app_id, tag, raise_exception=True)
        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)
        if 'behavior' not in spec:
            raise ValueError(
                "This app appears invalid - it has no defined behavior")
        if 'script_module' in spec['behavior'] or 'script_name' in spec[
                'behavior']:
            # It's an old NJS script. These don't work anymore.
            raise ValueError(
                'This app relies on a service that is now obsolete. Please contact '
                + 'the administrator.')
        if is_long and 'kb_service_input_mapping' not in spec['behavior']:
            raise ValueError("This app does not appear to be a long-running " +
                             "job! Please use 'run_local_app' to start this " +
                             "instead.")
        return spec
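
    # Editor's sketch of the tag/version guard above (hypothetical call):
    #
    #   am._get_validated_app_spec('MegaHit/run_megahit', 'dev', True,
    #                              version='1.0.0')
    #   # raises ValueError: semantic versions only apply to released
    #   # modules; a Git commit hash can be used to pin a dev/beta version.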

    def _map_group_inputs(self, value, spec_param, spec_params):
        if isinstance(value, list):
            return [
                self._map_group_inputs(v, spec_param, spec_params)
                for v in value
            ]
        elif value is None:
            return None
        else:
            mapped_value = dict()
            id_map = spec_param.get('id_mapping', {})
            for param_id in id_map:
                # ensure that the param referenced in the group param list
                # exists in the spec.
                # NB: This should really never happen if the sdk registration
                # process validates them.
                if param_id not in spec_params:
                    msg = "Unknown parameter id in group mapping: " + param_id
                    raise ValueError(msg)
            for param_id in value:
                target_key = id_map.get(param_id, param_id)
                # Sets either the raw value or, if the parameter is an
                # object reference, the full object reference (see the
                # method).
                if value[param_id] is None:
                    target_val = None
                else:
                    target_val = resolve_ref_if_typed(value[param_id],
                                                      spec_params[param_id])

                mapped_value[target_key] = target_val
            return mapped_value
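
    # Editor's sketch of the group mapping above; the parameter ids and
    # mapping are hypothetical. Given a group value and a spec parameter
    # with an 'id_mapping':
    #
    #   value      = {'inner_id': 'MyObject'}
    #   spec_param = {'id_mapping': {'inner_id': 'service_key'}}
    #   # -> {'service_key': <resolved value>}, where the value is passed
    #   #    through resolve_ref_if_typed, so typed object names become
    #   #    full workspace references.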

    def _map_inputs(self, input_mapping, params, spec_params):
        """
        Maps the dictionary of parameters and inputs based on rules provided in
        the input_mapping. This iterates over the list of input_mappings, and
        uses them as a filter to apply to each parameter.

        Returns a list of inputs that can be passed directly to NJSW.run_job

        input_mapping is a list of dicts, as defined by
        NarrativeMethodStore.ServiceMethodInputMapping.
        params is a dict of key-value-pairs, each key is the input_parameter
        field of some parameter.
        """
        inputs_dict = dict()
        for p in input_mapping:
            # 2 steps - figure out the proper value, then figure out the
            # proper position. value first!
            p_value = None
            input_param_id = None
            if 'input_parameter' in p:
                input_param_id = p['input_parameter']
                p_value = params.get(input_param_id, None)
                if spec_params[input_param_id].get('type', '') == 'group':
                    p_value = self._map_group_inputs(
                        p_value, spec_params[input_param_id], spec_params)
                # turn empty strings into None
                if isinstance(p_value, str) and len(p_value) == 0:
                    p_value = None
            elif 'narrative_system_variable' in p:
                p_value = system_variable(p['narrative_system_variable'])
            if 'constant_value' in p and p_value is None:
                p_value = p['constant_value']
            if 'generated_value' in p and p_value is None:
                p_value = self._generate_input(p['generated_value'])

            spec_param = None
            if input_param_id:
                spec_param = spec_params[input_param_id]
            p_value = transform_param_value(p.get('target_type_transform'),
                                            p_value, spec_param)

            # get position!
            arg_position = p.get('target_argument_position', 0)
            target_prop = p.get('target_property', None)
            if target_prop is not None:
                final_input = inputs_dict.get(arg_position, dict())
                if '/' in target_prop:
                    # This is case when slashes in target_prop separate
                    # elements in nested maps. We ignore escaped slashes
                    # (separate backslashes should be escaped as well).
                    bck_slash = "\u244A"
                    fwd_slash = "\u20EB"
                    temp_string = target_prop.replace("\\\\", bck_slash)
                    temp_string = temp_string.replace("\\/", fwd_slash)
                    temp_path = []
                    for part in temp_string.split("/"):
                        part = part.replace(bck_slash, "\\")
                        part = part.replace(fwd_slash, "/")
                        temp_path.append(
                            part.encode('ascii', 'ignore').decode("ascii"))
                    temp_map = final_input
                    temp_key = None
                    # We're going along the path and creating intermediate
                    # dictionaries.
                    for temp_path_item in temp_path:
                        if temp_key:
                            if temp_key not in temp_map:
                                temp_map[temp_key] = {}
                            temp_map = temp_map[temp_key]
                        temp_key = temp_path_item
                    # temp_map points to deepest nested map now, temp_key is
                    # the last item in the path
                    temp_map[temp_key] = p_value
                else:
                    final_input[target_prop] = p_value
                inputs_dict[arg_position] = final_input
            else:
                inputs_dict[arg_position] = p_value

        inputs_list = list()
        keys = sorted(inputs_dict.keys())
        for k in keys:
            inputs_list.append(inputs_dict[k])
        return inputs_list
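
    # Editor's worked sketch of one input_mapping rule (names follow
    # NarrativeMethodStore.ServiceMethodInputMapping; values hypothetical):
    #
    #   p = {'input_parameter': 'output_name',
    #        'target_argument_position': 0,
    #        'target_property': 'params/output_name'}
    #   # With params = {'output_name': 'MyContigs'}, the slash in
    #   # target_property creates a nested dict, so positional argument 0
    #   # becomes {'params': {'output_name': 'MyContigs'}}.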

    def _generate_input(self, generator):
        """
        Generates an input value using rules given by
        NarrativeMethodStore.AutoGeneratedValue.
        generator - dict
            has 3 optional properties:
            prefix - if present, is prepended to the generated string.
            symbols - if present is the number of symbols to autogenerate (if
                      not present, default=8)
            suffix - if present, is appended to the generated string.
        So, if generator is None or an empty dict, returns an 8-symbol string.
        """
        if generator is None:
            generator = {}
        symbols = 8
        if 'symbols' in generator:
            try:
                symbols = int(generator['symbols'])
            except:
                raise ValueError(
                    'The "symbols" input to the generated value must be an ' +
                    'integer > 0!')
        if symbols < 1:
            raise ValueError(
                'Must have at least 1 symbol to randomly generate!')
        ret = ''.join(
            [chr(random.randrange(0, 26) + ord('A')) for _ in range(symbols)])
        if 'prefix' in generator:
            ret = str(generator['prefix']) + ret
        if 'suffix' in generator:
            ret = ret + str(generator['suffix'])
        return ret
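
    # Editor's examples of the generator rules above (outputs are random,
    # so the values shown are only illustrative):
    #
    #   self._generate_input({})                 # e.g. 'QWJXKZOP' (8 chars)
    #   self._generate_input({'prefix': 'obj_',
    #                         'symbols': 4,
    #                         'suffix': '.out'}) # e.g. 'obj_KQZT.out'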

    def _send_comm_message(self, msg_type, content):
        JobComm().send_comm_message(msg_type, content)

    def register_new_job(self, job: Job) -> None:
        JobManager().register_new_job(job)
        self._send_comm_message("new_job", {"job_id": job.job_id})
        JobComm().lookup_job_state(job.job_id)
        JobComm().start_job_status_loop()
Exemplo n.º 10
0
class JobManager(object):
    """
    The KBase Job Manager class. This handles all jobs and makes their status available.
    On status lookups, it feeds the results to the KBaseJobs channel that the front end
    listens to.
    """
    __instance = None

    # keys = job_id, values = { refresh = T/F, job = Job object }
    _running_jobs = dict()
    # keys = job_id, values = state from either Job object or NJS (these are identical)
    _completed_job_states = dict()

    _log = kblogging.get_logger(__name__)

    def __new__(cls):
        if JobManager.__instance is None:
            JobManager.__instance = object.__new__(cls)
        return JobManager.__instance

    def initialize_jobs(self):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
        4. start the status lookup loop.
        """
        ws_id = system_variable("workspace_id")
        job_states = dict()
        kblogging.log_event(self._log, "JobManager.initialize_jobs",
                            {'ws_id': ws_id})
        try:
            job_states = clients.get('execution_engine2').check_workspace_jobs(
                {
                    'workspace_id': ws_id,
                    'return_list': 0
                })
            self._running_jobs = dict()
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            raise new_e

        for job_id, job_state in job_states.items():
            job_input = job_state.get('job_input', {})
            job_meta = job_input.get('narrative_cell_info', {})
            status = job_state.get('status')
            job = Job.from_state(job_id,
                                 job_input,
                                 job_state.get('user'),
                                 app_id=job_input.get('app_id'),
                                 tag=job_meta.get('tag', 'release'),
                                 cell_id=job_meta.get('cell_id', None),
                                 run_id=job_meta.get('run_id', None),
                                 token_id=job_meta.get('token_id', None),
                                 meta=job_meta)
            self._running_jobs[job_id] = {
                'refresh': 1 if status not in ['completed', 'errored',
                                               'terminated'] else 0,
                'job': job
            }

    def _create_jobs(self, job_ids):
        """
        TODO: error handling
        Makes a bunch of Job objects from job_ids.
        Initially used to make Child jobs from some parent, but will eventually be adapted to all jobs on startup.
        Just slaps them all into _running_jobs
        """
        job_states = clients.get('execution_engine2').check_jobs({
            'job_ids': job_ids,
            'return_list': 0
        })
        for job_id in job_ids:
            if job_id not in self._running_jobs:
                job_state = job_states.get(job_id, {})
                user = job_state.get('user')
                job_info = job_state.get('job_input', {})
                job_meta = job_info.get('narrative_cell_info', {})
                job = Job.from_state(
                    job_id,  # the id
                    job_info,  # params, etc.
                    user,  # owner id
                    app_id=job_info.get('app_id', job_info.get('method')),
                    tag=job_meta.get('tag', 'release'),
                    cell_id=job_meta.get('cell_id', None),
                    run_id=job_meta.get('run_id', None),
                    token_id=job_meta.get('token_id', None),
                    meta=job_meta)

                # Note that when jobs for this narrative are initially loaded,
                # they are set to not be refreshed. Rather, if a client requests
                # updates via the start_job_update message, the refresh flag will
                # be set to True.
                self._running_jobs[job_id] = {'refresh': 0, 'job': job}

    def list_jobs(self):
        """
        List all job ids, their info, and status in a quick HTML format.
        """
        try:
            all_statuses = self.lookup_all_job_states(ignore_refresh_flag=True)
            state_list = [s["state"] for s in all_statuses.values()]

            if not len(state_list):
                return "No running jobs!"

            state_list = sorted(state_list, key=lambda s: s.get('created', 0))
            for state in state_list:
                job = self.get_job(state["job_id"])
                state['created'] = datetime.fromtimestamp(
                    state['created'] / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
                state['run_time'] = 'Not started'
                state["owner"] = job.owner
                state["app_id"] = job.app_id
                exec_start = state.get('running', None)

                if state.get('finished'):
                    finished_time = datetime.fromtimestamp(
                        state.get('finished') / 1000.0)
                    state['finish_time'] = finished_time.strftime(
                        "%Y-%m-%d %H:%M:%S")
                    if exec_start:
                        exec_start_time = datetime.fromtimestamp(exec_start /
                                                                 1000.0)
                        delta = finished_time - exec_start_time
                        delta = delta - timedelta(
                            microseconds=delta.microseconds)
                        state['run_time'] = str(delta)
                elif exec_start:
                    exec_start_time = datetime.fromtimestamp(
                        exec_start / 1000.0).replace(tzinfo=timezone.utc)
                    delta = datetime.now(timezone.utc) - exec_start_time
                    delta = delta - timedelta(microseconds=delta.microseconds)
                    state['run_time'] = str(delta)

            tmpl = """
            <table class="table table-bordered table-striped table-condensed">
                <tr>
                    <th>Id</th>
                    <th>Name</th>
                    <th>Submitted</th>
                    <th>Submitted By</th>
                    <th>Status</th>
                    <th>Run Time</th>
                    <th>Complete Time</th>
                </tr>
                {% for j in jobs %}
                <tr>
                    <td>{{ j.job_id|e }}</td>
                    <td>{{ j.app_id|e }}</td>
                    <td>{{ j.created|e }}</td>
                    <td>{{ j.owner|e }}</td>
                    <td>{{ j.status|e }}</td>
                    <td>{{ j.run_time|e }}</td>
                    <td>{% if j.finish_time %}{{ j.finish_time|e }}{% else %}Incomplete{% endif %}</td>
                </tr>
                {% endfor %}
            </table>
            """
            return HTML(Template(tmpl).render(jobs=state_list))

        except Exception as e:
            kblogging.log_event(self._log, "list_jobs.error", {'err': str(e)})
            raise

    def _create_error_state(self,
                            error: str,
                            error_msg: str,
                            code: int,
                            cell_id=None,
                            run_id=None,
                            job_id=None) -> dict:
        """
        Creates an error state to return if
        1. the state is missing or unretrievable
        2. the Job is None
        This creates the whole state dictionary to return, as described in
        _construct_job_status.
        :param error: the full, detailed error (not necessarily human-readable, maybe a stacktrace)
        :param error_msg: a shortened error string, meant to be human-readable
        :param code: int, an error code
        """
        return {
            "status": "error",
            "error": {
                "code": code,
                "name": "Job Error",
                "message": error_msg,
                "error": error
            },
            "errormsg": error_msg,
            "error_code": code,
            "job_id": job_id,
            "cell_id": cell_id,
            "run_id": run_id,
            "created": 0,
            "updated": 0
        }

    def _construct_job_status(self, job: Job, state: dict) -> dict:
        """
        Creates a Job status dictionary with structure:
        {
            owner: string (username, who started the job),
            spec: app spec (optional)
            widget_info: (if not finished, None, else...) job.get_viewer_params result
            state: {
                job_id: string,
                status: string,
                created: epoch ms,
                updated: epoch ms,
                queued: optional - epoch ms,
                finished: optional - epoc ms,
                terminated_code: optional - int,
                tag: string (release, beta, dev),
                parent_job_id: optional - string or null,
                run_id: string,
                cell_id: string,
                errormsg: optional - string,
                error (optional): {
                    code: int,
                    name: string,
                    message: string (should be for the user to read),
                    error: string, (likely a stacktrace)
                },
                error_code: optional - int
            }
        }
        :param job: a Job object
        :param state: dict, expected to be in the format that comes straight from the
            Execution Engine 2 service
        """
        widget_info = None
        app_spec = {}

        # If there's no job, but the state is valid, then that (likely) means the job was started
        # by either running AppManager.run_app directly without cell_id or run_id info, or that
        # it was started outside of the biokbase.narrative.jobs setup. This could be done through
        # direct calls to EE2.
        #
        # This could also be triggered by manually looking up job state for some job that doesn't
        # exist in the Narrative. Which is borderline, but still probably ok.
        if job is None and state is not None:
            state.update({
                "cell_id": None,
                "run_id": None,
            })
            return {
                'state': state,
                'spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        if state is None:
            kblogging.log_event(self._log, "lookup_job_status.error", {
                'err':
                'Unable to get job state for job {}'.format(job.job_id)
            })
            state = self._create_error_state(
                "Unable to find current job state. Please try again later, or contact KBase.",
                "Unable to return job state",
                -1,
                cell_id=job.cell_id,
                run_id=job.run_id,
                job_id=job.job_id)

        if state.get('finished'):
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error",
                                    {'err': str(e)})
                state.update({
                    "status": "error",
                    "errormsg": "Unable to build output viewer parameters!",
                    "error": {
                        "code": getattr(new_e, "code", -1),
                        "source": getattr(new_e, "source", "JobManager"),
                        "name": "App Error",
                        "message": "Unable to build output viewer parameters",
                        "error": (
                            "Unable to generate App output viewer!\n"
                            "The App appears to have completed successfully,\n"
                            "but we cannot construct its output viewer.\n"
                            "Please contact the developer of this App "
                            "for assistance."
                        )
                    }
                })

        state.update({
            "child_jobs": self._child_job_states(
                state.get("sub_jobs", []),
                job.meta.get("batch_app"),
                job.meta.get("batch_tag")),
            "run_id": job.run_id,
            "cell_id": job.cell_id
        })
        if "batch_size" in job.meta:
            state.update({"batch_size": job.meta["batch_size"]})
        return {
            "state": state,
            "spec": app_spec,
            "widget_info": widget_info,
            "owner": job.owner,
            "listener_count": self._running_jobs[job.job_id]["refresh"]
        }

    def _child_job_states(self, sub_job_list, app_id, app_tag):
        """
        Fetches state for all jobs in the list. These are expected to be child jobs, with no actual Job object associated.
        So if they're done, we need to do the output mapping out of band.
        But the check_jobs call with params will return the app id. So that helps.

        app_id = the id of the app that all the child jobs are running
            (format: module/method, like "MEGAHIT/run_megahit")
        app_tag = one of "release", "beta", "dev"
        (the above two aren't stored with the subjob metadata, and won't be
        until we build some more on KBParallel - I want to lobby for pushing
        toward just starting everything up at once from here and letting
        HTCondor deal with allocation)
        sub_job_list = list of ids of jobs to look up
        """
        if not sub_job_list:
            return []

        sub_job_list = sorted(sub_job_list)

        job_states = clients.get('execution_engine2').check_jobs({
            'job_ids': sub_job_list,
            'exclude_fields': EXCLUDED_JOB_STATE_FIELDS,
            'return_list': 0
        })
        child_job_states = list()

        for job_id in sub_job_list:
            job_state = job_states.get(job_id, {})
            params = job_state.get('job_input', {}).get('params', [])
            # if it's error, get the error.
            if job_state.get('errormsg'):
                error = job_state
                error.update({'job_id': job_id})
                child_job_states.append(error)
                continue
            # if it's done, get the output mapping.
            status = job_state.get('status')
            if status == 'completed':
                try:
                    widget_info = Job.map_viewer_params(
                        job_state, params, app_id, app_tag)
                except ValueError:
                    widget_info = {}
                job_state.update({'widget_info': widget_info})
            child_job_states.append(job_state)
        return child_job_states

    def _construct_job_status_set(self, job_ids: list) -> dict:
        """
        Builds a set of job states for the list of job ids.
        """
        # if cached, use 'em.
        # otherwise, lookup.
        # do transform
        # cache terminal ones.
        # return all.
        if job_ids is None:
            job_ids = list(self._running_jobs.keys())
        if not isinstance(job_ids, list):
            raise ValueError("job_ids must be a list")

        job_states = dict()
        jobs_to_lookup = list()

        # Fetch from cache of terminated jobs, where available.
        # These are already post-processed and ready to return.
        for job_id in job_ids:
            if job_id in self._completed_job_states:
                job_states[job_id] = self._completed_job_states[job_id]
            else:
                jobs_to_lookup.append(job_id)

        fetched_states = dict()
        # Get the rest of states direct from EE2.
        if len(jobs_to_lookup):
            try:
                fetched_states = clients.get("execution_engine2").check_jobs({
                    "job_ids":
                    jobs_to_lookup,
                    "exclude_fields":
                    EXCLUDED_JOB_STATE_FIELDS,
                    "return_list":
                    0
                })
            except Exception as e:
                kblogging.log_event(self._log, "construct_job_status_set",
                                    {"err": str(e)})
        for job_id, state in fetched_states.items():
            revised_state = self._construct_job_status(self.get_job(job_id),
                                                       state)
            if revised_state["state"]["status"] in TERMINAL_STATES:
                self._completed_job_states[job_id] = revised_state
            job_states[job_id] = revised_state
        return job_states

    def _verify_job_parentage(self, parent_job_id, child_job_id):
        """
        Validate job relationships.
        1. Make sure parent exists, and the child id is in its list of sub jobs.
        2. If child doesn't exist, create it and add it to the list.
        If parent doesn't exist, or child isn't an actual child, raise an exception
        """
        if parent_job_id not in self._running_jobs:
            raise ValueError(
                'Parent job id {} not found, cannot validate child job {}.'.
                format(parent_job_id, child_job_id))
        if child_job_id not in self._running_jobs:
            parent_job = self.get_job(parent_job_id)
            parent_state = parent_job.state()
            if child_job_id not in parent_state.get('sub_jobs', []):
                raise ValueError(
                    'Child job id {} is not a child of parent job {}'.format(
                        child_job_id, parent_job_id))
            else:
                self._create_jobs([child_job_id])
                # injects its app id and version
                child_job = self.get_job(child_job_id)
                child_job.app_id = parent_job.meta.get('batch_app')
                child_job.tag = parent_job.meta.get('batch_tag', 'release')

    def lookup_job_info(self, job_id, parent_job_id=None):
        """
        Will raise a ValueError if job_id doesn't exist.
        Sends the info over the comm channel as this packet:
        {
            app_id: module/name,
            app_name: random string,
            job_id: string,
            job_params: dictionary
        }
        """
        # if parent_job is real, and job_id (the child) is not, just add it to the
        # list of running jobs and work as normal.
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        job = self.get_job(job_id)
        info = {
            'app_id': job.app_id,
            'app_name': job.app_spec()['info']['name'],
            'job_id': job_id,
            'job_params': job.inputs
        }
        return info

    def lookup_all_job_states(self, ignore_refresh_flag=False):
        """
        Fetches states for all running jobs.
        If ignore_refresh_flag is True, then returns states for all jobs this
        JobManager knows about (i.e. all jobs associated with the workspace).

        This returns them all as a dictionary, keyed on the job id.
        :param ignore_refresh_flag: boolean - if True, ignore the usual refresh state of the job.
            Even if the job is stopped, or completed, fetch and return its state from the service.
        """
        jobs_to_lookup = list()
        # grab the list of running job ids, so we don't run into update-while-iterating problems.
        for job_id in self._running_jobs.keys():
            if self._running_jobs[job_id]['refresh'] > 0 or ignore_refresh_flag:
                jobs_to_lookup.append(job_id)
        if len(jobs_to_lookup) > 0:
            return self._construct_job_status_set(jobs_to_lookup)
        else:
            return dict()
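
    # Editor's sketch (job id hypothetical): the return value is keyed on
    # job id, one _construct_job_status result per job:
    #
    #   jm = JobManager()
    #   states = jm.lookup_all_job_states()
    #   # -> {'<job_id>': {'state': {...}, 'widget_info': ...,
    #   #                  'owner': ..., ...}, ...}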

    def register_new_job(self, job: Job) -> None:
        """
        Registers a new Job with the manager - should only be invoked when a new Job gets
        started. This stores the Job locally and pushes it over the comm channel to the
        Narrative where it gets serialized.

        Parameters:
        -----------
        job : biokbase.narrative.jobs.job.Job object
            The new Job that was started.
        """
        kblogging.log_event(self._log, "register_new_job",
                            {"job_id": job.job_id})
        self._running_jobs[job.job_id] = {'job': job, 'refresh': 0}

    def get_job(self, job_id):
        """
        Returns a Job with the given job_id.
        Raises a ValueError if not found.
        """
        if job_id in self._running_jobs:
            return self._running_jobs[job_id]["job"]
        else:
            raise ValueError(f"No job present with id {job_id}")

    def get_job_logs(self,
                     job_id: str,
                     parent_job_id: str = None,
                     first_line: int = 0,
                     num_lines: int = None,
                     latest_only: bool = False) -> tuple:
        """
        Raises a ValueError if the job_id doesn't exist or is not present.
        :param job_id: str - the job id from the execution engine
        :param parent_job_id: if the job is a child job, this is its parent (optional)
        :param first_line: int - the first line to be requested by the log. 0-indexed. If < 0,
            this will be set to 0
        :param num_lines: int - the maximum number of lines to return.
            if < 0, will be reset to 0.
            if None, then will not be considered, and all lines are returned.
        :param latest_only: bool - if True, will only return the most recent max_lines
            of logs. This overrides the first_line parameter if set to True. So if the call made
            is get_job_logs(id, first_line=0, num_lines=5, latest_only=True), and there are 100
            log lines available, then lines 96-100 will be returned.
        :returns: 3-tuple. elements in order:
            int - the first line returned
            int - the number of logs lines currently available for that job
            list - the lines themselves, fresh from the server. These are all tiny dicts
                with key "is_error" (either 0 or 1) and "line" - the log line string

        """
        job = self.get_job(job_id)

        if first_line < 0:
            first_line = 0
        if num_lines is not None and num_lines < 0:
            num_lines = 0

        try:
            if latest_only:
                (max_lines, logs) = job.log()
                if num_lines is not None and max_lines > num_lines:
                    first_line = max_lines - num_lines
                    logs = logs[first_line:]
            else:
                (max_lines, logs) = job.log(first_line=first_line,
                                            num_lines=num_lines)

            return (first_line, max_lines, logs)
        except Exception as e:
            raise transform_job_exception(e)
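
    # Editor's sketch of the latest_only semantics above (job id and line
    # counts hypothetical):
    #
    #   jm = JobManager()
    #   first, total, lines = jm.get_job_logs('<job_id>', num_lines=5,
    #                                         latest_only=True)
    #   # With 100 log lines available, first == 95 (0-indexed) and lines
    #   # holds the final five entries, each {'is_error': 0|1, 'line': str}.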

    def cancel_job(self, job_id: str, parent_job_id: str = None) -> None:
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        If the job_id is None or not found in this Narrative, a ValueError is raised.
        This then checks the job to see if it is already canceled/finished,
        then attempts to cancel it.
        If either of those steps fail, a NarrativeException is raised.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if not parent_job_id and job_id not in self._running_jobs:
            raise ValueError(f"No job present with id {job_id}")

        try:
            cancel_status = clients.get(
                "execution_engine2").check_job_canceled({"job_id": job_id})
            if cancel_status.get("finished", 0) == 1 or cancel_status.get(
                    "canceled", 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise transform_job_exception(e)

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        if not parent_job_id:
            is_refreshing = self._running_jobs[job_id].get('refresh', 0)
            self._running_jobs[job_id]['refresh'] = 0
            self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('execution_engine2').cancel_job({'job_id': job_id})
        except Exception as e:
            raise transform_job_exception(e)
        finally:
            if not parent_job_id:
                self._running_jobs[job_id]['refresh'] = is_refreshing
                del self._running_jobs[job_id]['canceling']

    def get_job_state(self, job_id: str, parent_job_id: str = None) -> dict:
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        if job_id is None or job_id not in self._running_jobs:
            raise ValueError(f"No job present with id {job_id}")
        if job_id in self._completed_job_states:
            return self._completed_job_states[job_id]
        job = self.get_job(job_id)
        state = self._construct_job_status(job, job.state())
        if state.get('state', {}).get('status') == 'completed':
            self._completed_job_states[job_id] = state
        return state

    def modify_job_refresh(self,
                           job_id: str,
                           update_adjust: int,
                           parent_job_id: str = None) -> None:
        """
        Modifies how many things want to get the job updated.
        If this sets the current "refresh" key to be less than 0, it gets reset to 0.
        If the job isn't present or None, a ValueError is raised.
        """
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        if job_id is None or job_id not in self._running_jobs:
            raise ValueError(f"No job present with id {job_id}")
        self._running_jobs[job_id]["refresh"] += update_adjust
        if self._running_jobs[job_id]["refresh"] < 0:
            self._running_jobs[job_id]["refresh"] = 0
Exemplo n.º 11
0
class AppManager(object):
    """
    The main class for managing how KBase apps get run. This contains functions
    for showing app descriptions, their usage (how to invoke various
    parameters), and, ultimately, for running the app.

    A typical flow might be like this.
    am = AppManager()
    am.available_apps()
        # show the set of apps with a brief description of each.
    am.app_usage(app_id)
        # show how to use a app and set its parameters.
    job = am.run_app(app_id, {'input1': value1, 'input2': value2, ...})
        # run an app with given inputs.
    """

    __instance = None

    __MAX_TOKEN_NAME_LEN = 30

    spec_manager = specmanager.SpecManager()
    _log = kblogging.get_logger(__name__)
    _comm = None
    viewer_count = 1

    def __new__(cls):
        if AppManager.__instance is None:
            AppManager.__instance = object.__new__(cls)
            AppManager.__instance._comm = None
        return AppManager.__instance

    def reload(self):
        """
        Reloads all app specs into memory from the App Catalog.
        Any outputs of app_usage, app_description, or available_apps
        should be run again after the update.
        """
        self.spec_manager.reload()

    def app_usage(self, app_id, tag="release"):
        """
        This shows the list of inputs and outputs for a given app with a given
        tag. By default, this is done in a pretty HTML way, but the output can
        be
        wrapped in str() to show a bare formatted string.

        If either the app_id is unknown, or isn't found with the given release
        tag, or if the tag is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev
            (default=release)
        """
        return self.spec_manager.app_usage(app_id, tag)
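
    # Editor's note, e.g.: print(str(am.app_usage('MegaHit/run_megahit')))
    # renders the bare formatted string described above.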

    def app_description(self, app_id, tag="release"):
        """
        Returns the app description in a printable HTML format.

        If either the app_id is unknown, or isn't found with the given release
        tag, or if the tag is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev
            (default=release)
        """
        return self.spec_manager.app_description(app_id, tag)

    def available_apps(self, tag="release"):
        """
        Lists the set of available apps for a given tag in a simple table.
        If the tag is not found, a ValueError will be raised.

        Parameters:
        -----------
        tag : Which version of the list of apps to view - either release, beta,
            or dev (default=release)

        """
        return self.spec_manager.available_apps(tag)

    @_app_error_wrapper
    def run_legacy_batch_app(
        self,
        app_id,
        params,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
        dry_run=False,
    ):
        if params is None:
            params = []
        ws_id = strict_system_variable("workspace_id")
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        # A list of lists of UPAs, used for each subjob.
        batch_ws_upas = []
        # The list of actual input values, post-mapping.
        batch_run_inputs = []

        for param_set in params:
            spec_params_map = dict(
                (spec_params[i]["id"], spec_params[i]) for i in range(len(spec_params))
            )
            batch_ws_upas.append(extract_ws_refs(app_id, tag, spec_params, param_set))
            batch_run_inputs.append(
                self._map_inputs(
                    spec["behavior"]["kb_service_input_mapping"],
                    param_set,
                    spec_params_map,
                )
            )

        service_method = spec["behavior"]["kb_service_method"]
        service_name = spec["behavior"]["kb_service_name"]
        service_ver = spec["behavior"].get("kb_service_version", None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        job_meta = {
            "tag": BATCH_APP["TAG"],
            "batch_app": app_id,
            "batch_tag": tag,
            "batch_size": len(params),
        }
        if cell_id is not None:
            job_meta["cell_id"] = cell_id
        if run_id is not None:
            job_meta["run_id"] = run_id

        # Now put these all together in a way that can be sent to the batch processing app.
        batch_params = [
            {
                "module_name": service_name,
                "method_name": service_method,
                "service_ver": service_ver,
                "wsid": ws_id,
                "meta": job_meta,
                "batch_params": [
                    {
                        "params": batch_run_inputs[i],
                        "source_ws_objects": batch_ws_upas[i],
                    }
                    for i in range(len(batch_run_inputs))
                ],
            }
        ]

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(app_id)
        job_meta["token_id"] = agent_token["id"]

        # This is the input set for ee2.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            "app_id": BATCH_APP["APP_ID"],
            "meta": job_meta,
            "method": BATCH_APP["METHOD"],
            "params": batch_params,
            "service_ver": BATCH_APP["VERSION"],
            "wsid": ws_id,
        }

        # if we're doing a dry run, just return the inputs that we made.
        if dry_run:
            return job_runner_inputs

        # Log that we're trying to run a job...
        log_info = {
            "app_id": app_id,
            "tag": BATCH_APP["TAG"],
            "version": service_ver,
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_batch_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_batch_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job.from_job_id(
            job_id,
            extra_data={
                # this data is not preserved in the ee2 record
                "batch_app": app_id,
                "batch_tag": tag,
                "batch_size": len(params),
            },
        )

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "job_id": job_id,
            },
        )
        JobManager().register_new_job(new_job, refresh=False)
        if cell_id is None:
            return new_job
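
    # A hedged sketch (hypothetical values, not from a real run) of the
    # payload this builder produces for two parameter sets:
    #
    #   batch_params = [{
    #       "module_name": "MegaHit",
    #       "method_name": "run_megahit",
    #       "service_ver": "1.0.0",
    #       "wsid": 12345,
    #       "meta": {"tag": BATCH_APP["TAG"], "batch_size": 2, ...},
    #       "batch_params": [
    #           {"params": [...], "source_ws_objects": [...]},
    #           {"params": [...], "source_ws_objects": [...]},
    #       ],
    #   }]
    #
    # job_runner_inputs then wraps that list together with the batch runner
    # app's own id, method, and version before it is handed to ee2.run_job.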

    @_app_error_wrapper
    def run_app(
        self,
        app_id,
        params,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
        dry_run=False,
    ):
        """
        Attempts to run the app.
        If this is given a cell_id, it returns None; otherwise, it returns a
        Job object with the running app info.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - this is the dictionary of parameters to be used with the app.
                 They can be found by using the app_usage function. If any
                 non-optional parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.

        Example:
        --------
        run_app('MegaHit/run_megahit',
                {
                    'read_library_name' : 'My_PE_Library',
                    'output_contigset_name' : 'My_Contig_Assembly'
                },
                version='>=1.0.0'
        )
        """
        if params is None:
            params = {}
        ws_id = strict_system_variable("workspace_id")
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        job_runner_inputs = self._build_run_job_params(
            spec, tag, params, version, cell_id, run_id, ws_id
        )

        if dry_run:
            return job_runner_inputs

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(app_id)
        job_runner_inputs["meta"]["token_id"] = agent_token["id"]

        # Log that we're trying to run a job...
        log_info = {
            "app_id": app_id,
            "tag": tag,
            "version": job_runner_inputs["service_ver"],
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job.from_job_id(job_id)

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "job_id": job_id,
            },
        )
        JobManager().register_new_job(new_job, refresh=False)
        if cell_id is None:
            return new_job

    @_app_error_wrapper
    def run_app_batch(
        self,
        app_info: list,
        cell_id: str = None,
        run_id: str = None,
        dry_run: bool = False,
    ) -> Union[dict, None]:
        """
        Attempts to run a batch of apps in bulk using the Execution Engine's run_app_batch endpoint.
        If a cell_id is provided, this sends various job messages over the comm channel, and returns None.
        If dry_run is True, this returns the structure that would be sent to EE2.run_job_batch

        Parameters:
        -----------
        app_info: this is a list of app information dictionaries. It's broken down such that a single app
            can have multiple sets of parameters, which could create multiple runs of that app.
            Each dictionary is expected to have the following keys:
            app_id: the id of the app to run
            tag: the app tag to run, one of release, beta, or dev
            version: (optional) the specified version to run, if not provided, this will be the most recent
                for that particular tag
            shared_params: (optional) any params to be shared by all runs of the app
            params: a list of at least one dictionary. Each dict contains the set of parameters to run the
                app once.
        cell_id: if provided, this should be a unique id for the Narrative cell that's running the app.
        run_id: if provided, this should be a unique id representing a Narrative cell's knowledge of
            that job.
        dry_run: if True, this won't start the job, but return the structure that would be sent to the
            KBase execution engine.

        Example:
        --------
        run_app_batch([{
            "app_id": "Some_module/reads_to_contigset",
            "tag": "release",
            "version": "1.0.0",
            "shared_params": {
                "filter_len": 500
            },
            "params": [
                {
                    "read_library_name" : "My_PE_Library",
                    "output_contigset_name" : "My_Contig_Assembly"
                }, {
                    "read_library_name": "Another_reads_library",
                    "output_contigset_name": "Another_contig_assembly"
                }
            ]
        }, {
            "app_id": "Some_module/contigset_to_genome",
            "tag": "release",
            "version": "1.1.0",
            "shared_params": {
                "filter_len": 1000,
                "taxon_id": 121212
            },
            "params": [
                {
                    "contigset": "My_contigset",
                    "genome_name": "My_genome"
                }
            ]
        }])
        """

        if not isinstance(app_info, list) or len(app_info) == 0:
            raise ValueError(
                "app_info must be a list with at least one set of app information"
            )
        batch_run_inputs = []
        ws_id = strict_system_variable("workspace_id")
        batch_params = {"wsid": ws_id}  # for EE2.run_job_batch
        log_app_info = []
        for info in app_info:
            self._validate_bulk_app_info(info)
            self._reconstitute_shared_params(info)
            app_id = info["app_id"]
            tag = info.get("tag", "release")
            version = info.get("version")
            spec = self._get_validated_app_spec(app_id, tag, True, version)
            for param_set in info["params"]:
                # will raise a ValueError if anything is wrong or missing
                # otherwise, will build a set of inputs for EE2.run_job
                batch_run_inputs.append(
                    self._build_run_job_params(
                        spec,
                        tag,
                        param_set,
                        version=version,
                        cell_id=cell_id,
                        run_id=run_id,
                    )
                )
            log_app_info.append(
                {
                    "app_id": app_id,
                    "tag": tag,
                    "version": version,
                    "num_jobs": len(batch_run_inputs),
                }
            )
        log_info = {
            "app_info": log_app_info,
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_app_batch", log_info)

        # if we're doing a dry run, stop here and return the setup
        if dry_run:
            return {"batch_run_params": batch_run_inputs, "batch_params": batch_params}

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(
            f"KBase_app_batch_{len(batch_run_inputs)}_apps"
        )

        # add the token id to the meta for all jobs
        for job_input in batch_run_inputs:
            job_input["meta"]["token_id"] = agent_token["id"]

        # run the job batch and get a batch_submission record
        try:
            batch_submission = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job_batch(batch_run_inputs, batch_params)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_job_bulk_error", log_info)
            raise transform_job_exception(e) from e

        batch_id = batch_submission["batch_id"]
        child_ids = batch_submission["child_job_ids"]

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job_batch",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "batch_id": batch_id,
                "child_job_ids": child_ids,
            },
        )

        child_jobs = Job.from_job_ids(child_ids, return_list=True)
        parent_job = Job.from_job_id(
            batch_id,
            children=child_jobs,
        )

        # TODO make a tighter design in the job manager for submitting a family of jobs
        for new_job in child_jobs:
            JobManager().register_new_job(new_job, refresh=False)
        JobManager().register_new_job(parent_job, refresh=False)

        if cell_id is None:
            return {"parent_job": parent_job, "child_jobs": child_jobs}

    def _validate_bulk_app_info(self, app_info: dict):
        """
        Validation consists of:
        1. must have "app_id" with format xyz/abc
        2. must have "tag" with "release, beta, dev" options
        3. optionally have "version" that's a string
        4. must have "params" that's a list of at least one dict.
        """
        malformed_params_error = "params must be a list of dicts of app parameters"

        # make sure we have all required keys
        required_keys = ["app_id", "tag", "params"]
        for key in required_keys:
            if key not in app_info:
                raise ValueError(
                    f"app info must contain keys {', '.join(required_keys)}"
                )
        # make sure app is of the form "module/app"
        if (
            not isinstance(app_info["app_id"], str)
            or re.match(r"\S+\/\S+", app_info["app_id"]) is None
        ):
            raise ValueError("an app_id must be of the format module_name/app_name")
        # params must be a list with at least one item (even an empty dict)
        if not isinstance(app_info["params"], list) or len(app_info["params"]) == 0:
            raise ValueError(malformed_params_error)
        # each item must be a dict
        for params in app_info["params"]:
            if not isinstance(params, dict):
                raise ValueError(malformed_params_error)
        # make sure tag is an allowed item
        allowed_tags = ["release", "beta", "dev"]
        if app_info["tag"] not in allowed_tags:
            raise ValueError(
                f"tag must be one of {', '.join(allowed_tags)}, not {app_info['tag']}"
            )
        # make sure version is a string, if present
        if "version" in app_info and not isinstance(app_info["version"], str):
            raise ValueError(
                f"an app version must be a string, not {app_info['version']}"
            )
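
    # A minimal usage sketch (hypothetical values): the call below passes
    # validation, while dropping "params" or using tag="stable" would raise
    # a ValueError.
    #
    #   self._validate_bulk_app_info({
    #       "app_id": "Some_module/some_app",
    #       "tag": "release",
    #       "params": [{}],  # one (possibly empty) dict is enough
    #   })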

    def _reconstitute_shared_params(self, app_info_el: dict) -> None:
        """
        Mutate each params dict to include any shared_params
        app_info_el is structured like:
        {
            "app_id": "Some_module/reads_to_contigset",
            "tag": "release",
            "version": "1.0.0",
            "shared_params": {
                "filter_len": 500
            },
            "params": [
                {
                    "read_library_name" : "My_PE_Library",
                    "output_contigset_name" : "My_Contig_Assembly"
                }, {
                    "read_library_name": "Another_reads_library",
                    "output_contigset_name": "Another_contig_assembly"
                }
            ]
        }
        """
        if "shared_params" in app_info_el:
            shared_params = app_info_el.pop("shared_params")
            for param_set in app_info_el["params"]:
                for k, v in shared_params.items():
                    param_set.setdefault(k, v)
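
    # A minimal sketch of the merge behavior (hypothetical values): because
    # setdefault() is used, an explicit per-run value always wins over the
    # shared one.
    #
    #   info = {
    #       "shared_params": {"filter_len": 500},
    #       "params": [{"filter_len": 900}, {"read_library_name": "A"}],
    #   }
    #   self._reconstitute_shared_params(info)
    #   # info["params"] is now:
    #   #   [{"filter_len": 900},
    #   #    {"read_library_name": "A", "filter_len": 500}]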

    def _build_run_job_params(
        self,
        spec: dict,
        tag: str,
        param_set: dict,
        version: str = None,
        cell_id: str = None,
        run_id: str = None,
        ws_id: int = None,
    ) -> dict:
        """
        Builds the set of inputs for EE2.run_job and EE2.run_job_batch (RunJobParams) given a spec
        and set of inputs/parameters.

        Parameters:
        -----------
        spec: dict, an app spec
        tag: str, one of release, beta, dev
        param_set: dict, key-value pairs for each app input and parameter
        version: str, should be either a semantic version or git hash for the app to run
        cell_id: str, the cell id to associate with the job
        run_id: str, the run id to associate with the job
        ws_id: int, the workspace id to associate with the job

        Returns:
        --------
        This returns a dict with the following keys:
            method: the function to run
            service_ver: the version of the app to run
            params: the set of inputs, mapped to what the method expects to see
            app_id: the original id of the app to run
            wsid: the workspace id associated with the new job
            source_ws_objects: the UPAs for any workspace objects involved with running the app, if any
            meta: key-value pairs, usually containing:
                cell_id (if not None),
                run_id (if not None),
                tag
        """
        # get the app id from the spec
        app_id = spec["info"]["id"]

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        spec_params_map = {p["id"]: p for p in spec_params}
        ws_input_refs = extract_ws_refs(app_id, tag, spec_params, param_set)
        input_vals = self._map_inputs(
            spec["behavior"]["kb_service_input_mapping"], param_set, spec_params_map
        )

        service_method = spec["behavior"]["kb_service_method"]
        service_name = spec["behavior"]["kb_service_name"]
        service_ver = spec["behavior"].get("kb_service_version", None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + "." + service_method
        job_meta = {"tag": tag}
        if cell_id is not None:
            job_meta["cell_id"] = cell_id
        if run_id is not None:
            job_meta["run_id"] = run_id

        # This is the input set for EE2.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            "method": function_name,
            "service_ver": service_ver,
            "params": input_vals,
            "app_id": app_id,
            "meta": job_meta,
        }
        if ws_id is not None:
            job_runner_inputs["wsid"] = ws_id
        if len(ws_input_refs) > 0:
            job_runner_inputs["source_ws_objects"] = ws_input_refs

        return job_runner_inputs
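
    # A hedged sketch of the returned RunJobParams structure (all values
    # hypothetical, not from a real spec):
    #
    #   {
    #       "method": "MegaHit.run_megahit",
    #       "service_ver": "1.0.0",
    #       "params": [{"workspace_name": "my_ws", "read_library_ref": "1/2/3"}],
    #       "app_id": "MegaHit/run_megahit",
    #       "meta": {"tag": "release", "cell_id": "abc", "run_id": "def"},
    #       "wsid": 12345,
    #       "source_ws_objects": ["1/2/3"],
    #   }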

    @_app_error_wrapper
    def run_local_app(
        self,
        app_id,
        params,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
        widget_state=None,
    ):
        """
        Attempts to run a local app. These do not return a Job object, but just
        the result of the app. In most cases, this will be a Javascript display
        of the result, but could be anything.

        If the app_spec looks like it makes a service call, then this raises a
        ValueError. Otherwise, it validates each parameter in params against
        the app spec, executes the app, and returns the result.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'view_expression_profile'
        params - the dictionary of parameters for the app. Should be key-value
                 pairs where the keys are strings. If any non-optional
                 parameters are missing, an informative string will be printed.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.

        Example:
        run_local_app('NarrativeViewers/view_expression_profile',
                      {
                          "input_expression_matrix": "MyMatrix",
                          "input_gene_ids": "1234"
                      },
                      version='0.0.1')
        """
        spec = self._get_validated_app_spec(app_id, tag, False, version=version)

        # Here, we just deal with two behaviors:
        # 1. None of the above - it's a viewer.
        # 2. ***TODO*** python_class / python_function.
        #    Import and exec the python code.

        # for now, just map the inputs to outputs.
        # First, validate.
        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)
        (params, ws_refs) = validate_parameters(app_id, tag, spec_params, params)

        # Log that we're trying to run a job...
        log_info = {
            "app_id": app_id,
            "tag": tag,
            "username": system_variable("user_id"),
            "ws": system_variable("workspace"),
        }
        kblogging.log_event(self._log, "run_local_app", log_info)

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "success",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
            },
        )

        (output_widget, widget_params) = map_outputs_from_state([], params, spec)

        # All a local app does is route the inputs to outputs through the
        # spec's mapping, and then feed that into the specified output widget.
        wm = WidgetManager()
        if widget_state is not None:
            return wm.show_advanced_viewer_widget(
                output_widget, widget_params, widget_state, cell_id=cell_id, tag=tag
            )
        else:
            return wm.show_output_widget(
                output_widget, widget_params, cell_id=cell_id, tag=tag
            )

    def run_local_app_advanced(
        self,
        app_id,
        params,
        widget_state,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
    ):
        return self.run_local_app(
            app_id,
            params,
            widget_state=widget_state,
            tag=tag,
            version=version,
            cell_id=cell_id,
            run_id=run_id,
        )

    def _get_validated_app_spec(self, app_id, tag, is_long, version=None):
        if (
            version is not None
            and tag != "release"
            and re.match(r"\d+\.\d+\.\d+", version) is not None
        ):
            raise ValueError(
                "Semantic versions only apply to released app modules. "
                + "You can use a Git commit hash instead to specify a "
                + "version."
            )
        self.spec_manager.check_app(app_id, tag, raise_exception=True)
        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)
        if "behavior" not in spec:
            raise ValueError("This app appears invalid - it has no defined behavior")
        if "script_module" in spec["behavior"] or "script_name" in spec["behavior"]:
            # It's an old NJS script. These don't work anymore.
            raise ValueError(
                "This app relies on a service that is now obsolete. Please contact "
                + "the administrator."
            )
        if is_long and "kb_service_input_mapping" not in spec["behavior"]:
            raise ValueError(
                "This app does not appear to be a long-running "
                + "job! Please use 'run_local_app' to start this "
                + "instead."
            )
        return spec

    def _map_group_inputs(self, value, spec_param, spec_params):
        if isinstance(value, list):
            return [self._map_group_inputs(v, spec_param, spec_params) for v in value]
        elif value is None:
            return None
        else:
            mapped_value = {}
            id_map = spec_param.get("id_mapping", {})
            for param_id in id_map:
                # ensure that the param referenced in the group param list
                # exists in the spec.
                # NB: This should really never happen if the sdk registration
                # process validates them.
                if param_id not in spec_params:
                    msg = "Unknown parameter id in group mapping: " + param_id
                    raise ValueError(msg)
            for param_id in value:
                target_key = id_map.get(param_id, param_id)
                # Sets either the raw value, or, if the parameter is an object
                # reference, the full object reference (see the method).
                if value[param_id] is None:
                    target_val = None
                else:
                    target_val = resolve_ref_if_typed(
                        value[param_id], spec_params[param_id]
                    )

                mapped_value[target_key] = target_val
            return mapped_value
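
    # A minimal sketch of the group mapping (hypothetical spec): with
    # spec_param = {"id_mapping": {"len": "filter_len"}} and
    # value = {"len": 500}, the group maps to {"filter_len": 500}. Keys
    # with no id_mapping entry pass through unchanged, and typed object
    # references are resolved by resolve_ref_if_typed along the way.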

    def _map_inputs(self, input_mapping, params, spec_params):
        """
        Maps the dictionary of parameters and inputs based on rules provided in
        the input_mapping. This iterates over the list of input_mappings, and
        uses them as a filter to apply to each parameter.

        Returns a list of inputs that can be passed directly to EE2.run_job.

        input_mapping is a list of dicts, as defined by
        NarrativeMethodStore.ServiceMethodInputMapping.
        params is a dict of key-value-pairs, each key is the input_parameter
        field of some parameter.
        """
        inputs_dict = {}
        for p in input_mapping:
            # 2 steps - figure out the proper value, then figure out the
            # proper position. value first!
            p_value = None
            input_param_id = None
            if "input_parameter" in p:
                input_param_id = p["input_parameter"]
                p_value = params.get(input_param_id, None)
                if spec_params[input_param_id].get("type", "") == "group":
                    p_value = self._map_group_inputs(
                        p_value, spec_params[input_param_id], spec_params
                    )
                # turn empty strings into None
                if isinstance(p_value, str) and len(p_value) == 0:
                    p_value = None
            elif "narrative_system_variable" in p:
                p_value = system_variable(p["narrative_system_variable"])
            if "constant_value" in p and p_value is None:
                p_value = p["constant_value"]
            if "generated_value" in p and p_value is None:
                p_value = self._generate_input(p["generated_value"])

            spec_param = None
            if input_param_id:
                spec_param = spec_params[input_param_id]
            p_value = transform_param_value(
                p.get("target_type_transform"), p_value, spec_param
            )

            # get position!
            arg_position = p.get("target_argument_position", 0)
            target_prop = p.get("target_property", None)
            if target_prop is not None:
                final_input = inputs_dict.get(arg_position, {})
                if "/" in target_prop:
                    # This is the case when slashes in target_prop separate
                    # elements in nested maps. Escaped slashes are treated as
                    # literals (lone backslashes should be escaped as well).
                    bck_slash = "\u244A"
                    fwd_slash = "\u20EB"
                    temp_string = target_prop.replace("\\\\", bck_slash)
                    temp_string = temp_string.replace("\\/", fwd_slash)
                    temp_path = []
                    for part in temp_string.split("/"):
                        part = part.replace(bck_slash, "\\")
                        part = part.replace(fwd_slash, "/")
                        temp_path.append(part.encode("ascii", "ignore").decode("ascii"))
                    temp_map = final_input
                    temp_key = None
                    # We're going along the path and creating intermediate
                    # dictionaries.
                    for temp_path_item in temp_path:
                        if temp_key:
                            if temp_key not in temp_map:
                                temp_map[temp_key] = {}
                            temp_map = temp_map[temp_key]
                        temp_key = temp_path_item
                    # temp_map points to deepest nested map now, temp_key is
                    # the last item in the path
                    temp_map[temp_key] = p_value
                else:
                    final_input[target_prop] = p_value
                inputs_dict[arg_position] = final_input
            else:
                inputs_dict[arg_position] = p_value

        return [inputs_dict[k] for k in sorted(inputs_dict)]
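
    # A worked sketch of the target_property path handling (hypothetical
    # mapping): an input_mapping entry like
    #
    #   {"input_parameter": "genome", "target_property": "params/input/ref"}
    #
    # with params = {"genome": "1/2/3"} yields
    #
    #   [{"params": {"input": {"ref": "1/2/3"}}}]
    #
    # Unescaped slashes create nested dicts; "\/" and "\\" are temporarily
    # swapped to sentinel characters so escaped separators survive the split.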

    def _generate_input(self, generator):
        """
        Generates an input value using rules given by
        NarrativeMethodStore.AutoGeneratedValue.
        generator - dict
            has 3 optional properties:
            prefix - if present, is prepended to the generated string.
            symbols - if present is the number of symbols to autogenerate (if
                      not present, default=8)
            suffix - if present, is appended to the generated string.
        So, if generator is None or an empty dict, returns an 8-symbol string.
        """
        if generator is None:
            generator = {}
        symbols = 8
        if "symbols" in generator:
            try:
                symbols = int(generator["symbols"])
            except BaseException:
                raise ValueError(
                    'The "symbols" input to the generated value must be an '
                    + "integer > 0!"
                ) from None
        if symbols < 1:
            raise ValueError("Must have at least 1 symbol to randomly generate!")
        ret = "".join([chr(random.randrange(0, 26) + ord("A")) for _ in range(symbols)])
        if "prefix" in generator:
            ret = str(generator["prefix"]) + ret
        if "suffix" in generator:
            ret = ret + str(generator["suffix"])
        return ret
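
    # A minimal usage sketch (hypothetical generator): with
    #
    #   {"prefix": "obj_", "symbols": 4, "suffix": "_out"}
    #
    # this returns something like "obj_QXZA_out" - four random uppercase
    # letters wrapped by the prefix and suffix. None, {}, or a missing
    # "symbols" key yields eight random letters.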

    def _send_comm_message(self, msg_type, content):
        JobComm().send_comm_message(msg_type, content)

    def _get_agent_token(self, name: str) -> Dict[str, str]:
        """
        Retrieves an agent token from the Auth service with a formatted name.
        This prepends "KBApp_" to the name for filtering, and trims to make sure the name
        isn't longer than it should be.
        """
        token_name = f"KBApp_{name}"
        token_name = token_name[: self.__MAX_TOKEN_NAME_LEN]
        return auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
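
    # A minimal sketch of the naming rule (hypothetical name): a call with
    # name="batch_MegaHit" requests an agent token named
    # "KBApp_batch_MegaHit", truncated to __MAX_TOKEN_NAME_LEN characters
    # if necessary.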
Exemplo n.º 12
0
class JobManager(object):
    """
    The KBase Job Manager class. This handles all jobs and makes their status available.
    On status lookups, it feeds the results to the KBaseJobs channel that the front end
    listens to.
    """
    __instance = None

    # keys = job_id, values = { refresh = T/F, job = Job object }
    _running_jobs = dict()

    _lookup_timer = None
    _comm = None
    _log = kblogging.get_logger(__name__)
    # TODO: should this not be done globally?
    _running_lookup_loop = False

    def __new__(cls):
        if JobManager.__instance is None:
            JobManager.__instance = object.__new__(cls)
        return JobManager.__instance

    def initialize_jobs(self):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params on each of those (also gets app_id)
        4. start the status lookup loop.
        """

        ws_id = system_variable('workspace_id')
        try:
            nar_jobs = clients.get('user_and_job_state').list_jobs2({
                'authstrat': 'kbaseworkspace',
                'authparams': [str(ws_id)]
            })
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get initial jobs list',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'user_and_job_state'
            }
            self._send_comm_message('job_init_err', error)
            raise new_e

        for info in nar_jobs:
            job_id = info[0]
            user_info = info[1]
            job_meta = info[10]
            try:
                job_info = clients.get('job_service').get_job_params(job_id)[0]

                self._running_jobs[job_id] = {
                    'refresh': True,
                    'job': Job.from_state(job_id,
                                          job_info,
                                          user_info[0],
                                          app_id=job_info.get('app_id'),
                                          tag=job_meta.get('tag', 'release'),
                                          cell_id=job_meta.get('cell_id', None),
                                          run_id=job_meta.get('run_id', None))
                }
                
            except Exception as e:
                kblogging.log_event(self._log, 'init_error', {'err': str(e)})
                new_e = transform_job_exception(e)
                error = {
                    'error': 'Unable to get job info on initial lookup',
                    'job_id': job_id,
                    'message': getattr(new_e, 'message', 'Unknown reason'),
                    'code': getattr(new_e, 'code', -1),
                    'source': getattr(new_e, 'source', 'jobmanager'),
                    'name': getattr(new_e, 'name', type(e).__name__),
                    'service': 'job_service'
                }
                self._send_comm_message('job_init_lookup_err', error)
                raise new_e # should crash and burn on any of these.

        if not self._running_lookup_loop:
            # only keep one loop at a time in case this gets called again!
            if self._lookup_timer is not None:
                self._lookup_timer.cancel()
            self._running_lookup_loop = True
            self._lookup_job_status_loop()
        else:
            self._lookup_all_job_status()

    def list_jobs(self):
        """
        List all job ids, their info, and status in a quick HTML format.
        """
        try:
            status_set = list()
            for job_id in self._running_jobs:
                job = self._running_jobs[job_id]['job']
                job_state = job.state()
                job_params = job.parameters()
                job_state['app_id'] = job_params[0].get('app_id', 'Unknown App')
                job_state['owner'] = job.owner
                status_set.append(job_state)
            if not status_set:
                return "No running jobs!"
            status_set = sorted(status_set, key=lambda s: s['creation_time'])
            for status in status_set:
                status['creation_time'] = datetime.datetime.fromtimestamp(
                    status['creation_time'] / 1000
                ).strftime("%Y-%m-%d %H:%M:%S")
                exec_start = status.get('exec_start_time', None)
                if 'finish_time' in status:
                    finished = status.get('finish_time', None)
                    if finished is not None and exec_start:
                        delta = (datetime.datetime.fromtimestamp(finished / 1000.0)
                                 - datetime.datetime.fromtimestamp(exec_start / 1000.0))
                        # drop sub-second noise from the displayed run time
                        delta = delta - datetime.timedelta(microseconds=delta.microseconds)
                        status['run_time'] = str(delta)
                        status['finish_time'] = datetime.datetime.fromtimestamp(
                            finished / 1000
                        ).strftime("%Y-%m-%d %H:%M:%S")
                elif exec_start:
                    delta = (datetime.datetime.utcnow()
                             - datetime.datetime.utcfromtimestamp(exec_start / 1000.0))
                    delta = delta - datetime.timedelta(microseconds=delta.microseconds)
                    status['run_time'] = str(delta)
                else:
                    status['run_time'] = 'Not started'

            tmpl = """
            <table class="table table-bordered table-striped table-condensed">
                <tr>
                    <th>Id</th>
                    <th>Name</th>
                    <th>Submitted</th>
                    <th>Submitted By</th>
                    <th>Status</th>
                    <th>Run Time</th>
                    <th>Complete Time</th>
                </tr>
                {% for j in jobs %}
                <tr>
                    <td>{{ j.job_id|e }}</td>
                    <td>{{ j.app_id|e }}</td>
                    <td>{{ j.creation_time|e }}</td>
                    <td>{{ j.owner|e }}</td>
                    <td>{{ j.job_state|e }}</td>
                    <td>{{ j.run_time|e }}</td>
                    <td>{% if j.finish_time %}{{ j.finish_time|e }}{% else %}Incomplete{% endif %}</td>
                </tr>
                {% endfor %}
            </table>
            """
            return HTML(Template(tmpl).render(jobs=status_set))

        except Exception as e:
            kblogging.log_event(self._log, "list_jobs.error", {'err': str(e)})
            raise

    def get_jobs_list(self):
        """
        A convenience method for fetching an unordered list of all running Jobs.
        """
        return [j['job'] for j in self._running_jobs.values()]

    # def _get_existing_job(self, job_tuple):
    #     """
    #     creates a Job object from a job_id that already exists.
    #     If no job exists, raises an Exception.

    #     Parameters:
    #     -----------
    #     job_tuple : The expected 5-tuple representing a Job. The format is:
    #         (job_id, set of job inputs (as JSON), version tag, cell id that started the job, run id of the job)
    #     """

    #     # remove the prefix (if present) and take the last element in the split
    #     job_id = job_tuple[0].split(':')[-1]
    #     try:
    #         job_info = clients.get('job_service').get_job_params(job_id)[0]
    #         return Job.from_state(job_id, job_info, app_id=job_tuple[1], tag=job_tuple[2], cell_id=job_tuple[3], run_id=job_tuple[4])
    #     except Exception as e:
    #         kblogging.log_event(self._log, "get_existing_job.error", {'job_id': job_id, 'err': str(e)})
    #         raise

    def _construct_job_status(self, job_id):
        """
        Always creates a job status dictionary, even on failure.
        It'll embed error messages into the status if there are problems.
        """

        state = {}
        widget_info = None
        app_spec = {}

        job = self.get_job(job_id)
        if job is None:
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                    'message': 'Job does not exist',
                    'name': 'Job Error',
                    'code': -1,
                    'exception': {
                        'error_message': 'job not found in JobManager',
                        'error_type': 'ValueError',
                        'error_stacktrace': ''
                    }
                },
                'cell_id': None,
                'run_id': None
            }
            return {
                'state': state,
                'app_spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        try:
            app_spec = job.app_spec()
        except Exception as e:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

        try:
            state = job.state()
        except Exception as e:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

            new_e = transform_job_exception(e)
            e_type = type(e).__name__
            e_message = str(new_e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
            e_code = getattr(new_e, "code", -2)
            e_source = getattr(new_e, "source", "JobManager")

            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                    'message': 'Unable to return job state',
                    'name': 'Job Error',
                    'code': e_code,
                    'source': e_source,
                    'exception': {
                        'error_message': e_message,
                        'error_type': e_type,
                        'error_stacktrace': e_trace,
                    }
                },
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job_id
            }

        if state.get('finished', 0) == 1:
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
                state['job_state'] = 'error'
                state['error'] = {
                    'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                    'message': 'Unable to build output viewer parameters!',
                    'name': 'App Error',
                    'code': getattr(new_e, "code", -1),
                    'source': getattr(new_e, "source", "JobManager")
                }

        if 'canceling' in self._running_jobs[job_id]:
            state['job_state'] = 'canceling'

        return {'state': state,
                'app_spec': app_spec,
                'widget_info': widget_info,
                'owner': job.owner}


    def _lookup_job_status(self, job_id):
        """
        Will raise a ValueError if job_id doesn't exist.
        Sends the status over the comm channel as the usual job_status message.
        """
        status = self._construct_job_status(job_id)
        self._send_comm_message('job_status', status)

    def _lookup_all_job_status(self, ignore_refresh_flag=False):
        """
        Looks up status for all jobs.
        Once job info is acquired, it gets pushed to the front end over the
        'KBaseJobs' channel.
        """
        status_set = dict()
        # take a snapshot of the running job ids, so we don't run into
        # update-while-iterating problems.
        for job_id in list(self._running_jobs.keys()):
            if self._running_jobs[job_id]['refresh'] or ignore_refresh_flag:
                status_set[job_id] = self._construct_job_status(job_id)
        self._send_comm_message('job_status_all', status_set)

    def _lookup_job_status_loop(self):
        """
        Initialize a loop that will look up job info. This uses a Timer thread on a 10
        second loop to update things.
        """
        self._lookup_all_job_status()
        self._lookup_timer = threading.Timer(10, self._lookup_job_status_loop)
        self._lookup_timer.start()

    def cancel_job_lookup_loop(self):
        """
        Cancels a running timer if one's still alive.
        """
        if self._lookup_timer:
            self._lookup_timer.cancel()
            self._lookup_timer = None
        self._running_lookup_loop = False

    def register_new_job(self, job):
        """
        Registers a new Job with the manager - should only be invoked when a new Job gets
        started. This stores the Job locally and pushes it over the comm channel to the
        Narrative where it gets serialized.

        Parameters:
        -----------
        job : biokbase.narrative.jobs.job.Job object
            The new Job that was started.
        """
        self._running_jobs[job.job_id] = {'job': job, 'refresh': True}
        # push it forward! create a new_job message.
        self._lookup_job_status(job.job_id)
        self._send_comm_message('new_job', {})

    def get_job(self, job_id):
        """
        Returns a Job with the given job_id.
        Raises a ValueError if not found.
        """
        if job_id in self._running_jobs:
            return self._running_jobs[job_id]['job']
        else:
            raise ValueError('No job present with id {}'.format(job_id))

    def _handle_comm_message(self, msg):
        """
        Handles comm messages that come in from the other end of the KBaseJobs channel.
        All messages (of any use) should have a 'request_type' property.
        Possible types:
        * all_status
            refresh all jobs that are flagged to be looked up. Will send a
            message back with all lookup status.
        * job_status
            refresh the single job given in the 'job_id' field. Sends a message
            back with that single job's status, or an error message.
        * stop_update_loop
            stop the running refresh loop, if there's one going (might be
            one more pass, depending on the thread state)
        * start_update_loop
            reinitialize the refresh loop.
        * stop_job_update
            flag the given job id (should be an accompanying 'job_id' field) that the front
            end knows it's in a terminal state and should no longer have its status looked
            up in the refresh cycle.
        * start_job_update
            remove the flag that gets set by stop_job_update (needs an accompanying 'job_id'
            field)
        """
        
        if 'request_type' in msg['content']['data']:
            r_type = msg['content']['data']['request_type']
            job_id = msg['content']['data'].get('job_id', None)
            if job_id is not None and job_id not in self._running_jobs:
                # If it's not a real job, just silently ignore the request.
                # Maybe return an error? Yeah. Let's do that.
                # self._send_comm_message('job_comm_error', {'job_id': job_id, 'message': 'Unknown job id', 'request_type': r_type})
                # TODO: perhaps we should implement request/response here. All we really need is to thread a message
                # id through
                self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'request_type': r_type})
                return

            if r_type == 'all_status':
                self._lookup_all_job_status(ignore_refresh_flag=True)

            elif r_type == 'job_status':
                if job_id is not None:
                    self._lookup_job_status(job_id)

            elif r_type == 'stop_update_loop':
                if self._lookup_timer is not None:
                    self._lookup_timer.cancel()

            elif r_type == 'start_update_loop':
                self._lookup_job_status_loop()

            elif r_type == 'stop_job_update':
                if job_id is not None:
                    self._running_jobs[job_id]['refresh'] = False

            elif r_type == 'start_job_update':
                if job_id is not None:
                    self._running_jobs[job_id]['refresh'] = True

            elif r_type == 'delete_job':
                if job_id is not None:
                    try:
                        self.delete_job(job_id)
                    except Exception as e:
                        self._send_comm_message('job_comm_error', {'message': str(e), 'request_type': r_type, 'job_id': job_id})

            elif r_type == 'cancel_job':
                if job_id is not None:
                    try:
                        self.cancel_job(job_id)
                    except Exception as e:
                        self._send_comm_message('job_comm_error', {'message': str(e), 'request_type': r_type, 'job_id': job_id})

            elif r_type == 'job_logs':
                if job_id is not None:
                    first_line = msg['content']['data'].get('first_line', 0)
                    num_lines = msg['content']['data'].get('num_lines', None)
                    self._get_job_logs(job_id, first_line=first_line, num_lines=num_lines)
                else:
                    raise ValueError('Need a job id to fetch job logs!')

            elif r_type == 'job_logs_latest':
                if job_id is not None:
                    num_lines = msg['content']['data'].get('num_lines', None)
                    self._get_latest_job_logs(job_id, num_lines=num_lines)

            else:
                self._send_comm_message('job_comm_error', {'message': 'Unknown message', 'request_type': r_type})
                raise ValueError('Unknown KBaseJobs message "{}"'.format(r_type))
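
    # A hedged sketch of an incoming request (hypothetical job id), as
    # delivered by ipykernel's Comm machinery:
    #
    #   msg = {'content': {'data': {
    #       'request_type': 'job_status',
    #       'job_id': 'some_job_id',
    #   }}}
    #
    # _handle_comm_message(msg) would then call
    # self._lookup_job_status('some_job_id').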

    def _get_latest_job_logs(self, job_id, num_lines=None):
        job = self.get_job(job_id)
        if job is None:
            raise ValueError('job "{}" not found while fetching logs!'.format(job_id))

        (max_lines, logs) = job.log()

        first_line = 0
        if num_lines is not None and max_lines > num_lines:
            first_line = max_lines - num_lines
            logs = logs[first_line:]
        self._send_comm_message('job_logs', {'job_id': job_id, 'first': first_line, 'max_lines': max_lines, 'lines': logs, 'latest': True})


    def _get_job_logs(self, job_id, first_line=0, num_lines=None):
        job = self.get_job(job_id)
        if job is None:
            raise ValueError('job "{}" not found!'.format(job_id))

        (max_lines, log_slice) = job.log(first_line=first_line, num_lines=num_lines)
        self._send_comm_message('job_logs', {'job_id': job_id, 'first': first_line, 'max_lines': max_lines, 'lines': log_slice, 'latest': False})

    def delete_job(self, job_id):
        """
        If the job_id doesn't exist, raises a ValueError.
        Attempts to delete a job, and cancels it first. If the job cannot be canceled,
        raises an exception. If it can be canceled but not deleted, it gets canceled, then raises
        an exception.
        """
        if job_id is None:
            raise ValueError('Job id required for deletion!')
        if job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'delete_job'})
            return
            # raise ValueError('Attempting to cancel a Job that does not exist!')

        # cancel_job and delete_job both raise on failure; let those
        # exceptions propagate to the caller.
        self.cancel_job(job_id)
        clients.get('user_and_job_state').delete_job(job_id)

        del self._running_jobs[job_id]
        self._send_comm_message('job_deleted', {'job_id': job_id})

    def cancel_job(self, job_id):
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        Raises an exception if the current user doesn't have permission to cancel the job.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'cancel_job'})
            return

        try:
            job = self.get_job(job_id)
            state = job.state()
            if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise ValueError('Unable to get Job state') from e

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        is_refreshing = self._running_jobs[job_id].get('refresh', False)
        self._running_jobs[job_id]['refresh'] = False
        self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('job_service').cancel_job({'job_id': job_id})
        except Exception as e:
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to cancel job',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'request_type': 'cancel_job',
                'job_id': job_id
            }
            self._send_comm_message('job_comm_error', error)
            raise
        finally:
            self._running_jobs[job_id]['refresh'] = is_refreshing
            del self._running_jobs[job_id]['canceling']

        # Rather than sending a separate 'job_canceled' message, trigger a
        # regular job-status message so the front end refreshes its view.
        self._lookup_job_status(job_id)

    def _send_comm_message(self, msg_type, content):
        """
        Sends an ipykernel.Comm message to the KBaseJobs channel with the given msg_type
        and content. These just get encoded into the message itself.
        """

        msg = {
            'msg_type': msg_type,
            'content': content
        }
        if self._comm is None:
            self._comm = Comm(target_name='KBaseJobs', data={})
            self._comm.on_msg(self._handle_comm_message)
        self._comm.send(msg)
Exemplo n.º 13
0
class JobComm:
    """
    The main JobComm channel. This is the kernel-side of the connection, and routes
    requests for job information from various app cells (or the front end in general)
    to the right function.

    This has a handle on the JobManager, which does the work of fetching job information
    and statuses.

    The JobComm officially exposes the channel for other things to use. Anything that
    needs to send messages about Jobs to the front end should use JobComm.send_comm_message.

    It also maintains the lookup loop thread. This is a threading.Timer that, after
    some interval, will look up the status of all running jobs. If there are no jobs to
    look up, this cancels itself.

    Allowed messages:
    * job_status - return the job state for a single job (requires a job_id)
    * job_status_all - return job state for all jobs in this Narrative.
    * job_info - return basic job info for a single job (requires a job_id)
    * start_job_update - tells the update loop to include a job when updating (requires a job_id)
    * stop_job_update - has the update loop not include a job when updating (requires a job_id)
    * cancel_job - cancels a running job, if it hasn't otherwise terminated (requires a job_id)
    * retry_job - retries a job (requires a job_id)
    * job_logs - sends job logs back over the comm channel (requires a job id)
    """

    # An instance of this class. It's meant to be a singleton, so this just gets created and
    # returned once.
    __instance = None

    # The kernel job comm channel that talks to the front end.
    _comm = None

    # The JobManager that actually manages things.
    _jm = None

    _msg_map = None
    _running_lookup_loop = False
    _lookup_timer = None
    _log = kblogging.get_logger(__name__)

    def __new__(cls):
        if JobComm.__instance is None:
            JobComm.__instance = object.__new__(cls)
        return JobComm.__instance

    def __init__(self):
        if self._comm is None:
            self._comm = Comm(target_name="KBaseJobs", data={})
            self._comm.on_msg(self._handle_comm_message)
        if self._jm is None:
            self._jm = JobManager()
        if self._msg_map is None:
            self._msg_map = {
                MESSAGE_TYPE["CANCEL"]: self._cancel_jobs,
                MESSAGE_TYPE["CELL_JOB_STATUS"]: self._get_job_states_by_cell_id,
                MESSAGE_TYPE["INFO"]: self._get_job_info,
                MESSAGE_TYPE["LOGS"]: self._get_job_logs,
                MESSAGE_TYPE["RETRY"]: self._retry_jobs,
                MESSAGE_TYPE["START_UPDATE"]: self._modify_job_updates,
                MESSAGE_TYPE["STATUS"]: self._get_job_states,
                MESSAGE_TYPE["STATUS_ALL"]: self._get_all_job_states,
                MESSAGE_TYPE["STOP_UPDATE"]: self._modify_job_updates,
            }
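
    # A hedged sketch of how the dispatch table above is meant to be used;
    # the handler lookup below is an assumption about the comm handler,
    # which is not shown in this excerpt:
    #
    #   msg_type = request.request_type  # e.g. MESSAGE_TYPE["STATUS"]
    #   if msg_type not in self._msg_map:
    #       raise JobRequestException(f"Unhandled message type: {msg_type}")
    #   self._msg_map[msg_type](request)  # e.g. self._get_job_states(request)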

    def _get_job_ids(self, req: JobRequest):
        if req.has_batch_id():
            return self._jm.update_batch_job(req.batch_id)

        try:
            return req.job_id_list
        except Exception as ex:
            raise JobRequestException(ONE_INPUT_TYPE_ONLY_ERR) from ex

    def start_job_status_loop(
        self,
        init_jobs: bool = False,
        cell_list: List[str] = None,
    ) -> None:
        """
        Starts the job status lookup loop. This runs every LOOKUP_TIMER_INTERVAL seconds.

        :param init_jobs: If init_jobs=True, this attempts to (re-)initialize
            the JobManager's list of known jobs from the workspace.
        :param cell_list: from FE, the list of extant cell IDs
        """
        self._running_lookup_loop = True
        if init_jobs:
            try:
                self._jm.initialize_jobs(cell_list)
            except Exception as e:
                error = {
                    "error": "Unable to get initial jobs list",
                    "message": getattr(e, "message", UNKNOWN_REASON),
                    "code": getattr(e, "code", -1),
                    "source": getattr(e, "source", "jobmanager"),
                    "name": getattr(e, "name", type(e).__name__),
                }
                self.send_comm_message(MESSAGE_TYPE["ERROR"], error)
                # if job init failed, set the lookup loop var back to False and return
                self._running_lookup_loop = False
                return
        if self._lookup_timer is None:
            self._lookup_job_status_loop()

    def stop_job_status_loop(self, *args, **kwargs) -> None:
        """
        Stops the job status lookup loop if it's running. Otherwise, this effectively
        does nothing.
        """
        if self._lookup_timer:
            self._lookup_timer.cancel()
            self._lookup_timer = None
        self._running_lookup_loop = False

    def _lookup_job_status_loop(self) -> None:
        """
        Runs one pass of the job status lookup, then spawns a Timer thread to run
        itself again. LOOKUP_TIMER_INTERVAL sets the frequency at which the loop runs.
        """
        all_job_states = self._get_all_job_states()
        if len(all_job_states) == 0 or not self._running_lookup_loop:
            self.stop_job_status_loop()
        else:
            self._lookup_timer = threading.Timer(
                LOOKUP_TIMER_INTERVAL, self._lookup_job_status_loop
            )
            self._lookup_timer.start()
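
    # A minimal standalone sketch of the self-rescheduling Timer pattern used
    # above (illustrative only; the interval and function names are
    # assumptions, not part of the original listing):
    #
    #     import threading
    #
    #     def poll(interval=10.0):
    #         do_one_lookup()                       # one pass of status lookup
    #         t = threading.Timer(interval, poll)   # schedule the next pass
    #         t.daemon = True
    #         t.start()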

    def _get_all_job_states(
        self, req: JobRequest = None, ignore_refresh_flag: bool = False
    ) -> dict:
        """
        Fetches status of all jobs in the current workspace and sends them to the front end.
        req can be None, as it's not used.
        """
        all_job_states = self._jm.get_all_job_states(
            ignore_refresh_flag=ignore_refresh_flag
        )
        self.send_comm_message(MESSAGE_TYPE["STATUS_ALL"], all_job_states)
        return all_job_states

    def _get_job_states_by_cell_id(self, req: JobRequest = None) -> dict:
        """
        Fetches status of all jobs associated with the given cell ID(s)
        :param req: a JobRequest with the cell_id_list of interest
        :returns: dict in the form
        {
            "jobs": {
                # dict with job IDs as keys and job states as values
                "job_one": { ... },
                "job_two": { ... },
            },
            "mapping": {
                # dict with cell IDs as keys and values being the set of job IDs associated
                # with that cell
                "cell_one": [ "job_one", "job_two", ... ],
                "cell_two": [ ... ],
            }
        }
        """
        cell_job_states = self._jm.get_job_states_by_cell_id(
            cell_id_list=req.cell_id_list
        )
        self.send_comm_message(MESSAGE_TYPE["CELL_JOB_STATUS"], cell_job_states)
        return cell_job_states

    def _get_job_info(self, req: JobRequest) -> dict:
        """
        Look up job info. This is just some high-level generic information about the running
        job, including the app id, name, and job parameters.
        :param req: a JobRequest with the job_id_list of interest
        :returns: a dict keyed with job IDs and with values of dicts with the following keys:
            - app_id - str - module/name,
            - app_name - str - name of the app as it shows up in the Narrative interface
            - batch_id - str - the batch parent ID (if appropriate)
            - job_id - str - just re-reporting the id string
            - job_params - dict - the params that were passed to that particular job
        """
        job_id_list = self._get_job_ids(req)
        job_info = self._jm.get_job_info(job_id_list)
        self.send_comm_message(MESSAGE_TYPE["INFO"], job_info)
        return job_info

    def __get_job_states(self, job_id_list) -> dict:
        """
        Look up job states.

        Returns a dictionary of job state information indexed by job ID.
        """
        output_states = self._jm.get_job_states(job_id_list)
        self.send_comm_message(MESSAGE_TYPE["STATUS"], output_states)
        return output_states

    def get_job_state(self, job_id: str) -> dict:
        """
        This differs from _get_job_states (the underscored version) in that
        it just takes a job_id string, not a JobRequest.
        """
        return self.__get_job_states([job_id])

    def _get_job_states(self, req: JobRequest) -> dict:
        job_id_list = self._get_job_ids(req)
        return self.__get_job_states(job_id_list)

    def _modify_job_updates(self, req: JobRequest) -> dict:
        """
        Modifies whether a set of jobs is included in the update loop.
        If this is a request to start job updates, this starts the update loop that
        sends update messages across the job channel.
        If this is a request to stop job updates, this sends that request to the
        JobManager, which might have the side effect of shutting down the update loop
        if nothing is left requesting job status.

        If a given job_id in the request doesn't exist in the current Narrative, or is None,
        this raises a JobRequestException.
        """
        job_id_list = self._get_job_ids(req)
        update_type = req.request_type
        if update_type == MESSAGE_TYPE["START_UPDATE"]:
            update_refresh = True
        elif update_type == MESSAGE_TYPE["STOP_UPDATE"]:
            update_refresh = False
        else:
            # this should be impossible
            raise JobRequestException("Unknown request")

        self._jm.modify_job_refresh(job_id_list, update_refresh)

        if update_refresh:
            self.start_job_status_loop()

        output_states = self._jm.get_job_states(job_id_list)
        self.send_comm_message(MESSAGE_TYPE["STATUS"], output_states)
        return output_states
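
    # Illustrative request payloads for the handler above. The request_type
    # names follow the class docstring ('start_job_update'/'stop_job_update');
    # the exact wire format is an assumption:
    #
    #     start = {"request_type": "start_job_update", "job_id_list": ["job_one"]}
    #     stop = {"request_type": "stop_job_update", "job_id_list": ["job_one"]}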

    def _cancel_jobs(self, req: JobRequest) -> dict:
        """
        This cancels a running job.
        If there are no valid jobs, this raises a JobRequestException.
        If there's an error while attempting to cancel, this raises a NarrativeError.
        In the end, after a successful cancel, this finishes up by fetching and returning the
        job state with the new status.
        """
        job_id_list = self._get_job_ids(req)
        cancel_results = self._jm.cancel_jobs(job_id_list)
        self.send_comm_message(MESSAGE_TYPE["STATUS"], cancel_results)
        return cancel_results

    def _retry_jobs(self, req: JobRequest) -> dict:
        job_id_list = self._get_job_ids(req)
        retry_results = self._jm.retry_jobs(job_id_list)
        self.send_comm_message(MESSAGE_TYPE["RETRY"], retry_results)
        return retry_results

    def _get_job_logs(self, req: JobRequest) -> dict:
        """
        This returns a set of job logs based on the info in the request.
        """
        job_id_list = self._get_job_ids(req)
        log_output = self._jm.get_job_logs_for_list(
            job_id_list,
            num_lines=req.rq_data.get("num_lines", None),
            first_line=req.rq_data.get("first_line", 0),
            latest=req.rq_data.get("latest", False),
        )
        self.send_comm_message(MESSAGE_TYPE["LOGS"], log_output)
        return log_output

    def _handle_comm_message(self, msg: dict) -> dict:
        """
        Handles comm messages that come in from the other end of the KBaseJobs channel.
        Messages get translated into one or more JobRequest objects, which are then
        passed to the right handler, based on the request.

        A handler dictionary is created on JobComm creation.

        Any unknown request is returned over the channel with message type 'job_error', and a
        JobRequestException is raised.
        """
        with exc_to_msg(msg):
            request = JobRequest(msg)

            kblogging.log_event(
                self._log, "handle_comm_message", {"msg": request.request_type}
            )
            if request.request_type not in self._msg_map:
                raise JobRequestException(
                    f"Unknown KBaseJobs message '{request.request_type}'"
                )

            return self._msg_map[request.request_type](request)
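
    # A sketch of an incoming comm message as consumed above. The content/data
    # nesting mirrors the dict handling in send_error_message below; the exact
    # shape is an assumption:
    #
    #     msg = {"content": {"data": {"request_type": "job_status",
    #                                 "job_id_list": ["job_one"]}}}
    #     JobComm()._handle_comm_message(msg)  # dispatches to _get_job_states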

    def send_comm_message(self, msg_type: str, content: dict) -> None:
        """
        Sends an ipykernel.Comm message to the KBaseJobs channel with the given msg_type
        and content. These just get encoded into the message itself.
        """
        msg = {"msg_type": msg_type, "content": content}
        self._comm.send(msg)

    def send_error_message(
        self, req: Union[JobRequest, dict, str], content: dict = None
    ) -> None:
        """
        Sends a comm message over the KBaseJobs channel as an error. This will have msg_type set to
        ERROR ('job_error'), and include the original request in the message content as
        "source".

        req can be the original request message or its JobRequest form.
        Since the latter is made from the former, they have the same information.
        It can also be a string or None if the error arises outside of handling a
        JobComm request (e.g. via the exc_to_msg context manager).

        This sends a packet that looks like:
        {
            request: the original JobRequest data object, function params, or function name
            source: the function request that spawned the error
            other fields about the error, dependent on the content.
        }
        """
        error_content = {}
        if isinstance(req, JobRequest):
            error_content["request"] = req.rq_data
            error_content["source"] = req.request_type
        elif isinstance(req, dict):
            data = req.get("content", {}).get("data", {})
            error_content["request"] = data
            error_content["source"] = data.get("request_type")
        elif isinstance(req, str) or req is None:
            error_content["request"] = req
            error_content["source"] = req

        if content is not None:
            error_content.update(content)

        self.send_comm_message(MESSAGE_TYPE["ERROR"], error_content)
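
    # For example (an illustrative sketch, not from the original source), an
    # error raised outside of any request:
    #
    #     JobComm().send_error_message("some_function", {"message": "oops"})
    #
    # sends a 'job_error' packet shaped like:
    #
    #     {"request": "some_function", "source": "some_function", "message": "oops"}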
Exemplo n.º 14
0
class AppManager(object):
    """
    The main class for managing how KBase apps get run. This contains functions
    for showing app descriptions, their usage (how to invoke various parameters),
    and, ultimately, for running the app.

    A typical flow might be like this.
    am = AppManager()
    am.available_apps()
        # show the set of apps with a brief description of each.
    am.app_usage(app_id)
        # show how to use an app and set its parameters.
    job = am.run_app(app_id, input1=value1, input2=value2, ...)
        # run an app with given inputs.
    """
    __instance = None

    nms = clients.get('narrative_method_store')
    njs = clients.get('job_service')
    ws_client = clients.get('workspace')
    spec_manager = SpecManager()
    _log = kblogging.get_logger(__name__)
    _comm = None
    viewer_count = 1

    def __new__(cls):
        if AppManager.__instance is None:
            AppManager.__instance = object.__new__(cls)
            AppManager.__instance._comm = None
        return AppManager.__instance

    def reload(self):
        """
        Reloads all app specs into memory from the App Catalog.
        Any outputs of app_usage, app_description, or available_apps
        should be run again after the update.
        """
        self.spec_manager.reload()

    def app_usage(self, app_id, tag='release'):
        """
        This shows the list of inputs and outputs for a given app with a given
        tag. By default, this is rendered as pretty HTML, but the output can be
        wrapped in str() to get a bare formatted string.

        If either the app_id is unknown, or isn't found with the given release tag,
        or if the tag is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev (default=release)
        """
        return self.spec_manager.app_usage(app_id, tag)

    def app_description(self, app_id, tag='release'):
        """
        Returns the app description in a printable HTML format.

        If either the app_id is unknown, or isn't found with the given release tag,
        or if the tag is unknown, a ValueError will be raised.

        Parameters:
        -----------
        app_id : string
            A KBase app id, generally of the format Module_name/app_name
            (see available_apps for a list)
        tag : Which version of the app to view - either release, beta, or dev (default=release)
        """
        return self.spec_manager.app_description(app_id, tag)

    def available_apps(self, tag="release"):
        """
        Lists the set of available apps for a given tag in a simple table.
        If the tag is not found, a ValueError will be raised.

        Parameters:
        -----------
        tag : Which version of the list of apps to view - either release, beta, or dev (default=release)

        """
        return self.spec_manager.available_apps(tag)

    def run_local_app(self, app_id, params, tag="release", version=None, cell_id=None, run_id=None, **kwargs):
        """
        Attempts to run a local app. These do not return a Job object, but just the result of the app.
        In most cases, this will be a Javascript display of the result, but could be anything.

        If the app_spec looks like it makes a service call, then this raises a ValueError.
        Otherwise, it validates each parameter in **kwargs against the app spec, executes it, and
        returns the result.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'view_expression_profile'
        params - the dictionary of parameters for the app. Should be key-value
                 pairs where the keys are strings. If any non-optional
                 parameters are missing, an informative string will be printed.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.
        **kwargs - these are the set of parameters to be used with the app.
                   They can be found by using the app_usage function. If any
                   non-optional parameters are missing, a ValueError will be raised.

        Example:
        run_local_app('NarrativeViewers/view_expression_profile', version='0.0.1', input_expression_matrix="MyMatrix", input_gene_ids="1234")
        """
        try:
            if params is None:
                params = dict()
            return self._run_local_app_internal(app_id, params, tag, version, cell_id, run_id, **kwargs)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
            self._send_comm_message('run_status', {
                'event': 'error',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'error_message': e_message,
                'error_type': e_type,
                'error_stacktrace': e_trace
            })
            # raise
            print("Error while trying to start your app (run_local_app)!\n-------------------------------------\n" + str(e))

    def _run_local_app_internal(self, app_id, params, tag, version, cell_id, run_id, **kwargs):
        self._send_comm_message('run_status', {
            'event': 'validating_app',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id
        })

        ### TODO: this needs restructuring so that we can send back validation failure
        ### messages. Perhaps a separate function and catch the errors, or return an
        ### error structure.

        # Intro tests:
        self.spec_manager.check_app(app_id, tag, raise_exception=True)

        if version is not None and tag != "release":
            raise ValueError("App versions only apply to released app modules!")

        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)

        if 'behavior' not in spec:
            raise ValueError("This app appears invalid - it has no defined behavior")

        behavior = spec['behavior']

        if 'kb_service_input_mapping' in behavior:
            # it's a service! Should run this with run_app!
            raise ValueError('This app appears to be a long-running job! Please start it using the run_app function instead.')

        if 'script_module' in behavior or 'script_name' in behavior:
            # It's an old NJS script. These don't work anymore.
            raise ValueError('This app relies on a service that is now obsolete. Please contact the administrator.')

        # Here, we just deal with two behaviors:
        # 1. None of the above - it's a viewer.
        # 2. ***TODO*** python_class / python_function. Import and exec the python code.

        # for now, just map the inputs to outputs.
        # First, validate.
        # Preflight check the params - all required ones are present, all values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)
        (params, ws_refs) = self._validate_parameters(app_id, tag, spec_params, params)

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'username': system_variable('user_id'),
            'ws': system_variable('workspace')
        }
        kblogging.log_event(self._log, "run_local_app", log_info)

        self._send_comm_message('run_status', {
            'event': 'success',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id
        })

        # now just map onto outputs.
        (output_widget, widget_params) = map_outputs_from_state([], params, spec)
        return WidgetManager().show_output_widget(output_widget, widget_params, cell_id=cell_id, tag=tag)

    def run_widget_app(self, app_id, tag="release", version=None, cell_id=None, run_id=None):
        """
        Attempts to run a widget app. These do not return a Job object, but just the result of the app.
        In most cases, this will be a Javascript display of the result, but could be anything.

        If the app_spec looks like it makes a service call, then this raises a ValueError.
        Otherwise, it looks up the app spec and renders the app's custom input widget.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'view_expression_profile'
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.

        Example:
        run_widget_app('NarrativeViewers/view_expression_profile', version='0.0.1')
        """
        try:
            return self._run_widget_app_internal(app_id, tag, version, cell_id, run_id)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
            self._send_comm_message('run_status', {
                'event': 'error',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'error_message': e_message,
                'error_type': e_type,
                'error_stacktrace': e_trace
            })
            # raise
            print("Error while trying to start your app (run_widget_app)!\n-------------------------------------\n" + str(e))

    def _run_widget_app_internal(self, app_id, tag, version, cell_id, run_id):
        self._send_comm_message('run_status', {
            'event': 'validating_app',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id
        })

        # Intro tests:
        self.spec_manager.check_app(app_id, tag, raise_exception=True)

        if version is not None and tag != "release":
            raise ValueError("App versions only apply to released app modules!")

        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)

        if 'behavior' not in spec:
            raise ValueError("This app appears invalid - it has no defined behavior")

        behavior = spec['behavior']

        if 'kb_service_input_mapping' in behavior:
            # it's a service! Should run this with run_app!
            raise ValueError('This app appears to be a long-running job! Please start it using the run_app function instead.')

        if 'script_module' in behavior or 'script_name' in behavior:
            # It's an old NJS script. These don't work anymore.
            raise ValueError('This app relies on a service that is now obsolete. Please contact the administrator.')

        # Here, we just deal with two behaviors:
        # 1. None of the above - it's a viewer.
        # 2. ***TODO*** python_class / python_function. Import and exec the python code.

        # for now, just map the inputs to outputs.
        # First, validate.
        # Preflight check the params - all required ones are present, all values are the right type, all numerical values are in given ranges
        #spec_params = self.spec_manager.app_params(spec)
        #(params, ws_refs) = self._validate_parameters(app_id, tag, spec_params, kwargs)

        log_info = {
            'app_id': app_id,
            'tag': tag,
            'username': system_variable('user_id'),
            'ws': system_variable('workspace')
        }
        kblogging.log_event(self._log, "run_widget_app", log_info)

        self._send_comm_message('run_status', {
            'event': 'success',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id
        })

        # now just map onto outputs.
        custom_widget = spec.get('widgets', {}).get('input', None)
        return WidgetManager().show_custom_widget(custom_widget, app_id, version, tag, spec, cell_id)

    def _validate_parameters(self, app_id, tag, spec_params, params):
        """
        Validates the dict of params against the spec_params. If all is good, it updates a few
        parameters that need it - checkboxes go from True/False to 1/0, and sets default values
        where necessary.
        Then it returns a tuple like this:
        (dict_of_params, list_of_ws_refs)
        where list_of_ws_refs is the list of workspace references for objects being passed into
        the app.

        If it fails, this will raise a ValueError with a description of the problem and a
        (hopefully useful!) hint for the user as to what went wrong.
        """
        spec_param_ids = [p['id'] for p in spec_params]

        # First, test for presence.
        missing_params = list()
        for p in spec_params:
            if not p['optional'] and not p['default'] and not params.get(p['id'], None):
                missing_params.append(p['id'])
        if len(missing_params):
            raise ValueError('Missing required parameters {} - try executing app_usage("{}", tag="{}") for more information'.format(json.dumps(missing_params), app_id, tag))

        # Next, test for extra params that don't make sense
        extra_params = list()
        for p in params.keys():
            if p not in spec_param_ids:
                extra_params.append(p)
        if len(extra_params):
            raise ValueError('Unknown parameters {} - maybe something was misspelled?\nexecute app_usage("{}", tag="{}") for more information'.format(json.dumps(extra_params), app_id, tag))

        # Now, validate parameter values.
        # Should also check if input (NOT OUTPUT) object variables are present in the current workspace
        workspace = system_variable('workspace')
        ws_id = system_variable('workspace_id')
        if workspace is None or ws_id is None:
            raise ValueError('Unable to retrieve current Narrative workspace information! workspace={}, workspace_id={}'.format(workspace, ws_id))

        param_errors = list()
        # If they're workspace objects, track their refs in a list we'll pass to run_job as
        # a separate param to track provenance.
        ws_input_refs = list()
        for p in spec_params:
            if p['id'] in params:
                (wsref, err) = self._check_parameter(p, params[p['id']], workspace)
                if err is not None:
                    param_errors.append("{} - {}".format(p['id'], err))
                if wsref is not None:
                    if isinstance(wsref, list):
                        for ref in wsref:
                            if ref is not None:
                                ws_input_refs.append(ref)
                    else:
                        ws_input_refs.append(wsref)
        if len(param_errors):
            raise ValueError('Parameter value errors found!\n{}'.format("\n".join(param_errors)))

        # Hooray, parameters are validated. Set them up for transfer.
        for p in spec_params:
            # If any param is a checkbox, need to map from boolean to actual expected value in p['checkbox_map']
            # note that True = 0th elem, False = 1st
            if p['type'] == 'checkbox':
                if p['id'] in params:
                    checkbox_idx = 0 if params[p['id']] else 1
                    params[p['id']] = p['checkbox_map'][checkbox_idx]
            # While we're at it, set the default values for any unset parameters that have them
            if p['default'] and p['id'] not in params:
                params[p['id']] = p['default']

        return (params, ws_input_refs)
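
    # Illustrative behavior of the post-validation fixups above (the spec
    # fields follow the Spec Manager conventions used in this listing; the
    # concrete values are assumptions):
    #
    #     spec_params = [{'id': 'verbose', 'type': 'checkbox', 'optional': True,
    #                     'default': None, 'checkbox_map': [1, 0],
    #                     'allow_multiple': False}]
    #     # params == {'verbose': True} becomes {'verbose': 1}, since True maps
    #     # to checkbox_map[0] and False to checkbox_map[1].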

    def run_app(self, app_id, params, tag="release", version=None, cell_id=None, run_id=None, **kwargs):
        """
        Attempts to run the app, returns a Job with the running app info.
        If this is given a cell_id, then returns None. If not, it returns the generated
        Job object.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - this is the dictionary of parameters to be used with the app.
                 They can be found by using the app_usage function. If any
                 non-optional parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.
        **kwargs - these are the set of parameters to be used with the app.
                   They can be found by using the app_usage function. If any
                   non-optional parameters are missing, a ValueError will be raised.

        Example:
        --------
        run_app('MegaHit/run_megahit', version=">=1.0.0", read_library_name="My_PE_Library", output_contigset_name="My_Contig_Assembly")
        """

        try:
            if params is None:
                params = dict()
            return self._run_app_internal(app_id, params, tag, version, cell_id, run_id, **kwargs)
        except Exception as e:
            e_type = type(e).__name__
            e_message = str(e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
            e_code = getattr(e, 'code', -1)
            e_source = getattr(e, 'source', 'appmanager')
            self._send_comm_message('run_status', {
                'event': 'error',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'error_message': e_message,
                'error_type': e_type,
                'error_stacktrace': e_trace,
                'error_code': e_code,
                'error_source': e_source
            })
            print("Error while trying to start your app (run_app)!\n-------------------------------------\n" + str(e))
            return

    def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id, **kwargs):
        """
        Attempts to run the app, returns a Job with the running app info.
        Should *hopefully* also inject that app into the Narrative's metadata.
        Probably need some kind of JavaScript-foo to get that to work.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - the dictionary of parameters.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.
        **kwargs - these are the set of parameters to be used with the app.
                   They can be found by using the app_usage function. If any
                   non-optional parameters are missing, a ValueError will be raised.

        Example:
        --------
        my_job = mm.run_app('MegaHit/run_megahit', version=">=1.0.0", read_library_name="My_PE_Library", output_contigset_name="My_Contig_Assembly")
        """

        ### TODO: this needs restructuring so that we can send back validation failure
        ### messages. Perhaps a separate function and catch the errors, or return an
        ### error structure.

        # Intro tests:
        self.spec_manager.check_app(app_id, tag, raise_exception=True)

        if version is not None and tag != "release":
            if re.match(r'\d+\.\d+\.\d+', version) is not None:
                raise ValueError("Semantic versions only apply to released app modules. You can use a Git commit hash instead to specify a version.")

        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)

        # There's some branching to do here.
        # Cases:
        # app has behavior.kb_service_input_mapping -- is a valid long-running app.
        # app only has behavior.output_mapping - not kb_service_input_mapping or script_module - it's a viewer and should return immediately
        # app has other things besides kb_service_input_mapping -- not a valid app.
        if 'behavior' not in spec:
            raise Exception("This app appears invalid - it has no defined behavior")

        if 'kb_service_input_mapping' not in spec['behavior']:
            raise Exception("This app does not appear to be a long-running job! Please use 'run_local_app' to start this instead.")

        # Preflight check the params - all required ones are present, all values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        (params, ws_input_refs) = self._validate_parameters(app_id, tag, spec_params, params)

        ws_id = system_variable('workspace_id')
        if ws_id is None:
            raise ValueError('Unable to retrieve current Narrative workspace information!')

        input_vals = self._map_inputs(spec['behavior']['kb_service_input_mapping'], params)

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)
        service_url = spec['behavior']['kb_service_url']


        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + '.' + service_method
        job_meta = {'tag': tag}
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # This is the input set for NJSW.run_job. Now we need the workspace id and whatever fits in the metadata.
        job_runner_inputs = {
            'method': function_name,
            'service_ver': service_ver,
            'params': input_vals,
            'app_id': app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        if len(ws_input_refs) > 0:
            job_runner_inputs['source_ws_objects'] = ws_input_refs

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = self.njs.run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      app_id,
                      [params],
                      system_variable('user_id'),
                      tag=tag,
                      app_version=service_ver,
                      cell_id=cell_id,
                      run_id=run_id)

        self._send_comm_message('run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
        JobManager().register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job

    def _map_inputs(self, input_mapping, params):
        """
        Maps the dictionary of parameters and inputs based on rules provided in the input_mapping.
        This iterates over the list of input_mappings, and uses them as a filter to apply to each
        parameter.

        Returns a list of inputs that can be passed directly to NJSW.run_job

        input_mapping is a list of dicts, as defined by NarrativeMethodStore.ServiceMethodInputMapping.
        params is a dict of key-value-pairs, each key is the input_parameter field of some parameter.
        """
        inputs_dict = dict()
        for p in input_mapping:
            # 2 steps - figure out the proper value, then figure out the proper position.
            # value first!
            p_value = None
            if 'input_parameter' in p:
                p_value = params.get(p['input_parameter'], None)
                # turn empty strings into None
                if isinstance(p_value, basestring) and len(p_value) == 0:
                    p_value = None
            elif 'narrative_system_variable' in p:
                p_value = system_variable(p['narrative_system_variable'])
            if 'constant_value' in p and p_value is None:
                p_value = p['constant_value']
            if 'generated_value' in p and p_value is None:
                p_value = self._generate_input(p['generated_value'])
            if 'target_type_transform' in p:
                p_value = self._transform_input(p['target_type_transform'], p_value)

            # get position!
            arg_position = p.get('target_argument_position', 0)
            target_prop = p.get('target_property', None)
            if target_prop is not None:
                final_input = inputs_dict.get(arg_position, dict())
                if '/' in target_prop:
                    ## This is the case when slashes in target_prop separate elements in nested maps.
                    ## We ignore escaped slashes (separate backslashes should be escaped as well).
                    bck_slash = u"\u244A"
                    fwd_slash = u"\u20EB"
                    temp_string = target_prop.replace("\\\\", bck_slash).replace("\\/", fwd_slash)
                    temp_path = []
                    for part in temp_string.split("/"):
                        part = part.replace(bck_slash, "\\").replace(fwd_slash, "/")
                        temp_path.append(part.encode('ascii','ignore'))
                    temp_map = final_input
                    temp_key = None
                    ## We're going along the path and creating intermediate dictionaries.
                    for temp_path_item in temp_path:
                        if temp_key:
                            if temp_key not in temp_map:
                                temp_map[temp_key] = {}
                            temp_map = temp_map[temp_key]
                        temp_key = temp_path_item
                    ## temp_map points to deepest nested map now, temp_key is last item in path
                    temp_map[temp_key] = p_value
                else:
                    final_input[target_prop] = p_value
                inputs_dict[arg_position] = final_input
            else:
                inputs_dict[arg_position] = p_value

        inputs_list = list()
        keys = sorted(inputs_dict.keys())
        for k in keys:
            inputs_list.append(inputs_dict[k])
        return inputs_list
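
    # A small worked example of the mapping above (illustrative; the mapping
    # dicts follow NarrativeMethodStore.ServiceMethodInputMapping):
    #
    #     input_mapping = [
    #         {'input_parameter': 'genome', 'target_argument_position': 0,
    #          'target_property': 'genome_name'},
    #         {'narrative_system_variable': 'workspace',
    #          'target_argument_position': 0, 'target_property': 'workspace'},
    #     ]
    #     # with params == {'genome': 'MyGenome'}, this yields
    #     # [{'genome_name': 'MyGenome', 'workspace': <current workspace>}]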

    def _transform_input(self, transform_type, value):
        """
        Transforms an input according to the rules given in NarrativeMethodStore.ServiceMethodInputMapping
        Really, there are four types of transforms possible:
          1. ref - turns the input string into a workspace ref.
          2. int - tries to coerce the input string into an int.
          3. list<type> - turns the given list into a list of the given type.
          (4.) none or None - doesn't transform.

        Returns a transformed (or not) value.
        """
        if transform_type == "none" or transform_type is None:
            return value

        elif transform_type == "ref":
            # make a workspace ref
            if value is not None:
                value = system_variable('workspace') + '/' + value
            return value

        elif transform_type == "int":
            # make it an integer, or None if blank.
            if value is None or len(str(value).strip()) == 0:
                return None
            return int(value)

        elif transform_type.startswith("list<") and transform_type.endswith(">"):
            # make it a list of transformed types.
            list_type = transform_type[5:-1]
            if isinstance(value, list):
                ret = []
                for pos in range(0, len(value)):
                    ret.append(self._transform_input(list_type, value[pos]))
                return ret
            else:
                return [self._transform_input(list_type, value)]

        else:
            raise ValueError("Unsupported Transformation type: " + transform_type)


    def _generate_input(self, generator):
        """
        Generates an input value using rules given by NarrativeMethodStore.AutoGeneratedValue.
        generator - dict
            has 3 optional properties:
            prefix - if present, is prepended to the generated string.
            symbols - if present is the number of symbols to autogenerate (if not present, default=8)
            suffix - if present, is appended to the generated string.
        So, if generator is None or an empty dict, returns an 8-symbol string.
        """
        if generator is None:
            generator = {}
        symbols = 8
        if 'symbols' in generator:
            symbols = int(generator['symbols'])
        ret = ''.join([chr(random.randrange(0, 26) + ord('A')) for _ in xrange(symbols)])
        if 'prefix' in generator:
            ret = str(generator['prefix']) + ret
        if 'suffix' in generator:
            ret = ret + str(generator['suffix'])
        return ret
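
    # For example (illustrative; the output is random):
    #
    #     self._generate_input({'prefix': 'obj_', 'symbols': 4})  # -> e.g. 'obj_QXZR'
    #     self._generate_input(None)                              # -> 8 random letters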

    def _check_parameter(self, param, value, workspace):
        """
        Checks if the given value matches the rules provided in the param dict.
        Returns a tuple (ws_ref, error): error is None if the value is valid,
        or a string describing the problem if it is not.

        This is a pretty light wrapper around _validate_param_value that handles the case
        where the given value is a list.

        Parameters:
        -----------
        param : dict
            A dict representing a single KBase App parameter, generated by the Spec Manager
        value : any
            A value input by the user
        workspace : string
            The name of the current workspace to search against (if needed)
        """
        if param['allow_multiple'] and isinstance(value, list):
            ws_refs = list()
            error_list = list()
            for v in value:
                (ref, err) = self._validate_param_value(param, v, workspace)
                if err:
                    error_list.append(err)
                if ref:
                    ws_refs.append(ref)
            if len(error_list):
                return (None, "\n\t".join(error_list))
            else:
                return (ws_refs, None)
        return self._validate_param_value(param, value, workspace)
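
    # Illustrative: with allow_multiple=True, a list value is validated
    # element by element, e.g. (the refs and errors shown are assumptions):
    #
    #     self._check_parameter(param, ['obj1', 'obj2'], 'MyWorkspace')
    #     # -> (['1/2/3', '1/4/5'], None) on success,
    #     #    (None, 'joined error messages') on failure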

    def _validate_param_value(self, param, value, workspace):
        """
        Tests a value to make sure it's valid, based on the rules given in the param dict.
        Returns a tuple (ws_ref, error): error is None if valid, an error string if not.

        Parameters:
        -----------
        param : dict
            A dict representing a single KBase App parameter, generated by the Spec Manager.
            This contains the rules for processing any given values.
        value : any
            A value input by the user - likely either None, int, float, string, or list
        workspace : string
            The name of the current workspace to test workspace object types against, if
            required by the parameter.
        """
        # The workspace reference for the parameter. Can be None, and returned as such.
        ws_ref = None

        # allow None to pass, we'll just pass it to the method and let it get rejected there.
        if value is None:
            return (ws_ref, None)

        # Also, for strings, an empty string is treated the same as null/None
        if param['type'] in ['string', 'dropdown', 'checkbox'] and isinstance(value, basestring) and value == '':
            return (ws_ref, None)

        # cases - value == list (checked by wrapping function, _check_parameter), int, float, others get rejected
        if not (isinstance(value, basestring) or
                isinstance(value, int) or
                isinstance(value, float)):
            return (ws_ref, "input type not supported - only str, int, float, or list")

        # check types. basestring is pretty much anything (it'll just get casted),
        # but ints, floats, or lists are funky.
        if param['type'] == 'int' and not isinstance(value, int):
            return (ws_ref, 'Given value {} is not an int'.format(value))
        elif param['type'] == 'float' and not (isinstance(value, float) or isinstance(value, int)):
            return (ws_ref, 'Given value {} is not a number'.format(value))

        # if it's expecting a workspace object, check if that's present, and a valid type
        if 'allowed_types' in param and len(param['allowed_types']) > 0 and not param['is_output']:
            try:
                info = self.ws_client.get_object_info_new({'objects': [{'workspace': workspace, 'name': value}]})[0]
                ws_ref = "{}/{}/{}".format(info[6], info[0], info[4])
                type_ok = False
                for t in param['allowed_types']:
                    if re.match(t, info[2]):
                        type_ok = True
                if not type_ok:
                    return (ws_ref, 'Type of data object, {}, does not match allowed types'.format(info[2]))
            except Exception as e:
                return (ws_ref, 'Data object named {} not found in this Narrative.'.format(value))

        # if it expects a set of allowed values, check if this one matches
        if 'allowed_values' in param:
            if value not in param['allowed_values']:
                return (ws_ref, "Given value '{}' is not permitted in the allowed set.".format(value))

        # if it expects a numerical value in a certain range, check that.
        if 'max_val' in param:
            try:
                if float(value) > param['max_val']:
                    return (ws_ref, "Given value {} should be <= {}".format(value, param['max_val']))
            except:
                return (ws_ref, "Given value {} must be a number".format(value))

        if 'min_val' in param:
            try:
                if float(value) < param['min_val']:
                    return (ws_ref, "Given value {} should be >= {}".format(value, param['min_val']))
            except:
                return (ws_ref, "Given value {} must be a number".format(value))

        # if it's an output object, make sure it follows the data object rules.
        if param['is_output']:
            if re.search(r'\s', value):
                return (ws_ref, "Spaces are not allowed in data object names.")
            if re.match(r'^\d+$', value):
                return (ws_ref, "Data objects cannot be just a number.")
            if not re.match(r'^[a-z0-9|\.|\||_\-]*$', value, re.IGNORECASE):
                return (ws_ref, "Data object names can only include symbols: _ - . |")

        # Last, regex. not being used in any extant specs, but cover it anyway.
        if 'regex_constraint' in param:
            for regex in param['regex_constraint']:
                if not re.match(regex, value):
                    return (ws_ref, 'Value {} does not match required regex {}'.format(value, regex))

        # Whew. Passed all filters!
        return (ws_ref, None)

    def _send_comm_message(self, msg_type, content):
        JobManager()._send_comm_message(msg_type, content)
Exemplo n.º 15
0
class JobManager(object):
    """
    The KBase Job Manager class. This handles all jobs and makes their status available.
    On status lookups, it feeds the results to the KBaseJobs channel that the front end
    listens to.
    """
    __instance = None

    # keys = job_id, values = { refresh = T/F, job = Job object }
    _running_jobs = dict()
    # keys = job_id, values = state from either Job object or NJS (these are identical)
    _completed_job_states = dict()

    _lookup_timer = None
    _comm = None
    _log = kblogging.get_logger(__name__)
    # TODO: should this not be done globally?
    _running_lookup_loop = False

    def __new__(cls):
        if JobManager.__instance is None:
            JobManager.__instance = object.__new__(cls)
        return JobManager.__instance

    def initialize_jobs(self, start_lookup_thread=True):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
        4. start the status lookup loop.
        """

        the_time = int(round(time.time() * 1000))

        self._send_comm_message('start', {'time': the_time})

        ws_id = system_variable('workspace_id')
        try:
            nar_jobs = clients.get('user_and_job_state').list_jobs2({
                'authstrat': 'kbaseworkspace',
                'authparams': [str(ws_id)]
            })
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get initial jobs list',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'user_and_job_state'
            }
            self._send_comm_message('job_init_err', error)
            raise new_e

        job_ids = [j[0] for j in nar_jobs]
        job_states = clients.get('job_service').check_jobs({
            'job_ids': job_ids, 'with_job_params': 1
        })
        job_param_info = job_states.get('job_params', {})
        job_check_error = job_states.get('check_error', {})
        error_jobs = dict()
        for info in nar_jobs:
            job_id = info[0]
            user_info = info[1]
            job_meta = info[10]
            try:
                if job_id in job_param_info:
                    job_info = job_param_info[job_id]

                    job = Job.from_state(job_id,
                                         job_info,
                                         user_info[0],
                                         app_id=job_info.get('app_id'),
                                         tag=job_meta.get('tag', 'release'),
                                         cell_id=job_meta.get('cell_id', None),
                                         run_id=job_meta.get('run_id', None),
                                         token_id=job_meta.get('token_id', None),
                                         meta=job_meta)

                    # Note that when jobs for this narrative are initially loaded,
                    # they are set to not be refreshed. Rather, if a client requests
                    # updates via the start_job_update message, the refresh flag will
                    # be set to True.
                    self._running_jobs[job_id] = {
                        'refresh': 0,
                        'job': job
                    }
                elif job_id in job_check_error:
                    job_err_state = {
                        'job_state': 'error',
                        'error': {
                            'error': 'KBase execution engine returned an error while looking up this job.',
                            'message': job_check_error[job_id].get('message', 'No error message available'),
                            'name': 'Job Error',
                            'code': job_check_error[job_id].get('code', -999),
                            'exception': {
                                'error_message': 'Job lookup in execution engine failed',
                                'error_type': job_check_error[job_id].get('name', 'unknown'),
                                'error_stacktrace': job_check_error[job_id].get('error', '')
                            }
                        },
                        'cell_id': job_meta.get('cell_id', None),
                        'run_id': job_meta.get('run_id', None),
                    }
                    error_jobs[job_id] = job_err_state

            except Exception as e:
                kblogging.log_event(self._log, 'init_error', {'err': str(e)})
                new_e = transform_job_exception(e)
                error = {
                    'error': 'Unable to get job info on initial lookup',
                    'job_id': job_id,
                    'message': getattr(new_e, 'message', 'Unknown reason'),
                    'code': getattr(new_e, 'code', -1),
                    'source': getattr(new_e, 'source', 'jobmanager'),
                    'name': getattr(new_e, 'name', type(e).__name__),
                    'service': 'job_service'
                }
                self._send_comm_message('job_init_lookup_err', error)
                raise new_e  # should crash and burn on any of these.

        if len(job_check_error):
            err_str = 'Unable to find info for some jobs on initial lookup'
            err_type = 'job_init_partial_err'
            if len(job_check_error) == len(nar_jobs):
                err_str = 'Unable to get info for any job on initial lookup'
                err_type = 'job_init_lookup_err'
            error = {
                'error': err_str,
                'job_errors': error_jobs,
                'message': 'Job information was unavailable from the server',
                'code': -2,
                'source': 'jobmanager',
                'name': 'jobmanager',
                'service': 'job_service',
            }
            self._send_comm_message(err_type, error)

        if not self._running_lookup_loop and start_lookup_thread:
            # only keep one loop at a time in case this gets called again!
            if self._lookup_timer is not None:
                self._lookup_timer.cancel()
            self._running_lookup_loop = True
            self._lookup_job_status_loop()
        else:
            self._lookup_all_job_status()

    def _create_jobs(self, job_ids):
        """
        TODO: error handling
        Makes a bunch of Job objects from job_ids.
        Initially used to make Child jobs from some parent, but will eventually be adapted to all jobs on startup.
        Just slaps them all into _running_jobs
        """
        job_states = clients.get('job_service').check_jobs({'job_ids': job_ids, 'with_job_params': 1})
        for job_id in job_ids:
            ujs_info = clients.get('user_and_job_state').get_job_info2(job_id)

            if job_id not in self._running_jobs:
                job_info = job_states.get('job_params', {}).get(job_id, {})
                job_meta = ujs_info[10]
                job = Job.from_state(job_id,                                     # the id
                                     job_info,                                   # params, etc.
                                     ujs_info[2],                                # owner id
                                     app_id=job_info.get('app_id', job_info.get('method')),
                                     tag=job_meta.get('tag', 'release'),
                                     cell_id=job_meta.get('cell_id', None),
                                     run_id=job_meta.get('run_id', None),
                                     token_id=job_meta.get('token_id', None),
                                     meta=job_meta)

                # Note that when jobs for this narrative are initially loaded,
                # they are set to not be refreshed. Rather, if a client requests
                # updates via the start_job_update message, the refresh flag will
                # be set to True.
                self._running_jobs[job_id] = {
                    'refresh': 0,
                    'job': job
                }

    def list_jobs(self):
        """
        List all job ids, their info, and status in a quick HTML format.
        """
        try:
            status_set = list()
            for job_id in self._running_jobs:
                job = self._running_jobs[job_id]['job']
                job_state = self._get_job_state(job_id)
                job_state['app_id'] = job.app_id
                job_state['owner'] = job.owner
                status_set.append(job_state)
            if not status_set:
                return "No running jobs!"
            status_set = sorted(status_set, key=lambda s: s['creation_time'])
            for status in status_set:
                status['creation_time'] = datetime.datetime.strftime(
                    datetime.datetime.fromtimestamp(status['creation_time'] / 1000),
                    "%Y-%m-%d %H:%M:%S")
                exec_start = status.get('exec_start_time', None)
                finished = status.get('finish_time', None)
                if finished is not None and exec_start:
                    delta = (datetime.datetime.fromtimestamp(finished / 1000.0) -
                             datetime.datetime.fromtimestamp(exec_start / 1000.0))
                    # drop sub-second noise from the displayed run time
                    delta = delta - datetime.timedelta(microseconds=delta.microseconds)
                    status['run_time'] = str(delta)
                    status['finish_time'] = datetime.datetime.strftime(
                        datetime.datetime.fromtimestamp(finished / 1000),
                        "%Y-%m-%d %H:%M:%S")
                elif exec_start:
                    # still running: show elapsed time since execution started
                    delta = (datetime.datetime.utcnow() -
                             datetime.datetime.utcfromtimestamp(exec_start / 1000.0))
                    delta = delta - datetime.timedelta(microseconds=delta.microseconds)
                    status['run_time'] = str(delta)
                else:
                    status['run_time'] = 'Not started'

            tmpl = """
            <table class="table table-bordered table-striped table-condensed">
                <tr>
                    <th>Id</th>
                    <th>Name</th>
                    <th>Submitted</th>
                    <th>Submitted By</th>
                    <th>Status</th>
                    <th>Run Time</th>
                    <th>Complete Time</th>
                </tr>
                {% for j in jobs %}
                <tr>
                    <td>{{ j.job_id|e }}</td>
                    <td>{{ j.app_id|e }}</td>
                    <td>{{ j.creation_time|e }}</td>
                    <td>{{ j.owner|e }}</td>
                    <td>{{ j.job_state|e }}</td>
                    <td>{{ j.run_time|e }}</td>
                    <td>{% if j.finish_time %}{{ j.finish_time|e }}{% else %}Incomplete{% endif %}</td>
                </tr>
                {% endfor %}
            </table>
            """
            return HTML(Template(tmpl).render(jobs=status_set))

        except Exception as e:
            kblogging.log_event(self._log, "list_jobs.error", {'err': str(e)})
            raise

    def get_jobs_list(self):
        """
        A convenience method for fetching an unordered list of all running Jobs.
        """
        return [j['job'] for j in self._running_jobs.values()]

    def _construct_job_status(self, job, state):
        """
        Creates a Job status dictionary with structure:
        {
            owner: string (username),
            spec: app_spec (from NMS, via biokbase.narrative.jobs.specmanager)
            widget_info: (if not finished, None, else...) job.get_viewer_params result
            state: {
                job_state: string,
                error (if present): dict of error info,
                cell_id: string/None,
                run_id: string/None,
                awe_job_id: string/None,
                canceled: 0/1
                creation_time: epoch second
                exec_start_time: epoch/none,
                finish_time: epoch/none,
                finished: 0/1,
                job_id: string,
                status: (from UJS) [
                    timestamp(last_update, string),
                    stage (string),
                    status (string),
                    progress (string/None),
                    est_complete (string/None),
                    complete (0/1),
                    error (0/1)
                ],
                ujs_url: string
            }
        }
        """
        widget_info = None
        app_spec = {}

        if job is None:
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                    'message': 'Job does not exist',
                    'name': 'Job Error',
                    'code': -1,
                    'exception': {
                        'error_message': 'job not found in JobManager',
                        'error_type': 'ValueError',
                        'error_stacktrace': ''
                    }
                },
                'cell_id': None,
                'run_id': None,
            }
            return {
                'state': state,
                'spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        # try:
        #     app_spec = job.app_spec()
        # except Exception as e:
        #     kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

        if state is None:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': 'Unable to get job state for job {}'.format(job.job_id)})

            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                    'message': 'Unable to return job state',
                    'name': 'Job Error',
                    'code': -1,
                    'source': 'JobManager._construct_job_status',
                    'exception': {
                        'error_message': 'No state provided during lookup',
                        'error_type': 'null-state',
                        'error_stacktrace': '',
                    }
                },
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job.job_id
            }

        elif 'lookup_error' in state:
            kblogging.log_event(self._log, "lookup_job_status.error", {
                'err': 'Problem while getting state for job {}'.format(job.job_id),
                'info': str(state['lookup_error'])
            })
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to fetch current state. Please try again later, or contact KBase.',
                    'message': 'Error while looking up job state',
                    'name': 'Job Error',
                    'code': -1,
                    'source': 'JobManager._construct_job_status',
                    'exception': {
                        'error_message': 'Error while fetching job state',
                        'error_type': 'failed-lookup',
                    },
                    'error_response': state['lookup_error']
                },
                # these belong at the state level, mirroring the null-state branch above
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job.job_id
            }
        if state.get('finished', 0) == 1:
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
                state['job_state'] = 'error'
                state['error'] = {
                    'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                    'message': 'Unable to build output viewer parameters!',
                    'name': 'App Error',
                    'code': getattr(new_e, "code", -1),
                    'source': getattr(new_e, "source", "JobManager")
                }

        if 'canceling' in self._running_jobs[job.job_id]:
            state['job_state'] = 'canceling'

        state.update({
            'child_jobs': self._child_job_states(
                state.get('sub_jobs', []),
                job.meta.get('batch_app'),
                job.meta.get('batch_tag')
            )
        })
        if 'batch_size' in job.meta:
            state.update({'batch_size': job.meta['batch_size']})
        return {'state': state,
                'spec': app_spec,
                'widget_info': widget_info,
                'owner': job.owner,
                'listener_count': self._running_jobs[job.job_id]['refresh']}

    def _child_job_states(self, sub_job_list, app_id, app_tag):
        """
        Fetches state for all jobs in the list. These are expected to be child jobs, with no actual Job object associated.
        So if they're done, we need to do the output mapping out of band.
        But the check_jobs call with params will return the app id. So that helps.

        app_id = the id of the app that all the child jobs are running (format: module/method, like "MEGAHIT/run_megahit")
        app_tag = one of "release", "beta", "dev"
        (the above two aren't stored with the subjob metadata, and won't until we back some more on KBParallel - I want to
        lobby for pushing toward just starting everything up at once from here and letting HTCondor deal with allocation)
        sub_job_list = list of ids of jobs to look up
        """
        if not sub_job_list:
            return []

        sub_job_list = sorted(sub_job_list)
        job_info = clients.get('job_service').check_jobs({'job_ids': sub_job_list, 'with_job_params': 1})
        child_job_states = list()

        for job_id in sub_job_list:
            params = job_info['job_params'][job_id]
            # if it's error, get the error.
            if job_id in job_info['check_error']:
                error = job_info['check_error'][job_id]
                error.update({'job_id': job_id})
                child_job_states.append(error)
                continue
            # if it's done, get the output mapping.
            state = job_info['job_states'][job_id]
            if state.get('finished', 0) == 1:
                try:
                    widget_info = Job.map_viewer_params(
                        state,
                        params['params'],
                        app_id,
                        app_tag
                    )
                except ValueError:
                    widget_info = {}
                state.update({'widget_info': widget_info})
            child_job_states.append(state)
        return child_job_states

    def _construct_job_status_set(self, job_ids):
        job_states = self._get_all_job_states(job_ids)

        status_set = dict()
        for job_id in job_ids:
            job = None
            if job_id in self._running_jobs:
                job = self._running_jobs[job_id]['job']
            status_set[job_id] = self._construct_job_status(job, job_states.get(job_id, None))
        return status_set

    def _verify_job_parentage(self, parent_job_id, child_job_id):
        """
        Validate job relationships.
        1. Make sure parent exists, and the child id is in its list of sub jobs.
        2. If child doesn't exist, create it and add it to the list.
        If parent doesn't exist, or child isn't an actual child, raise an exception
        """
        if parent_job_id not in self._running_jobs:
            raise ValueError('Parent job id {} not found, cannot validate child job {}.'.format(parent_job_id, child_job_id))
        if child_job_id not in self._running_jobs:
            parent_job = self.get_job(parent_job_id)
            parent_state = parent_job.state()
            if child_job_id not in parent_state.get('sub_jobs', []):
                raise ValueError('Child job id {} is not a child of parent job {}'.format(child_job_id, parent_job_id))
            else:
                self._create_jobs([child_job_id])
                # inject the parent's batch app id and tag into the new child job
                child_job = self.get_job(child_job_id)
                child_job.app_id = parent_job.meta.get('batch_app')
                child_job.tag = parent_job.meta.get('batch_tag', 'release')


    def _lookup_job_status(self, job_id, parent_job_id=None):
        """
        Will raise a ValueError if job_id doesn't exist.
        Sends the status over the comm channel as the usual job_status message.
        """

        # if parent_job is real, and job_id (the child) is not, just add it to the
        # list of running jobs and work as normal.
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        job = self._running_jobs.get(job_id, {}).get('job', None)
        state = self._get_job_state(job_id)
        status = self._construct_job_status(job, state)
        self._send_comm_message('job_status', status)

    def _lookup_job_info(self, job_id, parent_job_id=None):
        """
        Will raise a ValueError if job_id doesn't exist.
        Sends the info over the comm channel as this packet:
        {
            app_id: module/name,
            app_name: random string,
            job_id: string,
            job_params: dictionary
        }
        """
        # if parent_job is real, and job_id (the child) is not, just add it to the
        # list of running jobs and work as normal.
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        job = self.get_job(job_id)
        info = {
            'app_id': job.app_id,
            'app_name': job.app_spec()['info']['name'],
            'job_id': job_id,
            'job_params': job.inputs
        }
        self._send_comm_message('job_info', info)

    def _lookup_all_job_status(self, ignore_refresh_flag=False):
        """
        Looks up status for all jobs.
        Once job info is acquired, it gets pushed to the front end over the
        'KBaseJobs' channel.
        """
        jobs_to_lookup = list()
        # grab the list of running job ids, so we don't run into update-while-iterating problems.
        for job_id in self._running_jobs.keys():
            if self._running_jobs[job_id]['refresh'] > 0 or ignore_refresh_flag:
                jobs_to_lookup.append(job_id)

        if len(jobs_to_lookup) > 0:
            status_set = self._construct_job_status_set(jobs_to_lookup)
            self._send_comm_message('job_status_all', status_set)

        return len(jobs_to_lookup)

    def _start_job_status_loop(self):
        kblogging.log_event(self._log, 'starting job status loop', {})
        if self._lookup_timer is None:
            self._lookup_job_status_loop()

    def _lookup_job_status_loop(self):
        """
        Initialize a loop that will look up job info. This uses a Timer thread on a 10
        second loop to update things.
        """

        refreshing_jobs = self._lookup_all_job_status()
        # Automatically stop when there are no more jobs requesting a refresh.
        if refreshing_jobs == 0:
            self.cancel_job_lookup_loop()
        else:
            self._lookup_timer = threading.Timer(10, self._lookup_job_status_loop)
            self._lookup_timer.start()

    def cancel_job_lookup_loop(self):
        """
        Cancels a running timer if one's still alive.
        """
        if self._lookup_timer:
            self._lookup_timer.cancel()
            self._lookup_timer = None
        self._running_lookup_loop = False

    def register_new_job(self, job):
        """
        Registers a new Job with the manager - should only be invoked when a new Job gets
        started. This stores the Job locally and pushes it over the comm channel to the
        Narrative where it gets serialized.

        Parameters:
        -----------
        job : biokbase.narrative.jobs.job.Job object
            The new Job that was started.
        """
        self._running_jobs[job.job_id] = {'job': job, 'refresh': 0}
        # push it forward! create a new_job message.
        self._lookup_job_status(job.job_id)
        self._send_comm_message('new_job', {
            'job_id': job.job_id
        })

    def get_job(self, job_id):
        """
        Returns a Job with the given job_id.
        Raises a ValueError if not found.
        """
        if job_id in self._running_jobs:
            return self._running_jobs[job_id]['job']
        else:
            raise ValueError('No job present with id {}'.format(job_id))

    def _handle_comm_message(self, msg):
        """
        Handles comm messages that come in from the other end of the KBaseJobs channel.
        All messages (of any use) should have a 'request_type' property.
        Possible types:
        * all_status
            refresh all jobs that are flagged to be looked up. Will send a
            message back with all lookup status.
        * job_status
            refresh the single job given in the 'job_id' field. Sends a message
            back with that single job's status, or an error message.
        * stop_update_loop
            stop the running refresh loop, if there's one going (might be
            one more pass, depending on the thread state)
        * start_update_loop
            reinitialize the refresh loop.
        * stop_job_update
            flag the given job id (should be an accompanying 'job_id' field) that the front
            end knows it's in a terminal state and should no longer have its status looked
            up in the refresh cycle.
        * start_job_update
            remove the flag that gets set by stop_job_update (needs an accompanying 'job_id'
            field)
        * job_info
            from the given 'job_id' field, returns some basic info about the job, including the app
            id, version, app name, and key-value pairs for inputs and parameters (in the parameters
            id namespace specified by the app spec).
        """

        if 'request_type' in msg['content']['data']:
            r_type = msg['content']['data']['request_type']
            job_id = msg['content']['data'].get('job_id', None)
            parent_job_id = msg['content']['data'].get('parent_job_id', None)
            if job_id is not None and job_id not in self._running_jobs and not parent_job_id:
                # If it's not a real job, just silently ignore the request.
                # Unless it has a parent job id - then it's a child job, and things
                # get muddled. If there are 100+ child jobs, looking them all up
                # gets tricky, so let it pass through and fail if it's not real.
                #
                # TODO: perhaps we should implement request/response here. All we
                # really need is to thread a message id through.
                self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'request_type': r_type})
                return
            elif parent_job_id is not None:
                try:
                    self._verify_job_parentage(parent_job_id, job_id)
                except ValueError:
                    self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'parent_job_id': parent_job_id, 'request_type': r_type})
                    return

            if r_type == 'all_status':
                self._lookup_all_job_status(ignore_refresh_flag=True)

            elif r_type == 'job_status':
                if job_id is not None:
                    self._lookup_job_status(job_id, parent_job_id=parent_job_id)

            elif r_type == 'job_info':
                if job_id is not None:
                    self._lookup_job_info(job_id, parent_job_id=parent_job_id)

            elif r_type == 'stop_update_loop':
                self.cancel_job_lookup_loop()

            elif r_type == 'start_update_loop':
                self._start_job_status_loop()

            elif r_type == 'stop_job_update':
                if job_id is not None:
                    if self._running_jobs[job_id]['refresh'] > 0:
                        self._running_jobs[job_id]['refresh'] -= 1

            elif r_type == 'start_job_update':
                if job_id is not None:
                    self._running_jobs[job_id]['refresh'] += 1
                    self._start_job_status_loop()

            elif r_type == 'delete_job':
                if job_id is not None:
                    try:
                        self.delete_job(job_id, parent_job_id=parent_job_id)
                    except Exception as e:
                        self._send_comm_message('job_comm_error', {'message': str(e), 'request_type': r_type, 'job_id': job_id})

            elif r_type == 'cancel_job':
                if job_id is not None:
                    try:
                        self.cancel_job(job_id, parent_job_id=parent_job_id)
                    except Exception as e:
                        self._send_comm_message('job_comm_error', {'message': str(e), 'request_type': r_type, 'job_id': job_id})

            elif r_type == 'job_logs':
                if job_id is not None:
                    first_line = msg['content']['data'].get('first_line', 0)
                    num_lines = msg['content']['data'].get('num_lines', None)
                    self._get_job_logs(job_id, parent_job_id=parent_job_id, first_line=first_line, num_lines=num_lines)
                else:
                    raise ValueError('Need a job id to fetch logs!')

            elif r_type == 'job_logs_latest':
                if job_id is not None:
                    num_lines = msg['content']['data'].get('num_lines', None)
                    try:
                        self._get_latest_job_logs(job_id, parent_job_id=parent_job_id, num_lines=num_lines)
                    except Exception as e:
                        self._send_comm_message('job_comm_error', {
                            'job_id': job_id,
                            'message': str(e),
                            'request_type': r_type})
                else:
                    raise ValueError('Need a job id to fetch logs!')

            else:
                self._send_comm_message('job_comm_error', {'message': 'Unknown message', 'request_type': r_type})
                raise ValueError('Unknown KBaseJobs message "{}"'.format(r_type))

    def _get_latest_job_logs(self, job_id, parent_job_id=None, num_lines=None):
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        job = self.get_job(job_id)
        if job is None:
            raise ValueError('job "{}" not found while fetching logs!'.format(job_id))

        (max_lines, logs) = job.log()

        first_line = 0
        if num_lines is not None and max_lines > num_lines:
            first_line = max_lines - num_lines
            logs = logs[first_line:]
        self._send_comm_message('job_logs', {
            'job_id': job_id,
            'first': first_line,
            'max_lines': max_lines,
            'lines': logs,
            'latest': True})

    def _get_job_logs(self, job_id, parent_job_id=None, first_line=0, num_lines=None):
        # if parent_job is real, and job_id (the child) is not, just add it to the
        # list of running jobs and work as normal.
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)

        job = self.get_job(job_id)
        if job is None:
            raise ValueError('job "{}" not found!'.format(job_id))

        (max_lines, log_slice) = job.log(first_line=first_line, num_lines=num_lines)
        self._send_comm_message('job_logs', {'job_id': job_id, 'first': first_line, 'max_lines': max_lines, 'lines': log_slice, 'latest': False})

    def delete_job(self, job_id, parent_job_id=None):
        """
        If the job_id doesn't exist, raises a ValueError.
        Attempts to delete a job, and cancels it first. If the job cannot be canceled,
        raises an exception. If it can be canceled but not deleted, it gets canceled, then raises
        an exception.
        """
        if job_id is None:
            raise ValueError('Job id required for deletion!')
        if not parent_job_id and job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'delete_job'})
            return

        # Cancel it first (raises if it can't be canceled), then delete it in UJS.
        self.cancel_job(job_id, parent_job_id=parent_job_id)
        clients.get('user_and_job_state').delete_job(job_id)

        if job_id in self._running_jobs:
            del self._running_jobs[job_id]
        if job_id in self._completed_job_states:
            del self._completed_job_states[job_id]
        self._send_comm_message('job_deleted', {'job_id': job_id})

    def cancel_job(self, job_id, parent_job_id=None):
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        Raises an exception if the current user doesn't have permission to cancel the job.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if not parent_job_id and job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'cancel_job'})
            return

        try:
            state = self._get_job_state(job_id, parent_job_id=parent_job_id)
            if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise ValueError('Unable to get Job state: {}'.format(e))

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        if not parent_job_id:
            is_refreshing = self._running_jobs[job_id].get('refresh', 0)
            self._running_jobs[job_id]['refresh'] = 0
            self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('job_service').cancel_job({'job_id': job_id})
        except Exception as e:
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to cancel job',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'request_type': 'cancel_job',
                'job_id': job_id
            }
            self._send_comm_message('job_comm_error', error)
            raise
        finally:
            if not parent_job_id:
                self._running_jobs[job_id]['refresh'] = is_refreshing
                del self._running_jobs[job_id]['canceling']

        # Rather than a separate message, how about triggering a job-status message:
        self._lookup_job_status(job_id, parent_job_id=parent_job_id)

    def _send_comm_message(self, msg_type, content):
        """
        Sends an ipykernel.Comm message to the KBaseJobs channel with the given
        msg_type and content. Both simply get packed into the message itself.
        """
        msg = {
            'msg_type': msg_type,
            'content': content
        }
        if self._comm is None:
            self._comm = Comm(target_name='KBaseJobs', data={})
            self._comm.on_msg(self._handle_comm_message)
        self._comm.send(msg)

    def _get_all_job_states(self, job_ids=None):
        """
        Returns the state for all running jobs
        """
        # 1. Get list of ids
        if job_ids is None:
            job_ids = self._running_jobs.keys()
        # 1.5 Go through job ids and remove ones that aren't found.
        job_ids = [j for j in job_ids if j in self._running_jobs]
        # 2. Foreach, check if in completed cache. If so, grab the status. If not, enqueue id
        # for batch lookup.
        job_states = dict()
        jobs_to_lookup = list()
        for job_id in job_ids:
            if job_id in self._completed_job_states:
                job_states[job_id] = dict(self._completed_job_states[job_id])
            else:
                jobs_to_lookup.append(job_id)
        # 3. Look up the jobs that need it, caching finished ones as we go.
        try:
            fetched_states = clients.get('job_service').check_jobs({'job_ids': jobs_to_lookup})
        except Exception as e:
            kblogging.log_event(self._log, 'get_all_job_states_error', {'err': str(e)})
            return {}

        error_states = fetched_states.get('check_error', {})
        fetched_states = fetched_states.get('job_states', {})
        for job_id in jobs_to_lookup:
            if job_id in fetched_states:
                state = fetched_states[job_id]
                state['cell_id'] = self._running_jobs[job_id]['job'].cell_id
                state['run_id'] = self._running_jobs[job_id]['job'].run_id
                if state.get('finished', 0) == 1:
                    self._completed_job_states[state['job_id']] = dict(state)
                job_states[state['job_id']] = state
            elif job_id in error_states:
                job_states[job_id] = {'lookup_error': error_states[job_id]}

        return job_states

    def _get_job_state(self, job_id, parent_job_id=None):
        if parent_job_id is not None:
            self._verify_job_parentage(parent_job_id, job_id)
        if job_id is None or job_id not in self._running_jobs:
            raise ValueError('job_id {} not found'.format(job_id))
        if job_id in self._completed_job_states:
            return dict(self._completed_job_states[job_id])
        state = self._running_jobs[job_id]['job'].state()
        if state.get('finished', 0) == 1:
            self._completed_job_states[job_id] = dict(state)
        return dict(state)
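
A minimal usage sketch for the JobManager above (assumptions: the module path
biokbase.narrative.jobs.jobmanager, a no-argument constructor, an available
kernel Comm channel, and a hypothetical job id):

from biokbase.narrative.jobs.jobmanager import JobManager

jm = JobManager()
# Render all known jobs as an HTML table (returns an IPython HTML object).
jm.list_jobs()
# Fetch a single registered Job, then cancel it; cancel_job triggers a fresh
# job_status message over the 'KBaseJobs' channel once the cancel completes.
job = jm.get_job('59a6b1c2e4b0d8f1a2b3c4d5')  # hypothetical job id
jm.cancel_job(job.job_id)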
Exemplo n.º 16
0
import urllib

import tornado.log
from tornado import web  # needed for web.HTTPError below
from traitlets.config import Application

from biokbase.auth import (
    get_user_info,
    init_session_env
)
from biokbase.narrative.common.url_config import URLS
from biokbase.narrative.common.kblogging import get_logger
# kbase_env holds the session's auth state; module path is assumed from the
# package layout used elsewhere in these examples.
from biokbase.narrative.common.kbase_env import kbase_env

HTTPError = web.HTTPError

app_log = tornado.log.app_log  # alias
if Application.initialized():
    app_log = Application.instance().log

g_log = get_logger("biokbase.narrative")
auth_cookie_name = "kbase_session"

def _init_session(request, cookies):
    client_ip = request.remote_ip
    http_headers = request.headers
    ua = http_headers.get('User-Agent', 'unknown')
    auth_cookie = cookies.get(auth_cookie_name)
    if auth_cookie is not None:
        # Python 2 urllib; under Python 3 this would be urllib.parse.unquote
        token = urllib.unquote(auth_cookie.value)
    else:
        raise web.HTTPError(status_code=401,
                            log_message='No auth cookie, denying access',
                            reason='Authorization required for Narrative access')
    if token != kbase_env.auth_token:
        init_session_env(get_user_info(token), client_ip)
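
A minimal sketch of wiring _init_session into a tornado handler (assumptions:
tornado's standard RequestHandler API, where self.cookies is a dict of
Cookie.Morsel objects keyed by cookie name; the handler class name is
hypothetical):

class NarrativeSessionHandler(web.RequestHandler):
    def prepare(self):
        # Runs before every request; _init_session raises HTTPError(401)
        # when the kbase_session cookie is missing.
        _init_session(self.request, self.cookies)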
Exemplo n.º 17
0
from unicodedata import normalize

from tornado import web
from pymongo import MongoClient
from pymongo.read_preferences import ReadPreference

from IPython.html.services.notebooks.nbmanager import NotebookManager
#from IPython.config.configurable import LoggingConfigurable
from IPython.nbformat import current
from IPython.utils.traitlets import Unicode, Dict, Bool, List, TraitError
#from IPython.utils import tz

# To log narrative itself
from biokbase.narrative.common import kblogging

g_log = kblogging.get_logger("narrative.base")



#-----------------------------------------------------------------------------
# Classes
#-----------------------------------------------------------------------------

class MongoNotebookManager(NotebookManager):

    # The MongoDB backend simply wraps the JSON notebook in a enclosing dict
    # and pushes it into MongoDB. The dict has the following fields
    # {
    #     '_id' : {mongodb UUID - we set it manually using notebook_id},
    #     'owner' : {username of the owner of this notebook},
    #     'doc_type' : (ipynb),