Пример #1
0
    def host_file(self, file_path):
        """
        Expose an intermediate :term:`Workflow` step file through the WPS output URL.

        The ``file://`` scheme (if any) is stripped and the path is resolved with :func:`os.path.realpath`.
        When the resolved path is already under the WPS output directory, it is only remapped to its URL.
        Otherwise, the file is fetched (as a link) into a new temporary directory created under the WPS output
        directory, and that directory is registered in ``self.temp_staging``.

        :param file_path: Intermediate file location (local path expected, optionally prefixed by ``file://``).
        :return: HTTP location under the WPS output URL where the file is hosted.
        """
        out_url = get_wps_output_url(self.settings)
        out_dir = get_wps_output_dir(self.settings)
        # a 'file://' prefix can be present if a CWL->WPS outputs link was made
        local_path = file_path.replace("file://", "")
        local_path = os.path.realpath(local_path)

        # already served from the WPS outputs: remap directory to URL, nothing to stage
        if local_path.startswith(out_dir):
            hosted_href = local_path.replace(out_dir, out_url, 1)
            LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", local_path, hosted_href)
            return hosted_href

        # otherwise, stage the file in a temporary location under the WPS outputs
        staging_dir = tempfile.mkdtemp(dir=out_dir)
        staged_link = fetch_file(local_path, staging_dir, self.settings, link=True)
        hosted_href = staged_link.replace(out_dir, out_url, 1)
        self.temp_staging.add(staging_dir)
        LOGGER.debug("Hosting file [%s] as [%s] on [%s]", local_path, staged_link, hosted_href)
        return hosted_href
Пример #2
0
def mocked_wps_output(
    settings,  # type: SettingsType
    mock_get=True,  # type: bool
    mock_head=True,  # type: bool
    headers_override=None,  # type: Optional[AnyHeadersContainer]
    requests_mock=None,  # type: Optional[responses.RequestsMock]
):  # type: (...) -> Union[responses.RequestsMock, MockPatch]
    """
    Mock a file server that maps the WPS output URL onto the local WPS output directory.

    Both the URL and the directory are resolved from the provided settings, and all remaining
    arguments are forwarded unchanged to :func:`mocked_file_server`.

    .. warning::
        When used in a test that also employs :func:`mocked_sub_requests`, the parameter ``local_only=True``
        must be provided to it. Moreover, the endpoint of ``weaver.wps_output_url`` must differ from
        the :class:`TestApp` URL (typically ``https://localhost``); switching ``https`` to ``http`` can be enough.
        Otherwise, this mocked response is never reached because the HTTP requests are intercepted
        beforehand by the :class:`TestApp` request.

    .. seealso::
        :func:`mocked_file_server`, of which this function is a specific use for WPS outputs.

    :param settings: Application settings from which the WPS output URL and directory are retrieved.
    :param mock_get: Whether HTTP GET requests received on the WPS output URL are mocked.
    :param mock_head: Whether HTTP HEAD requests received on the WPS output URL are mocked.
    :param headers_override: Headers to override in the produced response.
    :param requests_mock: Existing request mock instance to extend with the new definitions.
    :return: Result of :func:`mocked_file_server` for the WPS output directory and URL.
    """
    output_url = get_wps_output_url(settings)
    output_dir = get_wps_output_dir(settings)
    server_args = (
        output_dir,
        output_url,
        settings,
        mock_get,
        mock_head,
        headers_override,
        requests_mock,
    )
    return mocked_file_server(*server_args)
Пример #3
0
def dismiss_job_task(job, container):
    # type: (Job, AnySettingsContainer) -> Job
    """
    Cancels any pending or running :mod:`Celery` task and removes completed job artifacts.

    .. note::
        The :term:`Job` object itself is not deleted, only its artifacts.
        Therefore, its inputs, outputs, logs, exceptions, etc. are still available in the database,
        but corresponding files that would be exposed by ``weaver.wps_output`` configurations are removed.

    Artifact removal is best-effort: any failure to delete a file or directory is logged as a warning
    and does not interrupt the dismiss operation.

    :param job: Job to cancel or cleanup.
    :param container: Application settings.
    :return: Updated and dismissed job, as returned by the job store.
    """
    # NOTE(review): presumably raises if the job was already dismissed - confirm against 'raise_job_dismissed'
    raise_job_dismissed(job, container)
    if job.status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]:
        # signal to stop celery task. Up to it to terminate remote if any.
        LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id)
        celery_app.control.revoke(job.task_id, terminate=True)

    wps_out_dir = get_wps_output_dir(container)
    job_out_dir = os.path.join(wps_out_dir, str(job.id))
    job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log")
    job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml")

    if os.path.isdir(job_out_dir):
        LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id)
        # 'onerror' receives (function, path, exc_info): report the entry that actually failed
        shutil.rmtree(
            job_out_dir,
            onerror=lambda _func, path, _exc: LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, path, _exc
            )
        )

    # log and WPS status files live next to the job output directory and are removed the same way
    for job_out_file, description in [(job_out_log, "logs"), (job_out_xml, "WPS status")]:
        if not os.path.isfile(job_out_file):
            continue
        LOGGER.debug("Job [%s] dismiss operation: Removing output %s.", job.id, description)
        try:
            os.remove(job_out_file)
        except OSError as exc:
            LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, job_out_file, exc
            )

    LOGGER.debug("Job [%s] dismiss operation: Updating job status.", job.id)
    store = get_db(container).get_store(StoreJobs)
    job.status_message = f"Job {Status.DISMISSED}."
    job.status = map_status(Status.DISMISSED)
    job = store.update_job(job)
    return job
Пример #4
0
def dismiss_job_task(job, container):
    # type: (Job, AnySettingsContainer) -> Job
    """
    Cancels any pending or running :mod:`Celery` task and removes completed job artifacts.

    The job output directory, log file and WPS status XML file located under the WPS output directory
    are deleted when they exist. Removal is best-effort: any failure to delete a file or directory is
    logged as a warning and does not interrupt the dismiss operation. The job status is then set
    to dismissed and saved in the job store.

    :param job: Job to cancel or cleanup.
    :param container: Application settings container used to resolve the WPS output directory and database.
    :return: Updated and dismissed job, as returned by the job store.
    """
    # NOTE(review): presumably raises if the job was already dismissed - confirm against 'raise_job_dismissed'
    raise_job_dismissed(job, container)
    if job.status in status.JOB_STATUS_CATEGORIES[status.JOB_STATUS_CATEGORY_RUNNING]:
        # signal to stop celery task. Up to it to terminate remote if any.
        LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id)
        celery_app.control.revoke(job.task_id, terminate=True)

    wps_out_dir = get_wps_output_dir(container)
    job_out_dir = os.path.join(wps_out_dir, str(job.id))
    job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log")
    job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml")

    if os.path.isdir(job_out_dir):
        LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id)
        # 'onerror' receives (function, path, exc_info): report the entry that actually failed
        shutil.rmtree(
            job_out_dir,
            onerror=lambda _func, path, _exc: LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, path, _exc
            )
        )

    # log and WPS status files live next to the job output directory and are removed the same way
    for job_out_file, description in [(job_out_log, "logs"), (job_out_xml, "WPS status")]:
        if not os.path.isfile(job_out_file):
            continue
        LOGGER.debug("Job [%s] dismiss operation: Removing output %s.", job.id, description)
        try:
            os.remove(job_out_file)
        except OSError as exc:
            LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, job_out_file, exc
            )

    LOGGER.debug("Job [%s] dismiss operation: Updating job status.", job.id)
    store = get_db(container).get_store(StoreJobs)
    job.status_message = "Job {}.".format(status.STATUS_DISMISSED)
    job.status = status.map_status(status.STATUS_DISMISSED)
    job = store.update_job(job)
    return job
Пример #5
0
    def host_file(file_name):
        """
        Map a local file located under the WPS output directory to its hosted URL.

        :param file_name: Local file path, optionally prefixed by ``file://``.
        :return: Location of the file under the WPS output URL.
        :raises Exception: When the file is not located under the WPS output directory.
        """
        settings = get_settings(app)
        weaver_output_url = get_wps_output_url(settings)
        weaver_output_dir = get_wps_output_dir(settings)
        file_name = file_name.replace("file://", "")

        if not file_name.startswith(weaver_output_dir):
            raise Exception(
                "Cannot host files outside of the output path : {0}".format(
                    file_name))
        # only substitute the leading directory prefix, in case the same
        # directory string occurs again deeper in the path
        return file_name.replace(weaver_output_dir, weaver_output_url, 1)
Пример #6
0
    def execute(self, identifier, wps_request, uuid):
        # type: (str, Union[WPSRequest, WorkerRequest], str) -> Union[WPSResponse, HTTPValid]
        """
        Handles the ``Execute`` KVP/XML request submitted on the WPS endpoint.

        Submit WPS request to corresponding WPS-REST endpoint and convert back for requested ``Accept`` content-type.

        Overrides the original execute operation, that will instead be handled by :meth:`execute_job` following
        callback from Celery Worker, which handles process job creation and monitoring.

        If ``Accept`` is JSON, the result is directly returned from :meth:`_submit_job`.
        If ``Accept`` is XML or undefined, :class:`WorkerExecuteResponse` converts the received JSON with XML template.

        :param identifier: Process identifier (not used directly, ``wps_request.identifier`` is employed instead).
        :param wps_request: WPS request to submit as job.
        :param uuid: Request identifier (not used by this implementation).
        :return:
            Response returned as is by :meth:`_submit_job` when it is not a JSON mapping,
            or the XML execute response built from the submitted job details otherwise.
        :raises OWSNoApplicableCode: When the XML response could not be generated from the job submission result.
        """
        result = self._submit_job(wps_request)
        if not isinstance(result, dict):
            return result  # pre-built HTTP response with JSON contents when requested

        # otherwise, recreate the equivalent content with expected XML template format
        job_id = result["jobID"]
        wps_process = self.processes.get(wps_request.identifier)

        # because we are building the XML response (and JSON not explicitly requested)
        # caller is probably a WPS-1 client also expecting a status XML file
        # remap the status location accordingly from the current REST endpoint
        job_url = result["location"]
        if urlparse(job_url).path.endswith(f"/jobs/{job_id}"):
            # file status does not exist yet since client calling this method is waiting for it
            # pywps will generate it once the WorkerExecuteResponse is returned
            status_path = get_wps_local_status_location(job_url, self.settings, must_exist=False)
            wps_dir = get_wps_output_dir(self.settings)
            wps_url = get_wps_output_url(self.settings)
            job_url = status_path.replace(wps_dir, wps_url, 1)

        # when called by the WSGI app, 'WorkerExecuteResponse.__call__' will generate the XML from 'doc' property,
        # which itself is generated by template substitution of data from above 'json' property
        try:
            return WorkerExecuteResponse(wps_request, job_id, wps_process, job_url, settings=self.settings)
        except Exception as ex:  # noqa
            LOGGER.exception(
                "Error building XML response by PyWPS Service during WPS Execute result from worker."
            )
            message = f"Failed building XML response from WPS Execute result. Error [{ex!r}]"
            # chain explicitly so the original failure is kept as the cause of the OWS error
            raise OWSNoApplicableCode(message, locator=job_id) from ex
Пример #7
0
def make_result_link(result_id, result, job_id, settings):
    # type: (str, Union[ExecutionResultObject, ExecutionResultArray], AnyUUID, SettingsType) -> List[str]
    """
    Convert a result definition as ``value`` into the corresponding ``reference`` for output transmission.

    When ``result`` is a list, one link is produced per item and each is identified by a ``.<index>`` suffix.
    Items that are not already an ``href`` reference are written to a plain text file under the job
    sub-directory of the WPS output directory, and the link refers to that file under the WPS output URL.

    .. seealso::
        :rfc:`8288`: HTTP ``Link`` header specification.

    :param result_id: Identifier of the result, employed as ``rel`` of the link and as generated file name.
    :param result: Single result definition or list of result definitions.
    :param job_id: Identifier of the job, employed as sub-directory of the WPS output location.
    :param settings: Application settings to resolve the WPS output URL and directory.
    :return: List of ``Link`` header values, one per result item.
    """
    is_array = isinstance(result, list)
    values = result if is_array else [result]
    wps_url = get_wps_output_url(settings).strip("/")
    links = []
    for idx, value in enumerate(values):
        suffix = f".{idx}" if is_array else ""
        # inspect the current item, not the whole 'result' container which could be a list
        key = get_any_value(value, key=True)
        if key != "href":
            # literal data to be converted to link
            # plain text file must be created containing the raw literal data
            typ = ContentType.TEXT_PLAIN  # as per '/rec/core/process-execute-sync-document-ref'
            enc = "UTF-8"
            out = get_wps_output_dir(settings)
            val = get_any_value(value, data=True, file=False)
            # job ID can be an UUID object, which is not accepted directly as path component
            loc = os.path.join(str(job_id), result_id + suffix + ".txt")
            url = f"{wps_url}/{loc}"
            path = os.path.join(out, loc)
            with open(path, mode="w", encoding=enc) as out_file:
                out_file.write(val)
        else:
            fmt = get_field(value, "format", default={"mediaType": ContentType.TEXT_PLAIN})
            typ = get_field(fmt, "mime_type", search_variations=True, default=ContentType.TEXT_PLAIN)
            enc = get_field(fmt, "encoding", search_variations=True, default=None)
            url = get_any_value(value, data=False, file=True)  # should already include full path
            # compare the resolved media-type (not the format mapping) to apply the default charset
            if typ == ContentType.TEXT_PLAIN and not enc:  # only if text, otherwise binary content could differ
                enc = "UTF-8"  # default both omit/empty
        encoding = f"; charset={enc}" if enc else ""
        links.append(f"<{url}>; rel=\"{result_id}{suffix}\"; type={typ}{encoding}")
    return links
Пример #8
0
    def job(
        self,
        joborder,  # type: Dict[Text, AnyValue]
        output_callbacks,  # type: Callable[[Any, Any], Any]
        runtime_context,  # type: RuntimeContext
    ):  # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
        """
        Workflow job generator.

        Creates a temporary output directory under the WPS output directory, initializes the job builder,
        and yields a single :class:`WpsWorkflowJob` configured from the tool definition.

        :param joborder: inputs of the job submission
        :param output_callbacks: method to fetch step outputs and corresponding step details
        :param runtime_context: configs about execution environment (its ``outdir`` is overridden by this method)
        :return: generator that yields the configured workflow job exactly once
        :raises Exception:
            when a ``TimeLimit`` requirement evaluates to something else than an integer greater or equal to zero
            (NOTE(review): raised within a ``SourceLine`` context, which presumably converts it
            to ``validate.ValidationException`` - confirm)
        """
        # requirement names are prefixed by the cwltool namespace only for CWL v1.0 documents
        require_prefix = ""
        if self.metadata["cwlVersion"] == "v1.0":
            require_prefix = "http://commonwl.org/cwltool#"

        jobname = uniquename(runtime_context.name
                             or shortname(self.tool.get("id", "job")))

        # outdir must be served by the EMS because downstream step will need access to upstream steps output
        weaver_out_dir = get_wps_output_dir(get_settings(app))
        runtime_context.outdir = tempfile.mkdtemp(prefix=getdefault(
            runtime_context.tmp_outdir_prefix, DEFAULT_TMP_PREFIX),
                                                  dir=weaver_out_dir)
        # builder is initialized after 'outdir' was overridden in the runtime context above
        builder = self._init_job(joborder, runtime_context)

        # `jobname` is the step name and `joborder` is the actual step inputs
        wps_workflow_job = WpsWorkflowJob(
            builder, builder.job, self.requirements, self.hints, jobname,
            self.get_job_process_definition(jobname, joborder, self.tool),
            self.tool["outputs"])
        wps_workflow_job.prov_obj = self.prov_obj
        # exit code categories are taken from the tool definition ('None' when not defined)
        wps_workflow_job.successCodes = self.tool.get("successCodes")
        wps_workflow_job.temporaryFailCodes = self.tool.get(
            "temporaryFailCodes")
        wps_workflow_job.permanentFailCodes = self.tool.get(
            "permanentFailCodes")

        # TODO Taken from command_line_tool.py maybe this could let us use the revmap if required at all
        # reffiles = copy.deepcopy(builder.files)
        # builder.pathmapper = self.make_path_mapper(
        #     reffiles, builder.stagedir, runtimeContext, True)
        # builder.requirements = wps_workflow_job.requirements

        # the job uses the same working directories as the builder
        wps_workflow_job.outdir = builder.outdir
        wps_workflow_job.tmpdir = builder.tmpdir
        wps_workflow_job.stagedir = builder.stagedir

        readers = {}  # type: Dict[Text, Any]
        # only the first item returned for the requirement is employed
        timelimit = self.get_requirement(require_prefix + "TimeLimit")[0]
        if timelimit:
            with SourceLine(timelimit, "timelimit",
                            validate.ValidationException):
                # the time limit is evaluated by the builder, then validated as non-negative integer
                wps_workflow_job.timelimit = builder.do_eval(
                    timelimit["timelimit"])
                if not isinstance(wps_workflow_job.timelimit,
                                  int) or wps_workflow_job.timelimit < 0:
                    raise Exception(
                        "timelimit must be an integer >= 0, got: %s" %
                        wps_workflow_job.timelimit)

        # pre-bind the output collection with the tool outputs and builder of this job
        wps_workflow_job.collect_outputs = partial(
            self.collect_output_ports,
            self.tool["outputs"],
            builder,
            compute_checksum=getdefault(runtime_context.compute_checksum,
                                        True),
            jobname=jobname,
            readers=readers)
        wps_workflow_job.output_callback = output_callbacks

        yield wps_workflow_job
Пример #9
0
def collect_statistics(process, settings=None, job=None, rss_start=None):
    # type: (Optional[psutil.Process], Optional[SettingsType], Optional[Job], Optional[int]) -> Optional[Statistics]
    """
    Collect any available execution statistics and store them in the :term:`Job` if provided.

    The statistics can include:

    - ``application``: memory usage parsed from the first job log line reported by ``cwltool``.
    - ``process``: memory (RSS, USS, VMS), threads, CPU and handles of the provided process,
      the memory difference against ``rss_start``, and the total size of the job output files.
    - ``outputs``: size of each job result that resolves to an existing local file.

    Any error raised during the collection is logged as a warning and ignored.

    :param process: Process from which to retrieve resource usage, if any.
    :param settings: Application settings, employed to resolve the WPS output directory of job results.
    :param job: Job from which to parse logs and results, and into which collected statistics are stored.
    :param rss_start: RSS memory measured before execution, to report only the memory used by the execution.
    :return: Collected statistics, or ``None`` if nothing could be collected or an error occurred.
    """
    try:
        mem_used = None
        if job:
            # only the first matching log line is considered
            mem_line = next(
                (line for line in job.logs if "cwltool" in line and "memory used" in line),
                None
            )
            if mem_line:
                mem_info = mem_line.split(":")[-1].strip()
                mem_used = parse_number_with_unit(mem_info, binary=True)

        stats = {}  # type: JSON
        if mem_used:
            stats["application"] = {
                # see: 'cwltool.job.JobBase.process_monitor', reported memory in logs uses 'rss'
                "usedMemory": apply_number_with_unit(mem_used, binary=True),
                "usedMemoryBytes": mem_used,
            }

        rss = None
        if process:
            proc_info = process.memory_full_info()
            rss = getattr(proc_info, "rss", 0)
            uss = getattr(proc_info, "uss", 0)
            vms = getattr(proc_info, "vms", 0)
            stats["process"] = {
                "rss": apply_number_with_unit(rss, binary=True),
                "rssBytes": rss,
                "uss": apply_number_with_unit(uss, binary=True),
                "ussBytes": uss,
                "vms": apply_number_with_unit(vms, binary=True),
                "vmsBytes": vms,
            }
            # methods that are not available on the process object are reported as zero
            fields = [("usedThreads", "num_threads"), ("usedCPU", "cpu_num"), ("usedHandles", "num_handles")]
            for field, method in fields:
                func = getattr(process, method, None)
                stats["process"][field] = func() if func is not None else 0

        if rss_start and rss:
            # diff of RSS between start/end to consider only execution of the job steps
            # this more accurately reports used memory by the execution itself, omitting celery worker's base memory
            rss_diff = rss - rss_start
            stats["process"]["usedMemory"] = apply_number_with_unit(rss_diff, binary=True)
            stats["process"]["usedMemoryBytes"] = rss_diff

        if job:
            total_size = 0
            stats["outputs"] = {}
            for result in job.results:
                res_ref = get_any_value(result, file=True)
                if not (res_ref and isinstance(res_ref, str)):
                    continue
                if res_ref.startswith(f"/{job.id}"):  # pseudo-relative reference
                    out_dir = get_wps_output_dir(settings)
                    res_ref = os.path.join(out_dir, res_ref[1:])
                if os.path.isfile(res_ref):
                    res_stat = os.stat(res_ref)
                    res_id = get_any_id(result)
                    res_size = res_stat.st_size
                    stats["outputs"][res_id] = {
                        "size": apply_number_with_unit(res_size, binary=True),
                        "sizeBytes": res_size,
                    }
                    total_size += res_size
            # 'process' section does not exist yet when no process was provided,
            # create it as needed to avoid losing every collected statistic on 'KeyError'
            proc_stats = stats.setdefault("process", {})
            proc_stats["totalSize"] = apply_number_with_unit(total_size, binary=True)
            proc_stats["totalSizeBytes"] = total_size

        if stats and job:
            job.statistics = stats
        return stats or None
    except Exception as exc:  # pragma: no cover
        LOGGER.warning(
            "Ignoring error that occurred during statistics collection [%s]",
            str(exc),
            exc_info=exc)
        return None