def host_file(self, file_path):
    """
    Expose an intermediate :term:`Workflow` step file over HTTP for processes needing external or remote access.

    When the resolved file path already starts with the WPS output directory, it is only remapped
    to the corresponding WPS output URL. Otherwise, the file is fetched (as a link) into a new temporary
    directory created under the WPS output directory, and that directory is added to ``self.temp_staging``.

    :param file_path: Intermediate file location (local path expected).
    :return: Hosted temporary HTTP file location.
    """
    output_url = get_wps_output_url(self.settings)
    output_dir = get_wps_output_dir(self.settings)

    # in case CWL->WPS outputs link was made
    local_path = os.path.realpath(file_path.replace("file://", ""))

    if not local_path.startswith(output_dir):
        # file lives elsewhere: stage it under the WPS outputs so it becomes reachable by URL
        staging_dir = tempfile.mkdtemp(dir=output_dir)
        hosted_path = fetch_file(local_path, staging_dir, self.settings, link=True)
        hosted_href = hosted_path.replace(output_dir, output_url, 1)
        self.temp_staging.add(staging_dir)
        LOGGER.debug("Hosting file [%s] as [%s] on [%s]", local_path, hosted_path, hosted_href)
        return hosted_href

    hosted_href = local_path.replace(output_dir, output_url, 1)
    LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", local_path, hosted_href)
    return hosted_href
def mocked_wps_output(
    settings,               # type: SettingsType
    mock_get=True,          # type: bool
    mock_head=True,         # type: bool
    headers_override=None,  # type: Optional[AnyHeadersContainer]
    requests_mock=None,     # type: Optional[responses.RequestsMock]
):
    # type: (...) -> Union[responses.RequestsMock, MockPatch]
    """
    Mock a file server that maps the HTTP WPS output URL onto files located in the WPS output directory.

    Both the URL and the directory are resolved from the provided settings, and then delegated
    to :func:`mocked_file_server`.

    .. warning::
        If the test also employs :func:`mocked_sub_requests`, it must be called with ``local_only=True``.
        Additionally, the endpoint defined by ``weaver.wps_output_url`` must differ from
        the :class:`TestApp` URL (typically ``https://localhost``). Switching ``https`` for ``http`` can
        be enough. Otherwise, the :class:`TestApp` request mock intercepts the HTTP requests first
        and this mocked response is never reached.

    .. seealso::
        :func:`mocked_file_server`, of which this function is the specific case that auto-maps
        the endpoint/directory of WPS outputs.

    :param settings: Application settings to retrieve WPS output configuration.
    :param mock_get: Whether to mock HTTP GET methods received on WPS output URL.
    :param mock_head: Whether to mock HTTP HEAD methods received on WPS output URL.
    :param headers_override: Override specified headers in produced response.
    :param requests_mock: Previously defined request mock instance to extend with new definitions.
    :return: Mocked response that would normally be obtained by a file server hosting WPS output directory.
    """
    output_url = get_wps_output_url(settings)
    output_dir = get_wps_output_dir(settings)
    file_server = mocked_file_server(
        output_dir,
        output_url,
        settings,
        mock_get,
        mock_head,
        headers_override,
        requests_mock,
    )
    return file_server
def dismiss_job_task(job, container):
    # type: (Job, AnySettingsContainer) -> Job
    """
    Cancels any pending or running :mod:`Celery` task and removes completed job artifacts.

    .. note::
        The :term:`Job` object itself is not deleted, only its artifacts.
        Therefore, its inputs, outputs, logs, exceptions, etc. are still available in the database,
        but corresponding files that would be exposed by ``weaver.wps_output`` configurations are removed.

    Failures to delete individual artifacts are logged as warnings and do not interrupt the operation.

    :param job: Job to cancel or cleanup.
    :param container: Application settings.
    :return: Updated and dismissed job.
    """
    # NOTE(review): presumably raises if the job was already dismissed - confirm against 'raise_job_dismissed'
    raise_job_dismissed(job, container)
    if job.status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]:
        # signal to stop celery task. Up to it to terminate remote if any.
        LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id)
        celery_app.control.revoke(job.task_id, terminate=True)

    wps_out_dir = get_wps_output_dir(container)
    job_out_dir = os.path.join(wps_out_dir, str(job.id))
    job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log")
    job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml")

    if os.path.isdir(job_out_dir):
        LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id)
        shutil.rmtree(
            job_out_dir,
            onerror=lambda func, path, _exc: LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, job_out_dir, _exc
            )
        )

    # single files stored next to the job output directory, removed on a best-effort basis
    removable_files = (
        (job_out_log, "Job [%s] dismiss operation: Removing output logs."),
        (job_out_xml, "Job [%s] dismiss operation: Removing output WPS status."),
    )
    for file_path, debug_message in removable_files:
        if not os.path.isfile(file_path):
            continue
        LOGGER.debug(debug_message, job.id)
        try:
            os.remove(file_path)
        except OSError as exc:
            LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, file_path, exc
            )

    # the job identifier must be supplied, otherwise the '%s' placeholder is logged literally
    LOGGER.debug("Job [%s] dismiss operation: Updating job status.", job.id)
    store = get_db(container).get_store(StoreJobs)
    job.status_message = f"Job {Status.DISMISSED}."
    job.status = map_status(Status.DISMISSED)
    job = store.update_job(job)
    return job
def dismiss_job_task(job, container):
    # type: (Job, AnySettingsContainer) -> Job
    """
    Cancels any pending or running :mod:`Celery` task and removes completed job artifacts.

    The removed artifacts are the job output directory, its ``.log`` file and its ``.xml`` status file,
    all located under the WPS output directory. Failures to delete individual artifacts are logged
    as warnings and do not interrupt the operation.

    :param job: job to cancel or cleanup.
    :param container: container from which the WPS output directory and the database are resolved.
    :return: job updated with the dismissed status, as returned by the job store.
    """
    # NOTE(review): presumably raises if the job was already dismissed - confirm against 'raise_job_dismissed'
    raise_job_dismissed(job, container)
    if job.status in status.JOB_STATUS_CATEGORIES[status.JOB_STATUS_CATEGORY_RUNNING]:
        # signal to stop celery task. Up to it to terminate remote if any.
        LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id)
        celery_app.control.revoke(job.task_id, terminate=True)

    wps_out_dir = get_wps_output_dir(container)
    job_out_dir = os.path.join(wps_out_dir, str(job.id))
    job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log")
    job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml")

    if os.path.isdir(job_out_dir):
        LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id)
        shutil.rmtree(
            job_out_dir,
            onerror=lambda func, path, _exc: LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, job_out_dir, _exc
            )
        )

    # single files stored next to the job output directory, removed on a best-effort basis
    removable_files = (
        (job_out_log, "Job [%s] dismiss operation: Removing output logs."),
        (job_out_xml, "Job [%s] dismiss operation: Removing output WPS status."),
    )
    for file_path, debug_message in removable_files:
        if not os.path.isfile(file_path):
            continue
        LOGGER.debug(debug_message, job.id)
        try:
            os.remove(file_path)
        except OSError as exc:
            LOGGER.warning(
                "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]",
                job.id, file_path, exc
            )

    # the job identifier must be supplied, otherwise the '%s' placeholder is logged literally
    LOGGER.debug("Job [%s] dismiss operation: Updating job status.", job.id)
    store = get_db(container).get_store(StoreJobs)
    job.status_message = "Job {}.".format(status.STATUS_DISMISSED)
    job.status = status.map_status(status.STATUS_DISMISSED)
    job = store.update_job(job)
    return job
def host_file(file_name):
    """
    Map a local file located under the WPS output directory to its location on the WPS output URL.

    :param file_name: Local file path, optionally containing the ``file://`` scheme which gets removed.
    :return: File location where the WPS output directory prefix is substituted by the WPS output URL.
    :raises Exception: if the file path does not start with the WPS output directory.
    """
    settings = get_settings(app)
    weaver_output_url = get_wps_output_url(settings)
    weaver_output_dir = get_wps_output_dir(settings)
    file_name = file_name.replace("file://", "")
    if not file_name.startswith(weaver_output_dir):
        raise Exception("Cannot host files outside of the output path : {0}".format(file_name))
    # substitute only the leading directory (guaranteed by the check above),
    # otherwise a nested path segment equal to the output directory would also be rewritten
    return file_name.replace(weaver_output_dir, weaver_output_url, 1)
def execute(self, identifier, wps_request, uuid):
    # type: (str, Union[WPSRequest, WorkerRequest], str) -> Union[WPSResponse, HTTPValid]
    """
    Process the ``Execute`` KVP/XML request received on the WPS endpoint.

    The request is submitted through :meth:`_submit_job`. Whenever that call yields anything other than
    a JSON-like mapping, the value is returned unchanged since it is an already built HTTP response with
    JSON contents. Otherwise, the job details are converted to the XML representation by means
    of :class:`WorkerExecuteResponse`.

    This replaces the original execute operation, which is instead handled by :meth:`execute_job`
    following callback from the Celery Worker that handles process job creation and monitoring.

    :raises OWSNoApplicableCode: if the XML response could not be built from the job submission result.
    """
    submitted = self._submit_job(wps_request)
    if not isinstance(submitted, dict):
        # pre-built HTTP response with JSON contents when requested
        return submitted

    # otherwise, recreate the equivalent content with expected XML template format
    job_id = submitted["jobID"]
    wps_process = self.processes.get(wps_request.identifier)

    # Building the XML response (JSON not explicitly requested) implies the caller is probably
    # a WPS-1 client that also expects a status XML file.
    # Remap the status location accordingly from the current REST endpoint.
    job_url = submitted["location"]
    job_url_path = urlparse(job_url).path
    if job_url_path.endswith(f"/jobs/{job_id}"):
        # The status file does not exist yet since the client calling this method is waiting for it.
        # PyWPS will generate it once the WorkerExecuteResponse is returned.
        status_path = get_wps_local_status_location(job_url, self.settings, must_exist=False)
        wps_dir = get_wps_output_dir(self.settings)
        wps_url = get_wps_output_url(self.settings)
        job_url = status_path.replace(wps_dir, wps_url, 1)

    # when called by the WSGI app, 'WorkerExecuteResponse.__call__' will generate the XML from 'doc' property,
    # which itself is generated by template substitution of data from above 'json' property
    try:
        xml_response = WorkerExecuteResponse(wps_request, job_id, wps_process, job_url, settings=self.settings)
    except Exception as ex:  # noqa
        LOGGER.exception("Error building XML response by PyWPS Service during WPS Execute result from worker.")
        message = f"Failed building XML response from WPS Execute result. Error [{ex!r}]"
        raise OWSNoApplicableCode(message, locator=job_id)
    else:
        return xml_response
def make_result_link(result_id, result, job_id, settings):
    # type: (str, Union[ExecutionResultObject, ExecutionResultArray], AnyUUID, SettingsType) -> List[str]
    """
    Convert a result definition as ``value`` into the corresponding ``reference`` for output transmission.

    Each result item that is not already an ``href`` reference is written as a plain text file
    under the job's location in the WPS output directory, and a link to that file is generated.
    Items that are already references are linked as is, using the media-type and encoding of their format.
    For an array of results, each link relation and generated file name is suffixed by the item index.

    .. seealso::
        :rfc:`8288`: HTTP ``Link`` header specification.

    :param result_id: Identifier of the result, employed as link relation and generated file name.
    :param result: Single result definition, or list of result definitions.
    :param job_id: Identifier of the job, employed as sub-directory of the WPS outputs.
    :param settings: Application settings to resolve the WPS output URL and directory.
    :return: HTTP ``Link`` header values, one per result item.
    """
    is_array = isinstance(result, list)
    values = result if is_array else [result]
    suffixes = [f".{idx}" for idx in range(len(values))] if is_array else [""]
    wps_url = get_wps_output_url(settings).strip("/")
    links = []
    for suffix, value in zip(suffixes, values):
        # inspect the individual item, since 'result' could be the whole array
        key = get_any_value(value, key=True)
        if key != "href":
            # literal data to be converted to link
            # plain text file must be created containing the raw literal data
            typ = ContentType.TEXT_PLAIN  # as per '/rec/core/process-execute-sync-document-ref'
            enc = "UTF-8"
            out = get_wps_output_dir(settings)
            val = get_any_value(value, data=True, file=False)
            # explicit string conversion since path joining does not accept UUID objects
            loc = os.path.join(str(job_id), result_id + suffix + ".txt")
            url = f"{wps_url}/{loc}"
            path = os.path.join(out, loc)
            with open(path, mode="w", encoding=enc) as out_file:
                out_file.write(val)
        else:
            fmt = get_field(value, "format", default={"mediaType": ContentType.TEXT_PLAIN})
            typ = get_field(fmt, "mime_type", search_variations=True, default=ContentType.TEXT_PLAIN)
            enc = get_field(fmt, "encoding", search_variations=True, default=None)
            url = get_any_value(value, data=False, file=True)  # should already include full path
            # compare the resolved media-type, not the format mapping which never equals a string
            if typ == ContentType.TEXT_PLAIN and not enc:  # only if text, otherwise binary content could differ
                enc = "UTF-8"  # default both omit/empty
        encoding = f"; charset={enc}" if enc else ""
        links.append(f"<{url}>; rel=\"{result_id}{suffix}\"; type={typ}{encoding}")
    return links
def job(
    self,
    joborder,           # type: Dict[Text, AnyValue]
    output_callbacks,   # type: Callable[[Any, Any], Any]
    runtime_context,    # type: RuntimeContext
):
    # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
    """
    Generate the job of a workflow step.

    The output directory of the runtime context is replaced by a new temporary directory created
    under the WPS output directory before the job builder is initialized.

    :param joborder: inputs of the job submission
    :param output_callbacks: method to fetch step outputs and corresponding step details
    :param runtime_context: configs about execution environment
    :return: generator that yields the single prepared :class:`WpsWorkflowJob`.
    """
    uses_cwl_v1_0 = self.metadata["cwlVersion"] == "v1.0"
    require_prefix = "http://commonwl.org/cwltool#" if uses_cwl_v1_0 else ""

    step_label = runtime_context.name or shortname(self.tool.get("id", "job"))
    jobname = uniquename(step_label)

    # outdir must be served by the EMS because downstream step will need access to upstream steps output
    served_dir = get_wps_output_dir(get_settings(app))
    outdir_prefix = getdefault(runtime_context.tmp_outdir_prefix, DEFAULT_TMP_PREFIX)
    runtime_context.outdir = tempfile.mkdtemp(prefix=outdir_prefix, dir=served_dir)
    builder = self._init_job(joborder, runtime_context)

    # `jobname` is the step name and `joborder` is the actual step inputs
    step_job = WpsWorkflowJob(
        builder,
        builder.job,
        self.requirements,
        self.hints,
        jobname,
        self.get_job_process_definition(jobname, joborder, self.tool),
        self.tool["outputs"],
    )
    step_job.prov_obj = self.prov_obj
    # exit code categories are forwarded as defined by the tool (undefined ones resolve to None)
    for codes_field in ("successCodes", "temporaryFailCodes", "permanentFailCodes"):
        setattr(step_job, codes_field, self.tool.get(codes_field))

    # TODO Taken from command_line_tool.py maybe this could let us use the revmap if required at all
    # reffiles = copy.deepcopy(builder.files)
    # builder.pathmapper = self.make_path_mapper(
    #     reffiles, builder.stagedir, runtimeContext, True)
    # builder.requirements = wps_workflow_job.requirements

    # working directories of the job are the ones prepared by the builder
    for dir_field in ("outdir", "tmpdir", "stagedir"):
        setattr(step_job, dir_field, getattr(builder, dir_field))

    readers = {}  # type: Dict[Text, Any]

    timelimit = self.get_requirement(require_prefix + "TimeLimit")[0]
    if timelimit:
        with SourceLine(timelimit, "timelimit", validate.ValidationException):
            step_job.timelimit = builder.do_eval(timelimit["timelimit"])
            evaluated_limit = step_job.timelimit
            if not (isinstance(evaluated_limit, int) and evaluated_limit >= 0):
                raise Exception("timelimit must be an integer >= 0, got: %s" % step_job.timelimit)

    step_job.collect_outputs = partial(
        self.collect_output_ports,
        self.tool["outputs"],
        builder,
        compute_checksum=getdefault(runtime_context.compute_checksum, True),
        jobname=jobname,
        readers=readers,
    )
    step_job.output_callback = output_callbacks

    yield step_job
def collect_statistics(process, settings=None, job=None, rss_start=None):
    # type: (Optional[psutil.Process], Optional[SettingsType], Optional[Job], Optional[int]) -> Optional[Statistics]
    """
    Collect any available execution statistics and store them in the :term:`Job` if provided.

    Statistics are gathered from up to three sources:

    - ``application``: memory usage reported by ``cwltool`` in the job logs.
    - ``process``: memory, threads, CPU and handles details of the provided process.
    - ``outputs``: size of each job result file found on disk, and their total size.

    Any error raised during collection is logged as warning and ignored.

    :param process: Process from which to retrieve memory and resource usage, if any.
    :param settings: Application settings to resolve the WPS output directory of pseudo-relative result references.
    :param job: Job from which logs and results are inspected, and where collected statistics are stored.
    :param rss_start: RSS memory (bytes) at the start of the execution, to report only the memory used since then.
    :return: Collected statistics, or ``None`` if nothing could be collected or an error occurred.
    """
    try:
        mem_used = None
        if job:
            # first log entry where 'cwltool' reported its memory usage, if any
            mem_line = next(
                (line for line in job.logs if "cwltool" in line and "memory used" in line),
                None
            )
            if mem_line:
                mem_info = mem_line.split(":")[-1].strip()
                mem_used = parse_number_with_unit(mem_info, binary=True)

        stats = {}  # type: JSON
        if mem_used:
            stats["application"] = {
                # see: 'cwltool.job.JobBase.process_monitor', reported memory in logs uses 'rss'
                "usedMemory": apply_number_with_unit(mem_used, binary=True),
                "usedMemoryBytes": mem_used,
            }

        rss = None
        if process:
            proc_info = process.memory_full_info()
            rss = getattr(proc_info, "rss", 0)
            uss = getattr(proc_info, "uss", 0)
            vms = getattr(proc_info, "vms", 0)
            stats["process"] = {
                "rss": apply_number_with_unit(rss, binary=True),
                "rssBytes": rss,
                "uss": apply_number_with_unit(uss, binary=True),
                "ussBytes": uss,
                "vms": apply_number_with_unit(vms, binary=True),
                "vmsBytes": vms,
            }
            fields = [("usedThreads", "num_threads"), ("usedCPU", "cpu_num"), ("usedHandles", "num_handles")]
            for field, method in fields:
                # methods missing on the process object are reported as zero
                func = getattr(process, method, None)
                stats["process"][field] = func() if func is not None else 0

        if rss_start and rss:
            # diff of RSS between start/end to consider only execution of the job steps
            # this more accurately reports used memory by the execution itself, omitting celery worker's base memory
            rss_diff = rss - rss_start
            stats["process"]["usedMemory"] = apply_number_with_unit(rss_diff, binary=True)
            stats["process"]["usedMemoryBytes"] = rss_diff

        if job:
            total_size = 0
            stats["outputs"] = {}
            for result in job.results:
                res_ref = get_any_value(result, file=True)
                if res_ref and isinstance(res_ref, str):
                    if res_ref.startswith(f"/{job.id}"):  # pseudo-relative reference
                        out_dir = get_wps_output_dir(settings)
                        res_ref = os.path.join(out_dir, res_ref[1:])
                    if os.path.isfile(res_ref):
                        res_stat = os.stat(res_ref)
                        res_id = get_any_id(result)
                        res_size = res_stat.st_size
                        stats["outputs"][res_id] = {
                            "size": apply_number_with_unit(res_size, binary=True),
                            "sizeBytes": res_size,
                        }
                        total_size += res_size
            # the 'process' section is missing when no process was provided,
            # create it as needed rather than failing and losing every collected statistic
            process_stats = stats.setdefault("process", {})
            process_stats["totalSize"] = apply_number_with_unit(total_size, binary=True)
            process_stats["totalSizeBytes"] = total_size

        if stats and job:
            job.statistics = stats
        return stats or None
    except Exception as exc:  # pragma: no cover
        LOGGER.warning("Ignoring error that occurred during statistics collection [%s]", str(exc), exc_info=exc)
        return None