def test_map_wps_output_location_duplicate_subdir():
    """
    Validates mapping between WPS output URL and directory when the ``tmp`` fragment is repeated in the paths.

    Both directions are verified (URL to path, and path to URL using ``reverse=True``), without requiring
    that the referenced file exists.
    """
    output_url = "http:///localhost/wps-output/tmp"
    for output_dir in ("/tmp/tmp/tmp", "/tmp/tmpdir"):
        settings = {}
        settings["weaver.wps_output_dir"] = output_dir
        settings["weaver.wps_output_url"] = output_url

        # URL -> local path
        mapped = map_wps_output_location(f"{output_url}/tmp/some-file-tmp.tmp", settings, exists=False)
        assert mapped == f"{output_dir}/tmp/some-file-tmp.tmp"

        # local path -> URL
        mapped = map_wps_output_location(
            f"{output_dir}/here/some-file-tmp.tmp",
            settings,
            exists=False,
            reverse=True,
        )
        assert mapped == f"{output_url}/here/some-file-tmp.tmp"
def stage_results(self, results, expected_outputs, out_dir):
    # type: (JobResults, CWL_ExpectedOutputs, str) -> None
    """
    Stages the results of a remote :term:`Job` execution locally into the specified output directory.

    The implementing remote :term:`Process` definition should call this operation after :meth:`execute`.

    .. note::
        The :term:`CWL` runner expects the output file(s) to be written matching definition in
        ``expected_outputs``, but this definition could be a glob pattern to match multiple file and/or nested
        directories. We cannot rely on specific file names to be mapped, since glob can match many
        (eg: ``"*.txt"``).

    :param results: Result entries to stage. Entries with an ID not found in ``expected_outputs`` are skipped.
    :param expected_outputs: Outputs expected by :term:`CWL`, looked up by result ID.
    :param out_dir: Output directory where results are staged (trailing ``/`` are stripped).
    """
    for result in results:
        output_id = get_any_id(result)
        if output_id not in expected_outputs:
            continue

        # normalize to a list in anticipation of outputs that could hold multiple values
        output_values = get_any_value(result)
        output_values = output_values if isinstance(output_values, list) else [output_values]
        stage_dir = out_dir.rstrip("/")

        for reference in output_values:
            file_name = reference.rsplit("/", 1)[-1]
            stage_path = f"{stage_dir}/{file_name}"  # only employed for reporting in logs

            # performance improvement:
            #   Skip the download when the reference resolves to a local resource
            #   (already fetched or same server). CWL still expects to find the file under 'out_dir',
            #   therefore a link is requested instead of a copy of the already staged job output.
            local_path = map_wps_output_location(reference, self.settings)
            use_link = bool(local_path)
            if use_link:
                LOGGER.info(
                    "Detected result [%s] from [%s] as local reference to this instance. "
                    "Skipping fetch and using local copy in output destination: [%s]",
                    output_id, reference, stage_path
                )
                LOGGER.debug("Mapped result [%s] to local reference: [%s]", reference, local_path)
                source = local_path
            else:
                LOGGER.info(
                    "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                    output_id, reference, stage_path
                )
                source = reference
            fetch_file(source, stage_dir, settings=self.settings, link=use_link)
def test_map_wps_output_location_exists():
    """
    Validates WPS output location mapping according to ``exists``, while the file is present and once removed.

    While the temporary file exists, every combination of ``exists`` and ``reverse`` must resolve.
    After the temporary file is closed (and expected to be removed), only ``exists=False`` must still resolve
    and ``exists=True`` must yield ``None``.
    """
    output_url = "http:///localhost/wps-output/tmp"
    output_dir = "/tmp/weaver-test/test-outputs"
    settings = {
        "weaver.wps_output_dir": output_dir,
        "weaver.wps_output_url": output_url,
    }
    try:
        os.makedirs(output_dir, exist_ok=True)
        with tempfile.NamedTemporaryFile(dir=output_dir, suffix="test.txt", mode="w") as handle:
            handle.write("fake data")
            handle.flush()
            handle.seek(0)
            file_path = handle.name
            file_name = os.path.basename(handle.name)
            file_url = f"{output_url}/{file_name}"
            assert os.path.isfile(file_path), "failed setup test file"

            # file is present: all variants resolve
            assert map_wps_output_location(file_url, settings, exists=True) == file_path
            assert map_wps_output_location(file_url, settings, exists=False) == file_path
            assert map_wps_output_location(file_path, settings, exists=True, reverse=True) == file_url
            assert map_wps_output_location(file_path, settings, exists=False, reverse=True) == file_url

        # leaving the context closes the temporary file, which is expected to delete it
        assert not os.path.isfile(file_path), "test file expected to be auto-cleaned"

        # file is missing: only variants that do not require existence resolve
        assert map_wps_output_location(file_url, settings, exists=True) is None
        assert map_wps_output_location(file_url, settings, exists=False) == file_path
        assert map_wps_output_location(file_path, settings, exists=True, reverse=True) is None
        assert map_wps_output_location(file_path, settings, exists=False, reverse=True) == file_url
    except AssertionError:
        # let assertion failures be reported as is
        raise
    except Exception as exc:
        pytest.fail(f"Failed due to unexpected exception: [{exc}]")
    finally:
        shutil.rmtree(output_dir, ignore_errors=True)
def stage_results(self, results, expected_outputs, out_dir):
    # type: (JobResults, CWL_ExpectedOutputs, str) -> None
    """
    Stages the results of a remote :term:`Job` execution locally into the specified output directory.

    The implementing remote :term:`Process` definition should call this operation after :meth:`execute`.

    .. note::
        The :term:`CWL` runner expects the output file(s) to be written matching definition in
        ``expected_outputs``, but this definition could be a glob pattern to match multiple file and/or nested
        directories. We cannot rely on specific file names to be mapped, since glob can match many
        (eg: ``"*.txt"``).

    .. seealso::
        Function :func:`weaver.processes.convert.any2cwl_io` defines a generic glob pattern using the output ID
        and expected file extension based on Content-Type format. Since the remote :term:`WPS` :term:`Process`
        doesn't necessarily produce file names with the output ID as expected to find them (could be anything),
        staging must patch locations to let :term:`CWL` runtime resolve the files according to glob definitions.

    .. warning::
        Only remote :term:`Provider` implementations (which auto-generate a pseudo :term:`CWL` to map components)
        that produce outputs with inconsistent file names as described above should set attribute
        :attr:`WpsProcessInterface.stage_output_id_nested` accordingly. For :term:`Process` that directly provide
        an actual :term:`CWL` :term:`Application Package` definition (e.g.: Docker application), auto-mapping
        of glob patterns should be avoided, as it is expected that the :term:`CWL` contains real mapping to be
        respected for correct execution and retrieval of outputs from the application.

    :param results: Result entries to stage. Entries with an ID not found in ``expected_outputs`` are skipped.
    :param expected_outputs: Outputs expected by :term:`CWL`, looked up by result ID.
    :param out_dir: Base output directory where results are staged (trailing ``/`` are stripped).
    """
    for result in results:
        output_id = get_any_id(result)
        if output_id not in expected_outputs:
            continue

        # normalize to a list in anticipation of outputs that could hold multiple values
        output_values = get_any_value(result)
        output_values = output_values if isinstance(output_values, list) else [output_values]

        # when nesting is enabled, each output is staged under a sub-directory named by its ID
        nested = self.stage_output_id_nested
        base_dir = out_dir.rstrip("/")
        stage_dir = "/".join([base_dir, output_id]) if nested else base_dir
        os.makedirs(stage_dir, mode=0o700, exist_ok=True)

        for reference in output_values:
            file_name = reference.rsplit("/", 1)[-1]
            stage_path = f"{stage_dir}/{file_name}"  # only employed for reporting in logs

            # performance improvement:
            #   Skip the download when the reference resolves to a local resource
            #   (already fetched or same server). CWL still expects to find the file under 'out_dir',
            #   therefore a link is requested instead of a copy of the already staged job output.
            local_path = map_wps_output_location(reference, self.settings)
            use_link = bool(local_path)
            if use_link:
                LOGGER.info(
                    "Detected result [%s] from [%s] as local reference to this instance. "
                    "Skipping fetch and using local copy in output destination: [%s]",
                    output_id, reference, stage_path
                )
                LOGGER.debug("Mapped result [%s] to local reference: [%s]", reference, local_path)
                source = local_path
            else:
                LOGGER.info(
                    "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                    output_id, reference, stage_path
                )
                source = reference
            fetch_file(source, stage_dir, settings=self.settings, link=use_link)
def get_job_results_response(job, container, headers=None):
    # type: (Job, AnySettingsContainer, Optional[AnyHeadersContainer]) -> AnyResponseType
    """
    Generates the :term:`OGC` compliant :term:`Job` results response according to submitted execution parameters.

    Parameters that impact the format of the response are:
        - Amount of outputs to be returned.
        - Parameter ``response: raw|document``
        - Parameter ``transmissionMode: value|reference`` per output if ``response: raw``.

    .. seealso::
        More details available for each combination:
        - https://docs.ogc.org/is/18-062r2/18-062r2.html#sc_execute_response
        - https://docs.ogc.org/is/18-062r2/18-062r2.html#_response_7

    :param job: Job for which to generate the results response.
    :param container: Application settings.
    :param headers:
        Additional headers to provide in the response.
        When provided and non-empty, the container is updated in place with a ``Location`` header
        (job status URL) unless one is already present, regardless of the letter case of its name.
    :returns:
        - ``response=document``: HTTP OK with the JSON results document.
        - ``response=raw`` without any value result: HTTP No Content with references and headers as headers.
        - ``response=raw`` with a single value result: the file contents for ``href``, the literal data as
          plain text otherwise, with any references appended to the response headers.
    :raises HTTPNotImplemented:
        When ``response=raw`` and more than one value must be returned (multipart not implemented).
    """
    raise_job_dismissed(job, container)
    raise_job_bad_status(job, container)

    # when 'response=document', ignore 'transmissionMode=value|reference', respect it when 'response=raw'
    # See:
    #   - https://docs.ogc.org/is/18-062r2/18-062r2.html#_response_7 (/req/core/job-results-async-document)
    #   - https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-document
    is_raw = job.execution_response == ExecuteResponse.RAW
    results, refs = get_results(
        job,
        container,
        value_key="value",
        schema=JobInputsOutputsSchema.OGC,  # not strict to provide more format details
        link_references=is_raw,
    )
    headers = headers or {}
    # HTTP header names are case-insensitive: look for any spelling of 'Location' to avoid
    # overriding (or duplicating) a value explicitly provided by the caller in a plain mapping.
    if not any(str(name).lower() == "location" for name in headers):
        headers["Location"] = job.status_url(container)

    if not is_raw:
        # note:
        #   Cannot add "links" field in response body because variable Output ID keys are directly at the root
        #   Possible conflict with an output that would be named "links".
        results = sd.Result().deserialize(results)
        return HTTPOk(json=results, headers=headers)

    if not results:  # avoid schema validation error if all by reference
        # Status code 204 for empty body
        # see:
        #   - https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-ref
        refs.extend(headers.items())
        return HTTPNoContent(headers=refs)

    # From this point, at least one value result exists (empty results returned above).
    # raw response can be data-only value, or a mix of value and links
    # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-value-one
    out_vals = list(results.items())  # type: List[Tuple[str, ExecutionResultValue]]  # noqa
    out_info = out_vals[0][-1]        # type: ExecutionResultValue
    out_type = get_any_value(out_info, key=True)
    out_data = get_any_value(out_info)

    # FIXME: https://github.com/crim-ca/weaver/issues/376
    #  implement multipart, both for multi-output IDs and array-output under same ID
    if len(results) > 1 or (isinstance(out_data, list) and len(out_data) > 1):
        # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-value-multi
        raise HTTPNotImplemented(json={
            "code": "NotImplemented",
            "type": "NotImplemented",
            "detail": "Multipart results with 'transmissionMode=value' and 'response=raw' not implemented.",
        })

    # single value only
    out_data = out_data[0] if isinstance(out_data, list) else out_data
    if out_type == "href":
        # NOTE(review): mapping is requested with 'exists=True'; presumably it can yield no path
        #   when the file is missing, which is not handled here - confirm against the mapping function.
        out_path = map_wps_output_location(out_data, container, exists=True, url=False)
        out_type = out_info.get("type")  # noqa
        out_headers = get_file_headers(out_path, download_headers=True, content_headers=True, content_type=out_type)
        resp = FileResponse(out_path)
        resp.headers.update(out_headers)
        resp.headers.update(headers)
    else:
        resp = HTTPOk(body=out_data, charset="UTF-8", content_type=ContentType.TEXT_PLAIN, headers=headers)

    if refs:
        # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-ref
        # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-mixed-multi
        resp.headerlist.extend(refs)
    return resp