def _get_axis_order(self, identifier: str):
    """ Returns the axis order for a given spatial reference system

    A cached result is returned if one exists. Otherwise the registry is asked for the
    definition of the reference system, the coordinate system document linked from it is
    fetched and the gml:axisDirection values found there are returned and cached as JSON.

    Args:
        identifier (str): The identifier, resolved using get_real_identifier()
    Returns:
         axis_order (dict): Holds the keys 'first_axis' and 'second_axis'
    """
    srs_id = self.get_real_identifier(identifier)

    cached_order = self.cacher.get(str(srs_id))
    if cached_order is not None:
        return json.loads(cached_order)

    XML_NAMESPACES["gml"] = "http://www.opengis.net/gml/3.2"

    # First level: the definition of the reference system itself
    srs_uri = self.registry_uri + self.id_prefix + str(srs_id)
    srs_response = requests.request("Get", url=srs_uri, proxies=PROXIES)
    srs_xml = xml_helper.parse_xml(str(srs_response.content.decode()))
    srs_type = xml_helper.try_get_text_from_xml_element(xml_elem=srs_xml, elem="//epsg:type")

    # The element which links to the coordinate system depends on the type of the reference system
    cs_xpath_by_type = {
        "projected": "//gml:cartesianCS",
        "geographic 2D": "//gml:ellipsoidalCS",
    }
    cs_xpath = cs_xpath_by_type.get(srs_type)
    if cs_xpath is None:
        second_level_srs_uri = ""
    else:
        cs_elem = xml_helper.try_get_single_element_from_xml(cs_xpath, srs_xml)
        second_level_srs_uri = xml_helper.get_href_attribute(xml_elem=cs_elem)

    # Second level: the coordinate system, which holds the axis directions
    cs_response = requests.request("Get", url=self.registry_uri + second_level_srs_uri, proxies=PROXIES)
    cs_xml = xml_helper.parse_xml(str(cs_response.content.decode()))
    axis_elems = xml_helper.try_get_element_from_xml("//gml:axisDirection", cs_xml)
    directions = [axis_elem.text for axis_elem in axis_elems]

    axis_order = {
        "first_axis": directions[0],
        "second_axis": directions[1],
    }

    # Write this to cache, so it can be used on another request!
    self.cacher.set(str(srs_id), json.dumps(axis_order))
    return axis_order
def _get_axis_order(self, identifier: str):
    """ Returns the axis order for a given spatial reference system

    A cached result is returned if one exists. Otherwise the registry uri template is filled
    with the identifier, the definition is requested as xml, the linked coordinate system
    document is fetched and its gml:axisDirection values are returned and cached as JSON.

    Args:
        identifier (str): The identifier, resolved using get_real_identifier()
    Returns:
         axis_order (dict): Holds the keys 'first_axis' and 'second_axis'
    """
    srs_id = self.get_real_identifier(identifier)

    cached_order = self.cacher.get(str(srs_id))
    if cached_order is not None:
        return json.loads(cached_order)

    XML_NAMESPACES["gml"] = "http://www.opengis.net/gml/3.2"
    XML_NAMESPACES["epsg"] = "urn:x-ogp:spec:schema-xsd:EPSG:2.2:dataset"

    # First level: the definition of the reference system itself, explicitly requested as xml
    srs_uri = self.registry_uri.replace("{CRS_IDENTIFIER}", str(srs_id))
    srs_response = requests.request("Get", url=srs_uri, proxies=PROXIES, headers={'Accept': 'application/xml'})
    srs_xml = xml_helper.parse_xml(str(srs_response.content.decode()))
    srs_type = xml_helper.try_get_text_from_xml_element(xml_elem=srs_xml, elem="//epsg:type")

    # The element which links to the coordinate system depends on the type of the reference system
    cs_xpath_by_type = {
        "projected": "//gml:cartesianCS",
        "geographic 2D": "//gml:ellipsoidalCS",
        "geographic 2d": "//gml:ellipsoidalCS",
    }
    cs_xpath = cs_xpath_by_type.get(srs_type)
    if cs_xpath is None:
        second_level_srs_uri = ""
    else:
        cs_elem = xml_helper.try_get_single_element_from_xml(cs_xpath, srs_xml)
        second_level_srs_uri = xml_helper.get_href_attribute(xml_elem=cs_elem)

    # Second level: the href is used as it is, no registry prefix is added
    cs_response = requests.request("Get", url=second_level_srs_uri, proxies=PROXIES, headers={'Accept': 'application/xml'})
    cs_xml = xml_helper.parse_xml(str(cs_response.content.decode()))
    axis_elems = xml_helper.try_get_element_from_xml("//gml:axisDirection", cs_xml)
    directions = [axis_elem.text for axis_elem in axis_elems]

    axis_order = {
        "first_axis": directions[0],
        "second_axis": directions[1],
    }

    # Write this to cache, so it can be used on another request!
    self.cacher.set(str(srs_id), json.dumps(axis_order))
    return axis_order
def create_from_capabilities(self, metadata_only: bool = False, async_task: Task = None, external_auth: ExternalAuthentication = None):
    """ Fills the object with data from the capabilities document

    Runs the single parsing steps one after another and logs the run time of each step.

    Args:
        metadata_only (bool): If True, the layers are not parsed
        async_task (Task): Passed through to the metadata and layer parsing steps
        external_auth (ExternalAuthentication): Not used inside this method
    Returns:
         nothing
    """
    # get xml as iterable object
    capabilities = xml_helper.parse_xml(xml=self.service_capabilities_xml)

    # service metadata, including the 'real' (linked) service metadata if there is any
    step_started = time.time()
    self.get_service_metadata_from_capabilities(xml_obj=capabilities, async_task=async_task)
    linked_metadata_uri = xml_helper.try_get_text_from_xml_element(
        xml_elem=capabilities,
        elem="//VendorSpecificCapabilities/inspire_vs:ExtendedCapabilities/inspire_common:MetadataUrl/inspire_common:URL"
    )
    if linked_metadata_uri is not None:
        self.get_service_metadata(uri=linked_metadata_uri, async_task=async_task)
    service_logger.debug(EXEC_TIME_PRINT % ("service metadata", time.time() - step_started))

    # check possible operations on this service
    step_started = time.time()
    self.get_service_operations_and_formats(capabilities)
    service_logger.debug(EXEC_TIME_PRINT % ("service operation checking", time.time() - step_started))

    # parse possible linked dataset metadata
    step_started = time.time()
    self.get_service_dataset_metadata(xml_obj=capabilities)
    service_logger.debug(EXEC_TIME_PRINT % ("service iso metadata", time.time() - step_started))

    self.get_version_specific_metadata(xml_obj=capabilities)

    if metadata_only:
        return

    step_started = time.time()
    self._parse_layers(xml_obj=capabilities, async_task=async_task)
    service_logger.debug(EXEC_TIME_PRINT % ("layer metadata", time.time() - step_started))
def transform_constraint_to_cql(constraint: str, constraint_language: str):
    """ Transforms a xml filter style constraint into CQL style

    Args:
        constraint (str): The constraint parameter
        constraint_language (str): The constraintlanguage parameter, only 'FILTER' (case insensitive) is accepted
    Returns:
         constraint (str): The transformed constraint
    Raises:
        ValueError: If constraint_language is not 'FILTER' or if constraint can not be parsed as xml
    """
    language_is_filter = constraint_language.upper() == "FILTER"
    if not language_is_filter:
        message = "{} is no valid CSW conform value. Choices are `CQL_TEXT, FILTER`".format(constraint_language)
        raise ValueError(message, "constraintlanguage")

    parsed_constraint = xml_helper.parse_xml(constraint)
    if parsed_constraint is None:
        raise ValueError(
            "Constraint value is no valid xml! Did you set the correct value for 'constraintlanguage'?",
            CONSTRAINT_LOCATOR
        )

    # The recursive transformation starts at the Filter element
    filter_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("Filter")
    filter_elem = xml_helper.try_get_single_element_from_xml(filter_xpath, parsed_constraint.getroot())
    return _transform_constraint_to_cql_recursive(filter_elem)
def test_get_records_sort(self):
    """ Test whether the sorting parameter is working properly

    Requests the records sorted descending by dc:title and checks that the returned titles
    already are in descending order.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "elementsetname": "brief",
        "resulttype": "results",
        "sortby": "dc:title:D",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)

    # Collect the titles in the order in which they have been returned
    title_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("title")
    returned_titles = [
        xml_helper.try_get_text_from_xml_element(title_elem)
        for title_elem in xml_helper.try_get_element_from_xml(title_xpath, content_xml)
    ]

    # The returned order must be identical to the descending sorted order
    descending_titles = sorted(returned_titles, reverse=True)
    self.assertEqual(descending_titles, returned_titles)
def test_exception_report(self):
    """ Test for checking if the ows:ExceptionReport is working fine or not.

    A request with an unknown operation is sent. The response is expected to have the
    status code 200 and to contain an ExceptionReport element.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "WRONG_OPERATION",
        "id": self.test_id,
        "elementsetname": "brief",
        "typenames": "gmd:MD_Metadata",
        "outputschema": "http://www.isotc211.org/2005/gmd",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)

    report_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("ExceptionReport")
    report_elem = xml_helper.try_get_single_element_from_xml(report_xpath, content_xml)
    self.assertIsNotNone(report_elem, "No ows:ExceptionReport was generated!")
def create_metadata_elem(self, returned_md: Metadata):
    """ Returns existing service/dataset metadata as xml element

    For dataset metadata the content of the related metadata Document record is used, for
    everything else the service metadata xml of the metadata object itself.

    Args:
        returned_md (Metadata): The processing metadata
    Returns:
         xml (Element): The MD_Metadata element, reduced by reduce_information()
    """
    if returned_md.is_dataset_metadata:
        raw_xml = Document.objects.get(
            metadata=returned_md,
            document_type=DocumentEnum.METADATA.value,
        ).content
    else:
        raw_xml = returned_md.get_service_metadata_xml()

    parsed_xml = xml_helper.parse_xml(raw_xml)
    md_metadata_elem = xml_helper.try_get_single_element_from_xml(
        xml_elem=parsed_xml,
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_Metadata")
    )

    # Reduce the amount of information returned based on the requested elementSetName parameter
    return self.reduce_information(md_metadata_elem)
def _parse_parameters(self, params_dict: dict):
    """ Parses a parameter dictionary into the object

    Known parameters (looked up case insensitive in self.parameter_map) are set as attributes,
    values which can be read as integer are stored as integer. Afterwards the listable
    parameters are split into lists, the value ranges are checked and a non-CQL constraint is
    transformed into CQL.

    Args:
        params_dict (dict): The parameter key-value dict
    Returns:

    Raises:
        AssertionError: If a negative integer value is passed
        ValueError: If a parameter holds an invalid value or the parameters contradict each other
    """
    # Parse all parameters automatically by resolving the parameter_map
    for raw_key, raw_val in params_dict.items():
        attr_name = self.parameter_map.get(raw_key.lower(), None)
        if not attr_name:
            continue

        # Make sure no negative integers are passed
        try:
            raw_val = int(raw_val)
        except ValueError:
            # Not a number, keep the value as it is
            pass
        else:
            if raw_val < 0:
                raise AssertionError("No negative values allowed!")
        setattr(self, attr_name, raw_val)

    # Transform listable parameters into lists
    for listable_name in ("element_name", "namespace"):
        current = getattr(self, listable_name)
        if isinstance(current, str):
            current = current.split(",")
        setattr(self, listable_name, current)

    # Check if range of values is acceptable
    if self.result_type not in RESULT_TYPE_CHOICES:
        raise ValueError(
            INVALID_PARAMETER_TEMPLATE.format(self.result_type, ", ".join(RESULT_TYPE_CHOICES)),
            "resultType"
        )

    element_names_given = len(self.element_name) > 0
    if self.element_set_name is None:
        if not element_names_given:
            self.element_set_name = "full"  # default
    elif element_names_given:
        raise ValueError(
            "Parameter 'ElementSetName' and 'ElementName' are mutually exclusive. You can only provide one!",
            "elementSetName"
        )
    elif self.element_set_name and self.element_set_name not in ELEMENT_SET_CHOICES:
        raise ValueError(
            INVALID_PARAMETER_TEMPLATE.format(self.element_set_name, ", ".join(ELEMENT_SET_CHOICES)),
            "elementSetName"
        )

    if self.version not in VERSION_CHOICES:
        raise ValueError(
            INVALID_PARAMETER_TEMPLATE.format(self.version, ", ".join(VERSION_CHOICES)),
            "version"
        )

    # Check if constraint has to be transformed first!
    language = self.constraint_language
    if language is not None and language.upper() != "CQL_TEXT":
        try:
            self.constraint = transform_constraint_to_cql(self.constraint, self.constraint_language)
            self.constraint_language = "CQL_TEXT"
        except TypeError:
            raise ValueError(
                "XML does not seem to be valid. Please check the CSW specification.",
                CONSTRAINT_LOCATOR
            )
    elif self.constraint is not None:
        # CQL is expected here, so the constraint must not be parsable as xml
        constraint_as_xml = xml_helper.parse_xml(self.constraint)
        if constraint_as_xml is not None:
            raise ValueError(
                "XML found for constraint parameter but CQL_TEXT found for constraintlanguage. Please set your parameters correctly.",
                CONSTRAINT_LOCATOR
            )
def test_get_records_by_id(self):
    """ Test for checking if the GetRecordById operation is working fine or not.

    Exactly one record is expected, the stated number of records has to match the number of
    returned records and every returned identifier has to be the requested one.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecordById",
        "id": self.test_id,
        "elementsetname": "full",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)

    # Check that the results are correct in amount and quality
    stated_amount = int(xml_helper.try_get_attribute_from_xml_element(
        xml_elem=content_xml,
        attribute="numberOfRecordsMatched",
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults")
    ))
    self.assertEqual(
        stated_amount,
        1,
        "More than one element returned on GetRecordsById with only one used identifier!"
    )

    record_elems = xml_helper.try_get_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("Record"),
        content_xml
    )
    returned_amount = len(record_elems)
    self.assertEqual(
        returned_amount,
        stated_amount,
        "csw:SearchResults contains wrong numberOfRecordsMatched! {} stated but {} returned!".format(
            stated_amount, returned_amount
        )
    )

    identifier_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("identifier")
    identifiers = [
        xml_helper.try_get_text_from_xml_element(record_elem, identifier_xpath)
        for record_elem in record_elems
    ]
    self.assertTrue(
        all(identifier == self.test_id for identifier in identifiers),
        "Elements with not matching identifier has been returned: {}".format(", ".join(identifiers))
    )
def check_status(self, url: str, check_wfs_member: bool = False, check_image: bool = False) -> ServiceStatus:
    """ Check status of ogc service.

    Args:
        url (str): URL to the service that should be checked.
        check_wfs_member (bool): True, if a returned xml should check for a 'member' tag.
        check_image (bool): True, if the returned content should be checked as image.
    Returns:
        ServiceStatus: Status info of service.
    """
    if self.monitoring_settings is None:
        request_timeout = MONITORING_REQUEST_TIMEOUT
    else:
        request_timeout = self.monitoring_settings.timeout

    connector = CommonConnector(url=url, timeout=request_timeout)
    if self.metadata.has_external_authentication:
        connector.external_auth = self.metadata.external_authentication

    try:
        connector.load()
    except Exception as load_error:
        # handler if server sends no response (e.g. outdated uri)
        return Monitoring.ServiceStatus(url, False, str(load_error), connector.status_code, None)

    duration = timezone.timedelta(seconds=connector.run_time)
    response_text = connector.content
    success = connector.status_code == 200

    if success:
        try:
            xml = parse_xml(response_text)
            if 'Exception' in xml.getroot().tag:
                success = False
            if check_wfs_member and not self.has_wfs_member(xml):
                success = False
        except AttributeError:
            # handle successful responses that do not return xml
            response_text = None

        # NOTE(review): the image check is placed inside the status code 200 branch, the
        # formatting of the reviewed source did not show the nesting - confirm
        if check_image:
            try:
                Image.open(BytesIO(connector.content))
                success = True
            except UnidentifiedImageError:
                success = False

    return Monitoring.ServiceStatus(url, success, response_text, connector.status_code, duration)
def test_new_service_check_layer_num(self):
    """ Tests whether all layer objects from the xml have been stored inside the service object

    Returns:

    """
    # This test has been disabled by a bare `return` in front of the docstring, which made
    # the runner report it as passed. Skipping makes the disabled state visible in the test
    # report. Remove the following line to activate the test again.
    self.skipTest("Test is currently disabled")

    service = self.service_wms
    layers = service.get_subelements()

    cap_xml = xml_helper.parse_xml(self.cap_doc_wms.content)

    num_layers_xml = self._get_num_of_layers(cap_xml)
    num_layers_service = len(layers)

    self.assertEqual(num_layers_service, num_layers_xml)
def create_from_capabilities(self, metadata_only: bool = False, external_auth: ExternalAuthentication = None):
    """ Load data from capabilities document

    Parses the service metadata and the operations/formats from self.service_capabilities_xml.

    Args:
        metadata_only (bool): Not used inside this method
        external_auth (ExternalAuthentication): Not used inside this method
    Returns:

    """
    # get xml as iterable object
    capabilities = xml_helper.parse_xml(xml=self.service_capabilities_xml)

    parsing_steps = (
        # parse service metadata
        self.get_service_metadata_from_capabilities,
        # Parse <OperationsMetadata>
        self.get_service_operations_and_formats,
    )
    for parsing_step in parsing_steps:
        parsing_step(capabilities)
def _build_lock_feature_xml(self, service_param: str, version_param: str, request_param: str):
    """ Returns the POST request XML for a Lock request

    One Query element is created for each comma separated type name of the 'typename' POST
    value. Each Query gets its own copy of the filter given by the 'filter' POST value.

    Args:
        service_param (str): The service param
        version_param (str): The version param
        request_param (str): The request param, used as tag of the root element
    Returns:
         xml (str): The xml document
    """
    # Local import, so this block does not depend on the import section of the file
    from copy import deepcopy

    lock_action_param = self._get_POST_val("lockAction") or ""
    type_name_param = self._get_POST_val("typename")
    filter_param = self._get_POST_val("filter")

    reduced_ns_map = self._get_version_specific_namespaces(version_param, service_param)

    root_attributes = {
        "service": service_param,
        "version": version_param,
        "lockAction": lock_action_param
    }
    root = etree.Element(_tag=request_param, nsmap=reduced_ns_map, attrib=root_attributes)

    # create the xml filter object from the filter string parameter
    filter_xml_root = xml_helper.parse_xml(filter_param).getroot()

    for type_name in type_name_param.split(","):
        query_elem = xml_helper.create_subelement(root, "Query", attrib={"typeName": type_name})

        # add the filter xml object as subobject to the query to use e.g. the spatial restriction
        # An lxml element can only live at one position of a tree: appending the same element
        # to another parent moves it. Therefore every Query gets its own deep copy, otherwise
        # only the last Query would keep the filter.
        xml_helper.add_subelement(query_elem, deepcopy(filter_xml_root))

    return xml_helper.xml_to_string(root)
def test_new_service_check_describing_attributes(self):
    """ Tests whether the describing attributes, such as title or abstract, are correct.

    Checks for the service.
    Checks for each layer.

    Returns:

    """
    # This test has been disabled by a bare `return` in front of the docstring, which made
    # the runner report it as passed. Skipping makes the disabled state visible in the test
    # report. Remove the following line to activate the test again.
    self.skipTest("Test is currently disabled")

    service = self.service_wms
    layers = service.get_subelements()

    cap_xml = xml_helper.parse_xml(self.cap_doc_wms.content)

    xml_title = xml_helper.try_get_text_from_xml_element(cap_xml, "//Service/Title")
    xml_abstract = xml_helper.try_get_text_from_xml_element(cap_xml, "//Service/Abstract")

    self.assertEqual(service.metadata.title, xml_title)
    self.assertEqual(service.metadata.abstract, xml_abstract)

    # run for layers
    for layer in layers:
        xml_layer = xml_helper.try_get_single_element_from_xml(
            "//Name[text()='{}']/parent::Layer".format(layer.identifier),
            cap_xml
        )
        if xml_layer is None:
            # this might happen for layers which do not provide a unique identifier. We generate an identifier automatically in this case.
            # this generated identifier - of course - can not be found in the xml document.
            continue

        xml_title = xml_helper.try_get_text_from_xml_element(xml_layer, "./Title")
        xml_abstract = xml_helper.try_get_text_from_xml_element(xml_layer, "./Abstract")

        failure_msg = "Failed for layer with identifier '{}' and title '{}'".format(
            layer.identifier, layer.metadata.title
        )
        self.assertEqual(layer.metadata.title, xml_title, msg=failure_msg)
        self.assertEqual(layer.metadata.abstract, xml_abstract, msg=failure_msg)
def get_layer_by_identifier(self, identifier: str):
    """ Returns the layer identified by the parameter 'identifier'

    The capabilities document is loaded first, if it is not available on the object yet.

    Args:
        identifier (str): The identifier as string, matched against the text of Layer/Name
    Returns:
         The result of _start_single_layer_parsing() for the first matching Layer element,
         None if no Layer element matches
    """
    if self.service_capabilities_xml is None:
        # load xml, might have been forgotten
        self.get_capabilities()

    capabilities = xml_helper.parse_xml(xml=self.service_capabilities_xml)
    layer_xpath = "//Layer/Name[text()='{}']/parent::Layer".format(identifier)
    matching_layers = xml_helper.try_get_element_from_xml(xml_elem=capabilities, elem=layer_xpath)

    if len(matching_layers) == 0:
        return None
    return self._start_single_layer_parsing(matching_layers[0])
def get_capabilities(self):
    """ Start a network call to retrieve the original capabilities xml document.

    Using the connector class, this function will GET the capabilities xml document as string.
    No file will be downloaded and stored on the storage. The string will be stored in the
    OGCWebService instance.

    Side effects: self.service_connect_url is extended by the GetCapabilities parameters,
    self.service_capabilities_xml, self.connect_duration and
    self.descriptive_document_encoding are set.

    Returns:
         nothing
    Raises:
        ConnectionError: If the request times out or the status code is not 200
        Exception: If the response is not a valid xml document
    """
    params = {
        "request": OGCOperationEnum.GET_CAPABILITIES.value,
        "version": self.service_version.value if self.service_version is not None else "",
        "service": (self.service_type.value if self.service_type is not None else "").upper(),
    }

    # Choose the separator depending on the given url:
    #   no query part yet         -> '?' opens it
    #   ends with '?' or with '&' -> parameters can be appended directly
    #   anything else             -> '&' separates existing from new parameters
    base_url = self.service_connect_url
    if "?" not in base_url:
        separator = "?"
    elif base_url.endswith(("?", "&")):
        separator = ""
    else:
        separator = "&"
    self.service_connect_url = "{}{}{}".format(base_url, separator, urlencode(params))

    ows_connector = CommonConnector(
        url=self.service_connect_url,
        external_auth=self.external_authentification,
        connection_type=ConnectionEnum.REQUESTS
    )
    ows_connector.http_method = 'GET'
    try:
        ows_connector.load()
        if ows_connector.status_code != 200:
            raise ConnectionError(ows_connector.status_code)
    except ReadTimeout as timeout_error:
        # Keep the original timeout as cause for debugging
        raise ConnectionError(CONNECTION_TIMEOUT.format(self.service_connect_url)) from timeout_error

    tmp = ows_connector.content.decode("UTF-8")

    # check if tmp really contains an xml file
    if xml_helper.parse_xml(tmp) is None:
        raise Exception(tmp)

    self.service_capabilities_xml = tmp
    self.connect_duration = ows_connector.run_time
    self.descriptive_document_encoding = ows_connector.encoding
def _remove_iso_metadata(metadata: Metadata, md_links: list, existing_iso_links: list):
    """ Remove iso metadata that is not found in the newer md_links list but still lives in the persisted existing_iso_links list

    For every outdated link the related metadata records are deleted and all elements which
    reference the link via xlink:href are removed from the (not original) capabilities
    document of the root service.

    Args:
        metadata (Metadata): The edited metadata
        md_links (list): The new iso metadata links
        existing_iso_links (list): The existing metadata links, related to the metadata object
    Returns:
         nothing
    """
    # The capabilities document lives at the root service, so resolve its metadata first
    rel_md = metadata
    service_type = metadata.service_type
    if not metadata.is_root():
        if service_type == OGCServiceEnum.WMS:
            rel_md = metadata.service.parent_service.metadata
        elif service_type == OGCServiceEnum.WFS:
            rel_md = metadata.featuretype.parent_service.metadata

    capability_doc = Document.objects.get(
        metadata=rel_md,
        is_original=False,
        document_type=DocumentEnum.CAPABILITY.value,
    )
    capability_root = xml_helper.parse_xml(capability_doc.content).getroot()

    # if there are links in existing_iso_links that do not show up in md_links -> remove them
    for existing_link in existing_iso_links:
        if existing_link in md_links:
            continue

        metadata.get_related_metadatas(
            filters={'to_metadatas__to_metadata__metadata_url': existing_link}
        ).delete()

        # remove from capabilities
        for linking_elem in xml_helper.find_element_where_attr(capability_root, "xlink:href", existing_link):
            xml_helper.remove_element(linking_elem)

    capability_doc.content = xml_helper.xml_to_string(capability_root)
    capability_doc.save()
def test_get_records(self):
    """ Test whether the GetRecords operation runs properly

    The response is expected to have the status code 200 and to be parsable as xml.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "elementsetname": "brief",
        "resulttype": "results",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)
def test_get_records_constraint(self):
    """ Test whether the constraint parameter is working properly

    Requests records using a CQL constraint on dc:identifier and checks that every returned
    identifier contains the test id.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "elementsetname": "brief",
        "resulttype": "results",
        "constraint": "dc:identifier like %{}%".format(self.test_id),
        "constraintlanguage": "CQL_TEXT",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)

    # Iterate over the returned identifier elements and check whether all of them fit to the constraint
    identifier_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("identifier")
    returned_identifiers = [
        xml_helper.try_get_text_from_xml_element(identifier_elem)
        for identifier_elem in xml_helper.try_get_element_from_xml(identifier_xpath, content_xml)
    ]
    constraint_fits = [self.test_id in returned_identifier for returned_identifier in returned_identifiers]

    self.assertTrue(
        all(constraint_fits),
        "A result was returned, which does not fit to the given constraint parameter!"
    )
def check_uri_provides_ogc_capabilities(value) -> ValidationError:
    """ Checks whether a proper XML OGC Capabilities document can be found at the given url.

    Note that the ValidationError is returned, not raised.

    Args:
        value: The url parameter
    Returns:
         None|ValidationError: None if the checks are valid, ValidationError else
    """
    connector = CommonConnector(url=value)
    connector.load()

    if connector.status_code == 401:
        # This means the resource needs authentication to be called. At this point we can not check whether this is
        # a proper OGC capabilities or not. Skip this check.
        return None

    try:
        # parse_xml() gives None for anything that is no xml, which ends up in the AttributeError below
        root_tag = xml_helper.parse_xml(connector.content).getroot().tag
        if "Capabilities" not in root_tag:
            return ValidationError(_("This is no capabilities document."))
    except AttributeError:
        # No xml found!
        return ValidationError(_("No XML found."))

    return None
def test_get_records_md_metadata(self):
    """ Test for checking if the GetRecordById operation is working fine for the gmd output schema.

    The record is requested as gmd:MD_Metadata. The response is expected to have the status
    code 200 and to be parsable as xml.

    Returns:

    """
    request_params = {
        "service": "CSW",
        "version": "2.0.2",
        # The CSW operation is named 'GetRecordById'. The former value 'GetRecordsById' is no
        # CSW operation, so the test never reached the operation it is meant to check.
        "request": "GetRecordById",
        "id": self.test_id,
        "elementsetname": "brief",
        "typenames": "gmd:MD_Metadata",
        "outputschema": "http://www.isotc211.org/2005/gmd",
    }
    response = self.client.get(reverse(CSW_PATH), data=request_params)
    status_code = response.status_code
    content_xml = xml_helper.parse_xml(response.content)

    self.assertEqual(response.status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)
def harvest(self):
    """ Starts harvesting procedure

    Performs a 'hits' request to get the total amount of records, afterwards requests the
    result pages one after another until the catalogue signals the end (nextRecord=0) or an
    already processed start position shows up again. Finally all persisted metadata which
    have not been found in the catalogue are deleted and the cached pages are removed.

    Returns:

    Raises:
        ConnectionError: If the hits request fails or does not return valid xml
        AttributeError: If the hits response does not provide numberOfRecordsMatched
    """
    absolute_url = f'<a href="{self.metadata.get_absolute_url()}">{self.metadata.title}</a>'
    # NOTE(review): the reviewed code ended this assignment with a trailing comma, which makes
    # it a 1-tuple. Most likely unintended, but kept since consumers of the task meta might
    # rely on this shape - confirm and drop the tuple if not.
    service_json = ({'id': self.metadata.pk, 'absolute_url': absolute_url},)
    if current_task:
        current_task.update_state(
            state=states.STARTED,
            meta={
                'service': service_json,
                'phase': f"Connecting to {absolute_url}",
            }
        )

    # Fill the deleted_metadata with all persisted metadata, so we can eliminate each entry if it is still provided by
    # the catalogue. In the end we will have a list, which contains metadata IDs that are not found in the catalogue anymore.
    all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
        filters={'to_metadatas__relation_type': MetadataRelationEnum.HARVESTED_THROUGH.value}
    ).values_list("identifier", flat=True)
    # Use a set instead of list to increase lookup afterwards
    self.deleted_metadata.update(all_persisted_metadata_identifiers)

    # Perform the initial "hits" request to get an overview of how many data will be fetched
    hits_response, status_code = self._get_harvest_response(result_type="hits")
    if status_code != 200:
        raise ConnectionError(_("Harvest failed: Code {}\n{}").format(status_code, hits_response))
    xml_response = xml_helper.parse_xml(hits_response)
    if xml_response is None:
        # Format after the translation lookup, otherwise the lookup is done using the already
        # formatted string and can never match a translation
        raise ConnectionError(_("Response is not a valid xml: \n{}").format(hits_response))

    if current_task:
        current_task.update_state(
            state=states.STARTED,
            meta={
                'phase': "calculating harvesting time",
            }
        )
    try:
        # int(None) raises the TypeError, if the attribute is missing
        total_number_to_harvest = int(xml_helper.try_get_attribute_from_xml_element(
            xml_response,
            "numberOfRecordsMatched",
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
        ))
    except TypeError:
        csw_logger.error("Malicious Harvest response: {}".format(hits_response))
        raise AttributeError(_("Harvest response is missing important data!"))

    if current_task:
        current_task.update_state(
            state=states.STARTED,
            meta={
                'service': service_json,
                'phase': "Start harvesting..."
            }
        )

    # NOTE(review): raises ZeroDivisionError if the catalogue reports 0 matched records. Left
    # untouched on purpose: continuing would run into the deletion of all persisted records at
    # the end of this method - decide how an empty hits response shall be handled.
    self.progress_step_per_result = float(1 / total_number_to_harvest) * 100

    # There are wrongly configured CSW, which do not return nextRecord=0 on the last page but instead continue on
    # nextRecord=1. We need to prevent endless loops by checking whether, we already worked on these positions and
    # simply end it there!
    processed_start_positions = set()

    t_start = time()
    number_rest_to_harvest = total_number_to_harvest
    number_of_harvested = 0
    self.harvest_result.timestamp_start = timezone.now()
    self.harvest_result.save()

    page_cacher = PageCacher()

    # Initialized once in front of the loop: a reset per iteration would throw away the
    # estimation which is calculated at the end of the previous iteration
    estimated_time_for_all = 'unknown'

    # Run as long as we can fetch data
    while True:
        if current_task:
            current_task.update_state(
                state=states.STARTED,
                meta={
                    'phase': _("Harvesting first {} of {}. Time remaining: {}").format(
                        self.max_records_per_request, total_number_to_harvest, estimated_time_for_all
                    ),
                }
            )
        processed_start_positions.add(self.start_position)

        # Get response
        next_response, status_code = self._get_harvest_response(result_type="results")

        if current_task:
            current_task.update_state(
                state=states.STARTED,
                meta={
                    'phase': _("Processing harvested results for the first {} of {}. Time remaining: {}").format(
                        self.max_records_per_request, total_number_to_harvest, estimated_time_for_all
                    ),
                }
            )
        # NOTE(review): _process_harvest_response() gives None for an invalid xml response,
        # which lets the addition below fail with a TypeError. This stops the harvest in
        # front of the deletion step - confirm whether a dedicated error should be raised.
        found_entries = self._process_harvest_response(next_response)

        # Calculate time since loop started
        duration = time() - t_start
        number_rest_to_harvest -= self.max_records_per_request
        number_of_harvested += found_entries
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)

        if self.start_position == 0 or self.start_position in processed_start_positions:
            # We are done!
            break

        # Estimate the remaining time using the average duration per harvested record. Without
        # any harvested record there is no average yet, so the former estimation is kept.
        if number_of_harvested > 0:
            seconds_for_rest = max(number_rest_to_harvest, 0) * (duration / number_of_harvested)
            estimated_time_for_all = timezone.timedelta(seconds=seconds_for_rest)

    # Add HarvestResult infos
    self.harvest_result.timestamp_end = timezone.now()
    self.harvest_result.number_results = number_of_harvested
    self.harvest_result.save()

    # Delete Metadata records which could not be found in the catalogue anymore
    # This has to be done if the harvesting run completely.
    deleted_metadatas = Metadata.objects.filter(identifier__in=self.deleted_metadata)
    deleted_metadatas.delete()

    # Remove cached pages of API and CSW
    page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
    page_cacher.remove_pages(CSW_CACHE_PREFIX)
def overwrite_capabilities_document(metadata: Metadata):
    """ Overwrites the capabilities document which is related to the provided metadata.

    If a subelement of a service has been edited, the service root capabilities will be changed since this is the
    most requested document of the service.
    All subelements capabilities documents above the edited element will be reset to None and cached documents will be
    cleared. This forces an automatic creation of the correct capabilities on the next request for these elements,
    which will result in correct information about the edited subelement.

    Args:
        metadata (Metadata): The edited metadata
    Returns:
         nothing
    """
    is_root = metadata.is_root()
    # NOTE(review): root_metadata stays unassigned for metadata which is neither root, nor
    # layer, nor featuretype - the first usage below fails in this case
    if is_root:
        root_metadata = metadata
    elif metadata.is_metadata_type(MetadataEnum.LAYER):
        root_metadata = metadata.service.parent_service.metadata
    elif metadata.is_metadata_type(MetadataEnum.FEATURETYPE):
        root_metadata = metadata.featuretype.parent_service.metadata

    # Make sure the Document record already exist by fetching the current capability xml
    # This is a little trick to auto-generate Document records which did not exist before!
    root_metadata.get_current_capability_xml(root_metadata.get_service_version().value)
    capability_doc = Document.objects.get(
        metadata=root_metadata,
        document_type=DocumentEnum.CAPABILITY.value,
        is_original=False,
    )

    # overwrite all editable data
    capability_root = xml_helper.parse_xml(capability_doc.content)

    # find matching xml element in xml doc
    service_type_value = metadata.service_type.value
    service_version = metadata.get_service_version()
    search_text = metadata.identifier

    if is_root and metadata.is_service_type(OGCServiceEnum.WFS):
        wfs_2_versions = (OGCServiceVersionEnum.V_2_0_0, OGCServiceVersionEnum.V_2_0_2)
        if any(service_version is wfs_2_version for wfs_2_version in wfs_2_versions):
            XML_NAMESPACES["wfs"] = "http://www.opengis.net/wfs/2.0"
            XML_NAMESPACES["ows"] = "http://www.opengis.net/ows/1.1"
            XML_NAMESPACES["fes"] = "http://www.opengis.net/fes/2.0"
            XML_NAMESPACES["default"] = XML_NAMESPACES["wfs"]
        # NOTE(review): the title is used for every root WFS, independent from the version.
        # The formatting of the reviewed source did not show the nesting - confirm
        search_text = metadata.title

    matching_elems = xml_helper.find_element_where_text(capability_root, txt=search_text)
    # Without any match the (empty) result itself is passed on, as before
    target_elem = matching_elems[0] if len(matching_elems) > 0 else matching_elems

    # handle keywords
    _overwrite_capabilities_keywords(target_elem, metadata, service_type_value)

    # handle iso metadata links
    _overwrite_capabilities_iso_metadata_links(target_elem, metadata)

    # overwrite data
    _overwrite_capabilities_data(target_elem, metadata)

    # write xml back to Document record
    capability_doc.content = xml_helper.xml_to_string(capability_root)
    capability_doc.save()

    # Remove service_metadata_document as well, so it needs to be generated again!
    Document.objects.filter(
        metadata=metadata,
        document_type=DocumentEnum.METADATA.value,
    ).delete()

    # Delete all cached documents, which holds old state!
    metadata.clear_cached_documents()

    # Delete all cached documents of root service, which holds old state!
    root_metadata.clear_cached_documents()

    # Remove existing document contents from upper elements (children of root element), which holds old state!
    metadata.clear_upper_element_capabilities(clear_self_too=True)
def _process_harvest_response(self, next_response: bytes) -> int:
    """ Processes the content of a single harvest response

    Updates self.start_position with the 'nextRecord' attribute of the response, removes every found file
    identifier from self.deleted_metadata and spreads the creation of metadata records from the found
    MD_Metadata elements over multiple processes.

    Args:
        next_response (bytes): The response as bytes
    Returns:
         number_found_entries (int): The amount of found metadata records in this response. If the response is
                                     no valid xml, self.start_position is set to 0 and None is returned.
    """
    xml_response = xml_helper.parse_xml(next_response)
    if xml_response is None:
        error_msg = "Response is no valid xml. catalogue: {}, startPosition: {}, maxRecords: {}".format(
            self.metadata.title,
            self.start_position,
            self.max_records_per_request,
        )
        csw_logger.error(error_msg)
        # Abort!
        self.start_position = 0
        return

    md_metadata_xpath = "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_Metadata")
    md_metadata_entries = xml_helper.try_get_element_from_xml(md_metadata_xpath, xml_response) or []

    next_record_attribute = xml_helper.try_get_attribute_from_xml_element(
        xml_response,
        "nextRecord",
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
    )
    self.start_position = int(next_record_attribute)

    # Fetch found identifiers in parent process, so self.deleted_metadata can be edited easily
    file_identifier_xpath = ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") \
                            + "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString")
    for md_entry in md_metadata_entries:
        file_identifier = xml_helper.try_get_text_from_xml_element(md_entry, file_identifier_xpath)
        # Everything that is still provided by the catalogue must not be deleted afterwards
        self.deleted_metadata.discard(file_identifier)

    # Delete response to free memory
    del xml_response

    # Process response via multiple processes, using half of the available cpus (at least one)
    t_start = time()
    num_processes = max(int(cpu_count() / 2), 1)
    index_step = int(len(md_metadata_entries) / num_processes)

    self.resource_list = md_metadata_entries
    # NOTE(review): each process gets the index pair (i * index_step, (i + 1) * index_step). If there are less
    # entries than processes, every process gets (0, -1). How the target interprets these indices is not
    # visible here - confirm that a remainder of len(entries) % num_processes is not skipped.
    process_list = [
        Process(
            target=self._create_metadata_from_md_metadata,
            args=(
                i * index_step,
                -1 if index_step < 1 else (i + 1) * index_step,
            ),
        )
        for i in range(num_processes)
    ]

    # Close all connections to force each process to create a new one for itself
    connections.close_all()
    execute_threads(process_list)

    runtime = time() - t_start
    csw_logger.debug(
        "Harvesting '{}': runtime for {} metadata parsing: {}s ####".format(
            self.metadata.title,
            self.max_records_per_request,
            runtime,
        )
    )
    return len(md_metadata_entries)
def _build_get_feature_xml(self, service_param: str, version_param: str, request_param: str):
    """ Returns the POST request XML for a GetFeature request

    The root element carries the service, version and the optional resultType, outputFormat and
    count/maxFeatures attributes. For every requested type name one <Query> element is created. If a filter
    has been provided, each <Query> gets its own copy of the filter element.

    Args:
        service_param (str): The service param
        version_param (str): The version param
        request_param (str): The request param
    Returns:
         xml (str): The xml document
    """
    from copy import deepcopy

    format_param = self._get_POST_val("format")
    type_name_param = self._get_POST_val("typename") or self._get_POST_val("typenames")
    filter_param = self._get_POST_val("filter")
    count_param = self._get_POST_val("count") or self._get_POST_val("maxFeatures")
    # The result type must not fall back to the 'count' parameter, otherwise the count would be written
    # into the resultType attribute
    resulttype_param = self._get_POST_val("resultType")

    # WFS 2.0.x uses 'typeNames' and 'count' instead of the older 'typeName' and 'maxFeatures'
    is_wfs_2 = version_param in (
        OGCServiceVersionEnum.V_2_0_0.value,
        OGCServiceVersionEnum.V_2_0_2.value,
    )
    type_name_identifier = "typeNames" if is_wfs_2 else "typeName"

    reduced_ns_map = self._get_version_specific_namespaces(version_param, service_param)
    wfs_ns = reduced_ns_map["wfs"]

    root_attributes = {
        "service": service_param,
        "version": version_param,
    }
    if resulttype_param is not None:
        root_attributes["resultType"] = resulttype_param
    if format_param is not None:
        root_attributes["outputFormat"] = format_param
    if count_param is not None:
        root_attributes["count" if is_wfs_2 else "maxFeatures"] = count_param

    root = etree.Element(
        _tag="{" + wfs_ns + "}" + request_param,
        nsmap=reduced_ns_map,
        attrib=root_attributes,
    )

    # create the xml filter object from the filter string parameter (if there is any)
    filter_xml = xml_helper.parse_xml(filter_param) if filter_param is not None else None
    filter_xml_root = filter_xml.getroot() if filter_xml is not None else None

    # A request without any type name results in a document without <Query> elements
    type_names = type_name_param.split(",") if type_name_param else []
    for type_name in type_names:
        query_elem = xml_helper.create_subelement(
            root,
            "{" + wfs_ns + "}" + "Query",
            attrib={type_name_identifier: type_name},
        )
        if filter_xml_root is not None:
            # add the filter xml object as subobject to the query to use e.g. the spatial restriction.
            # An lxml element can only have one parent, so every query needs its own copy - otherwise only the
            # last query would keep the filter.
            xml_helper.add_subelement(query_elem, deepcopy(filter_xml_root))

    return xml_helper.xml_to_string(root)
def parse_xml(self):
    """ Reads the needed data from the xml and writes to an ISOMetadata instance (self)

    The raw ISO metadata (self.raw_metadata) is parsed and the found values are written to the attributes of
    self: identification data (title, abstract, keywords, language, categories), distribution data (formats,
    download link), spatial and temporal extent, spatial resolution, reference system, legal constraints,
    contact data, update frequency and the INSPIRE interoperability statements.

    Returns:
         nothing
    """
    xml = self.raw_metadata
    xml_obj = xml_helper.parse_xml(xml)

    self.file_identifier = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:fileIdentifier/gco:CharacterString"
    )
    self.character_set_code = xml_helper.try_get_attribute_from_xml_element(
        xml_elem=xml_obj,
        attribute="codeListValue",
        elem="//gmd:MD_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode"
    )
    if self.file_identifier is None:
        # Without a provided identifier we create one by ourselves
        self.file_identifier = uuid.uuid4()
    self.date_stamp = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dateStamp/gco:Date"
    )
    self.last_change_date = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dateStamp/gco:Date"
    )
    self.md_standard_name = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:metadataStandardName/gco:CharacterString"
    )
    self.md_standard_version = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:metadataStandardVersion/gco:CharacterString"
    )

    self._parse_xml_legal_dates(xml_obj)
    self._parse_xml_legal_reports(xml_obj)

    # try to transform the last_change_date into a datetime object
    try:
        self.last_change_date = parse(self.last_change_date, tzinfo=timezone.utc)
    except (ValueError, OverflowError, TypeError):
        # if this is not possible due to wrong input, just use the current time...
        self.last_change_date = timezone.now()

    self.hierarchy_level = xml_helper.try_get_attribute_from_xml_element(
        xml_obj,
        "codeListValue",
        "//gmd:MD_Metadata/gmd:hierarchyLevel/gmd:MD_ScopeCode"
    )
    # Services and datasets use different identification elements
    if self.hierarchy_level == "service":
        xpath_type = "srv:SV_ServiceIdentification"
    else:
        xpath_type = "gmd:MD_DataIdentification"

    self.title = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString".format(xpath_type)
    )
    self._parse_xml_dataset_id(xml_obj, xpath_type)
    self.abstract = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:abstract/gco:CharacterString".format(xpath_type)
    )

    keywords = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString".format(xpath_type)
    )
    for keyword in keywords or []:
        if keyword.text is None:
            continue
        # self.keywords holds the keyword texts, so duplicates have to be detected on the text, not on the
        # xml element
        keyword_text = xml_helper.try_get_text_from_xml_element(keyword)
        if keyword_text not in self.keywords:
            self.keywords.append(keyword_text)

    language = xml_helper.try_get_single_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:language/gmd:LanguageCode".format(xpath_type)
    )
    # An lxml element without children is falsy, so the element has to be compared against None explicitly
    if language is not None and language.text is not None:
        self.language = xml_helper.try_get_text_from_xml_element(language)

    iso_categories = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:topicCategory/gmd:MD_TopicCategoryCode".format(xpath_type)
    )
    if iso_categories:
        for iso_category in iso_categories:
            self.iso_categories.append(xml_helper.try_get_text_from_xml_element(iso_category))

    # Get all values from <gmd:distributionInfo> which declares the distributionFormat
    formats = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("distributionFormat")
    )
    if formats:
        for format_elem in formats:
            # get the character value per format
            name_elem = xml_helper.try_get_single_element_from_xml(
                xml_elem=format_elem,
                elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("name")
            )
            if name_elem is None:
                continue
            val = xml_helper.try_get_text_from_xml_element(
                xml_elem=name_elem,
                elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString")
            )
            self.formats.append(val)

    self.download_link = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:onLine/gmd:CI_OnlineResource[gmd:function/gmd:CI_OnLineFunctionCode/@codeListValue="download"]/gmd:linkage/gmd:URL'
    )
    self.transfer_size = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:transferSize/gco:Real'
    )
    self.preview_image = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:graphicOverview/gmd:MD_BrowseGraphic/gmd:fileName/gco:CharacterString".format(xpath_type)
    )

    try:
        self.bounding_box["min_x"] = float(
            xml_helper.try_get_text_from_xml_element(xml_obj, "//gmd:westBoundLongitude/gco:Decimal")
        )
        self.bounding_box["min_y"] = float(
            xml_helper.try_get_text_from_xml_element(xml_obj, "//gmd:southBoundLatitude/gco:Decimal")
        )
        self.bounding_box["max_x"] = float(
            xml_helper.try_get_text_from_xml_element(xml_obj, "//gmd:eastBoundLongitude/gco:Decimal")
        )
        self.bounding_box["max_y"] = float(
            xml_helper.try_get_text_from_xml_element(xml_obj, "//gmd:northBoundLatitude/gco:Decimal")
        )
    except (TypeError, ValueError):
        # A missing (None) or non numeric coordinate means there is no usable bounding box
        self.bounding_box = None

    self._parse_xml_polygons(xml_obj, xpath_type)

    self.tmp_extent_begin = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:beginPosition".format(xpath_type)
    )
    if self.tmp_extent_begin is None:
        self.tmp_extent_begin = "1900-01-01"

    self.tmp_extent_end = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:endPosition".format(xpath_type)
    )
    if self.tmp_extent_end is None:
        self.tmp_extent_end = "1900-01-01"

    equivalent_scale = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:equivalentScale/gmd:MD_RepresentativeFraction/gmd:denominator/gco:Integer".format(xpath_type)
    )
    ground_res = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:distance/gco:Distance".format(xpath_type)
    )
    # A positive scale denominator is preferred, the ground distance is the fallback
    if equivalent_scale is not None and int(equivalent_scale) > 0:
        self.spatial_res_val = equivalent_scale
        self.spatial_res_type = "scaleDenominator"
    elif ground_res is not None and len(ground_res) > 0:
        self.spatial_res_val = ground_res
        self.spatial_res_type = "groundDistance"

    self.ref_system = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:code/gco:CharacterString"
    )
    self.ref_system_version = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:version/gco:CharacterString"
    )
    self.ref_system_authority = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:authority/gmd:CI_Citation/gmd:title/gco:CharacterString"
    )
    epsg_api = EpsgApi()
    if self.ref_system is not None:
        # Store the reference system in the form 'EPSG:<code>'
        self.ref_system = "EPSG:{}".format(epsg_api.get_subelements(self.ref_system).get("code"))

    # gmd:CI_OnLineFunctionCode
    dist_func_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_OnLineFunctionCode"),
        xml_obj
    )
    self.distribution_function = xml_helper.try_get_attribute_from_xml_element(
        dist_func_elem,
        "codeListValue",
    )
    del dist_func_elem

    # gmd:MD_RepresentativeFraction
    fraction_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_RepresentativeFraction"),
        xml_obj
    )
    self.fraction_denominator = xml_helper.try_get_text_from_xml_element(
        fraction_elem,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Integer")
    )
    del fraction_elem

    # gmd:useLimitation
    limit_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("useLimitation"),
        xml_obj
    )
    self.use_limitation = xml_helper.try_get_text_from_xml_element(
        limit_elem,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString")
    )
    del limit_elem

    self.lineage = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:statement/gco:CharacterString"
    )

    restriction_code_attr_val = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue'.format(xpath_type)
    ) or []
    if len(restriction_code_attr_val) >= 2:
        legal_constraints = ""
        if restriction_code_attr_val[0] == 'license' and restriction_code_attr_val[1] == 'otherRestrictions':
            other_constraints = xml_helper.try_get_element_from_xml(
                xml_elem=xml_obj,
                elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'.format(xpath_type)
            )
            for constraint in other_constraints or []:
                tmp_constraint = xml_helper.try_get_text_from_xml_element(xml_elem=constraint)
                if tmp_constraint is None:
                    # An empty constraint is neither a json nor a usable text
                    continue
                try:
                    constraint_json = json.loads(tmp_constraint)
                    self.license_source_note = constraint_json.get("quelle", None)
                    self.license_json = constraint_json
                except ValueError:
                    # no, this is not a json!
                    # handle it is a normal text
                    legal_constraints += tmp_constraint + ";"
        # NOTE(review): nesting reconstructed from a flattened source - confirm that fees is meant to be set
        # whenever at least two restriction codes exist
        self.fees = legal_constraints

    self.access_constraints = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:accessConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'.format(xpath_type)
    )
    self.responsible_party = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString'.format(xpath_type)
    )
    self.contact_person = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:individualName/gco:CharacterString'.format(xpath_type)
    )
    self.contact_phone = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:phone/gmd:CI_Telephone/gmd:voice/gco:CharacterString'.format(xpath_type)
    )
    self.contact_email = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString'.format(xpath_type)
    )

    update_frequency = xml_helper.try_get_attribute_from_xml_element(
        xml_elem=xml_obj,
        attribute="codeListValue",
        elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode'.format(xpath_type)
    )
    # Only known update frequencies are taken
    if update_frequency in self.valid_update_frequencies:
        self.update_frequency = update_frequency

    # inspire regulations
    with open(INSPIRE_LEGISLATION_FILE, "r", encoding="utf-8") as _file:
        regislations = json.load(_file)
    for regislation in regislations["inspire_rules"]:
        reg = {
            "name": regislation.get("name", None),
            "date": regislation.get("date", "1900-01-01"),
            "pass": None,
        }
        # Look for a conformance result which matches the name and the date of the regulation
        statement = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:report/gmd:DQ_DomainConsistency/gmd:result/gmd:DQ_ConformanceResult[gmd:specification/gmd:CI_Citation/gmd:title/gco:CharacterString="{}" and gmd:specification/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date="{}"]/gmd:pass/gco:Boolean'.format(reg["name"], reg["date"])
        )
        statement_val = utils.resolve_boolean_attribute_val(statement)
        if statement_val is None:
            reg["pass"] = "******"
            self.inspire_interoperability = False
        else:
            reg["pass"] = statement_val
            # if only one regislation is not fullfilled, we do not have interoperability
            if not statement_val:
                self.inspire_interoperability = False
        self.interoperability_list.append(reg)
def harvest(self, task_id: str = None):
    """ Starts harvesting procedure

    Creates a PendingTask record, performs an initial 'hits' request to find out how many records have to be
    harvested and fetches the records page by page afterwards, until the catalogue signals the last page or the
    pending task has been removed (user abort). If the harvesting ran completely, all persisted metadata which
    could not be found in the catalogue anymore will be deleted.

    Args:
        task_id (str): The id for the pending task record. If not given, the id of the catalogue metadata is used
    Returns:
         nothing
    Raises:
        ProcessLookupError: If a pending task for this catalogue already exists
        ConnectionError: If the 'hits' request fails or its response is not a valid xml
        AttributeError: If the 'hits' response does not provide the number of matched records
    """
    # Create a pending task record for the database first!
    task_exists = PendingTask.objects.filter(
        description__icontains=self.metadata.title
    ).exists()
    if task_exists:
        raise ProcessLookupError(_("Harvesting is currently performed"))

    async_task_id = task_id or self.metadata.id
    self.pending_task = PendingTask.objects.create(
        task_id=async_task_id,
        description=json.dumps({
            "service": self.metadata.title,
            "phase": "Connecting...",
        }),
        progress=0,
        remaining_time=None,
        created_by=self.harvesting_group
    )

    # Fill the deleted_metadata with all persisted metadata, so we can eliminate each entry if it is still provided by
    # the catalogue. In the end we will have a list, which contains metadata IDs that are not found in the catalogue anymore.
    all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
        filters={
            'to_metadatas__relation_type': MetadataRelationEnum.HARVESTED_THROUGH.value
        }
    ).values_list("identifier", flat=True)
    # Use a set instead of list to increase lookup afterwards
    self.deleted_metadata.update(all_persisted_metadata_identifiers)

    # Perform the initial "hits" request to get an overview of how many data will be fetched
    hits_response, status_code = self._get_harvest_response(result_type="hits")
    descr = json.loads(self.pending_task.description)
    if status_code != 200:
        descr["phase"] = "Harvest failed: HTTP Code {}".format(status_code)
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        raise ConnectionError(_("Harvest failed: Code {}\n{}").format(status_code, hits_response))

    xml_response = xml_helper.parse_xml(hits_response)
    if xml_response is None:
        descr["phase"] = "Response is not a valid xml"
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        # The placeholder has to be filled after the translation, otherwise the message can not be looked up
        raise ConnectionError(_("Response is not a valid xml: \n{}").format(hits_response))

    try:
        total_number_to_harvest = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_response,
                "numberOfRecordsMatched",
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
            )
        )
    except (TypeError, ValueError):
        # The attribute is missing (None) or does not hold a number
        csw_logger.error("Malicious Harvest response: {}".format(hits_response))
        descr["phase"] = "Harvest response incorrect. Inform an administrator!"
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        raise AttributeError(_("Harvest response is missing important data!"))

    descr["phase"] = "Start harvesting..."
    self.pending_task.description = json.dumps(descr)
    self.pending_task.save()

    if total_number_to_harvest > 0:
        progress_step_per_request = float(self.max_records_per_request / total_number_to_harvest) * 100
    else:
        # Nothing matched - the first (empty) request already completes the harvesting
        progress_step_per_request = 100.0

    # There are wrongly configured CSW, which do not return nextRecord=0 on the last page but instead continue on
    # nextRecord=1. We need to prevent endless loops by checking whether, we already worked on these positions and
    # simply end it there!
    processed_start_positions = set()

    t_start = time()
    number_rest_to_harvest = total_number_to_harvest
    number_of_harvested = 0
    self.harvest_result.timestamp_start = timezone.now()
    self.harvest_result.save()

    page_cacher = PageCacher()

    # Run as long as we can fetch data and as long as the user does not abort the pending task!
    while self.pending_task is not None:
        processed_start_positions.add(self.start_position)
        # Get response
        next_response, status_code = self._get_harvest_response(result_type="results")

        # NOTE(review): _process_harvest_response returns None for a response which is no valid xml. This
        # results in a TypeError on the addition below, which stops the harvesting before any metadata gets
        # deleted. Consider a dedicated error handling.
        found_entries = self._process_harvest_response(next_response)

        # Calculate time since loop started
        duration = time() - t_start
        number_rest_to_harvest -= self.max_records_per_request
        number_of_harvested += found_entries
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)

        if self.start_position == 0 or self.start_position in processed_start_positions:
            # We are done!
            break

        if number_of_harvested > 0:
            seconds_for_rest = number_rest_to_harvest * (duration / number_of_harvested)
        else:
            # Without any harvested record there is no base for an estimation
            seconds_for_rest = 0
        estimated_time_for_all = timezone.timedelta(seconds=seconds_for_rest)

        self._update_pending_task(
            self.start_position,
            total_number_to_harvest,
            progress_step_per_request,
            estimated_time_for_all
        )

    # Add HarvestResult infos
    self.harvest_result.timestamp_end = timezone.now()
    self.harvest_result.number_results = number_of_harvested
    self.harvest_result.save()

    # Delete Metadata records which could not be found in the catalogue anymore
    # This has to be done if the harvesting run completely. Skip this part if the user aborted the harvest!
    if self.pending_task is not None:
        deleted_metadatas = Metadata.objects.filter(identifier__in=self.deleted_metadata)
        deleted_metadatas.delete()
        self.pending_task.delete()

    # Remove cached pages of API and CSW
    page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
    page_cacher.remove_pages(CSW_CACHE_PREFIX)
def test_proxy_setting(self):
    # This test is currently disabled: the function returns before any statement below is executed.
    # The disabled body secures the WMS test service (use_proxy=True) and compares the operation uris of the
    # stored capabilities document with the ones of the current capabilities document.
    return
    """ Tests whether the proxy can be set properly. Returns: """
    metadata = self.service_wms.metadata

    # To avoid running celery in a separate test instance, we do not call the route. Instead we call the logic, which
    # is used to process access settings directly.
    async_process_securing_access(
        metadata.id,
        use_proxy=True,
        log_proxy=True,
        restrict_access=False,
    )

    self.cap_doc_wms.refresh_from_db()
    doc_unsecured = self.cap_doc_wms.content
    doc_secured = Document.objects.get(
        metadata=metadata,
        document_type=DocumentEnum.CAPABILITY.value,
        is_original=False,
    ).content

    # Check for all operations if the uris has been changed!
    # Do not check for GetCapabilities, since we always change this uri during registration!
    # Make sure all versions can be matched by the code - the xml structure differs a lot from version to version
    service_version = metadata.get_service_version()

    if metadata.is_service_type(OGCServiceEnum.WMS):
        operations = [
            OGCOperationEnum.GET_MAP.value,
            OGCOperationEnum.GET_FEATURE_INFO.value,
            OGCOperationEnum.DESCRIBE_LAYER.value,
            OGCOperationEnum.GET_LEGEND_GRAPHIC.value,
            OGCOperationEnum.GET_STYLES.value,
            OGCOperationEnum.PUT_STYLES.value,
        ]
    elif metadata.is_service_type(OGCServiceEnum.WFS):
        operations = [
            OGCOperationEnum.GET_FEATURE.value,
            OGCOperationEnum.TRANSACTION.value,
            OGCOperationEnum.LOCK_FEATURE.value,
            OGCOperationEnum.DESCRIBE_FEATURE_TYPE.value,
        ]
    else:
        operations = []

    # create xml documents from string documents and fetch only the relevant <Request> element for each
    xml_unsecured = xml_helper.parse_xml(doc_unsecured)
    request_unsecured = xml_helper.try_get_single_element_from_xml(elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"), xml_elem=xml_unsecured)
    xml_secured = xml_helper.parse_xml(doc_secured)
    request_secured = xml_helper.try_get_single_element_from_xml(elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"), xml_elem=xml_secured)

    for operation in operations:
        # Get <OPERATION> element
        operation_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
        operation_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

        if service_version == OGCServiceVersionEnum.V_1_0_0:
            # NOTE(review): the nesting of this branch has been reconstructed from a flattened source - confirm
            # which statements belong to the WMS check before re-enabling the test
            if metadata.is_service_type(OGCServiceEnum.WMS):
                # The WMS 1.0.0 specification uses <OPERATION> instead of <GetOPERATION> for any operation element.
                operation = operation.replace("Get", "")

                # Get <OPERATION> element again
                operation_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
                operation_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

            # Version 1.0.0 holds the uris in the "onlineResource" attribute of <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
            get_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
            post_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
            post_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

            # From here on the get_*/post_* variables hold the uri values instead of the xml elements
            online_res = "onlineResource"
            get_unsecured = xml_helper.try_get_attribute_from_xml_element(get_unsecured, online_res)
            get_secured = xml_helper.try_get_attribute_from_xml_element(get_secured, online_res)
            post_unsecured = xml_helper.try_get_attribute_from_xml_element(post_unsecured, online_res)
            post_secured = xml_helper.try_get_attribute_from_xml_element(post_secured, online_res)

            # Assert that all get/post elements are not None
            self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
            self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))

            # Assert that the secured version is different from the unsecured one
            self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
            self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))

            # Assert that the HOST_NAME constant appears in the secured uri
            # NOTE(review): assertContains receives a plain uri here, while the last branch of this chain uses
            # assertTrue(HOST_NAME in ...) - presumably assertContains expects a response object; confirm
            self.assertContains(get_secured, HOST_NAME)
            self.assertContains(post_secured, HOST_NAME)

        elif service_version == OGCServiceVersionEnum.V_1_1_0 \
                or service_version == OGCServiceVersionEnum.V_2_0_0 \
                or service_version == OGCServiceVersionEnum.V_2_0_2:
            # Only WFS
            # Get <OPERATION> element again, since the operation is now identified using an attribute, not an element tag
            operation_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                request_unsecured
            )
            operation_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                request_secured
            )

            # Version 1.1.0 holds the uris in the href attribute of <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
            get_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
            post_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
            post_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

            # From here on the get_*/post_* variables hold the uri values instead of the xml elements
            get_unsecured = xml_helper.get_href_attribute(get_unsecured)
            get_secured = xml_helper.get_href_attribute(get_secured)
            post_unsecured = xml_helper.get_href_attribute(post_unsecured)
            post_secured = xml_helper.get_href_attribute(post_secured)

            # Assert that all get/post elements are not None
            self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
            self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))

            # Assert that the secured version is different from the unsecured one
            self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
            self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))

            # Assert that the HOST_NAME constant appears in the secured uri
            self.assertContains(get_secured, HOST_NAME)
            self.assertContains(post_secured, HOST_NAME)

        elif service_version == OGCServiceVersionEnum.V_1_1_1 or service_version == OGCServiceVersionEnum.V_1_3_0:
            # Version 1.1.1 holds the uris in the <OnlineResource> element inside <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_unsecured
            )
            get_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_secured
            )
            post_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_unsecured
            )
            post_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_secured
            )

            # From here on the get_*/post_* variables hold the uri values instead of the xml elements
            get_unsecured = xml_helper.get_href_attribute(get_unsecured)
            get_secured = xml_helper.get_href_attribute(get_secured)
            post_unsecured = xml_helper.get_href_attribute(post_unsecured)
            post_secured = xml_helper.get_href_attribute(post_secured)

            # Assert that both (secure/unsecure) uris are None or none of them
            # This is possible for operations that are not supported by the service
            if get_secured is not None and get_unsecured is not None:
                self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))

                # Assert that the secured version is different from the unsecured one
                self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))

                # Assert that the HOST_NAME constant appears in the secured uri
                self.assertTrue(HOST_NAME in get_secured)

            if post_secured is not None and post_unsecured is not None:
                self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))
                self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))
                self.assertTrue(HOST_NAME in post_secured)
        else:
            # Any other service version is not checked
            pass
def post(self, data):
    """ Sends the given data as POST request and stores the outcome on self.

    Wraps the post functionality of the different request implementations (CURL, Requests). Only the
    REQUESTS connection type performs a request here. The response is written to self.status_code,
    self.content and self.http_external_headers, the needed time to self.run_time.

    Args:
        data (dict|byte): The post data body
    Returns:
         nothing
    """
    try:
        # If the data is proper xml and no Content-Type has been set yet, the header is set to xml automatically
        parsed_data = xml_helper.parse_xml(data)
        if parsed_data is not None:
            if self.additional_headers.get("Content-Type", None) is None:
                self.additional_headers["Content-Type"] = "application/xml"
    except ValueError:
        # Data which is no xml leads to a ValueError. The header simply stays untouched in that case
        pass

    self.init_time = time.time()

    # Only REQUESTS is processed. CURL and any other connection type do not perform anything
    if self.connection_type is ConnectionEnum.REQUESTS:
        # Parameters which are shared by every variant of the request
        shared_kwargs = {
            "timeout": REQUEST_TIMEOUT,
            "proxies": PROXIES,
            "headers": self.additional_headers,
            "verify": VERIFY_SSL_CERTIFICATES,
        }
        auth_classes = {
            "http_basic": HTTPBasicAuth,
            "http_digest": HTTPDigestAuth,
        }
        external_auth = self.external_auth

        if external_auth is None:
            response = requests.post(self._url, data, **shared_kwargs)
        elif external_auth.auth_type in auth_classes:
            auth_class = auth_classes[external_auth.auth_type]
            response = requests.post(
                self._url,
                data,
                auth=auth_class(external_auth.username, external_auth.password),
                **shared_kwargs
            )
        else:
            # Unknown authentication type: no request is performed, an empty response is used instead
            response = HttpResponse()

        self.status_code = response.status_code
        self.content = response.content
        self.http_external_headers = response.headers._store

    self.run_time = time.time() - self.init_time
def get_resource_capabilities(request: HttpRequest, md: Metadata):
    """ Logic for retrieving a capabilities document.

    The GET parameters VERSION, REQUEST and FALLBACK are evaluated case-insensitively.
    Depending on them, the document is taken from the locally available capabilities or
    fetched from the remote service and secured before it is returned.

    Args:
        request (HttpRequest): The incoming request, whose GET parameters are evaluated
        md (Metadata): The metadata record, which capabilities are requested
    Returns:
         An HttpResponse with status 423 if the metadata is not active,
         an HttpResponse with status 404 if the VERSION or REQUEST parameter is invalid,
         otherwise the capabilities document
    Raises:
        ValueError: If the remotely fetched document can not be parsed as xml
    """
    from service.tasks import async_increase_hits

    stored_version = md.get_service_version().value

    # Increasing the hits runs in a background process to speed up the response time!
    # todo: after refactoring of md.increase_hits() maybe we don't need to start async tasks... test it!!!
    async_increase_hits.delay(md.id)

    if not md.is_active:
        return HttpResponse(content=SERVICE_DISABLED, status=423)

    # Collect the relevant GET parameters, ignoring the case of their names
    version_param = version_tag = None
    request_param = request_tag = None
    use_fallback = None
    for param_name, param_value in request.GET.dict().items():
        normalized_name = param_name.upper()
        if normalized_name == "VERSION":
            version_param, version_tag = param_value, param_name
        elif normalized_name == "REQUEST":
            request_param, request_tag = param_value, param_name
        elif normalized_name == "FALLBACK":
            use_fallback = resolve_boolean_attribute_val(param_value)

    # No version parameter has been provided by the request - we simply use the one we have.
    if not version_param:
        version_param = stored_version

    known_versions = [version.value for version in OGCServiceVersionEnum]
    if version_param not in known_versions:
        # version number not valid
        return HttpResponse(content=PARAMETER_ERROR.format(version_tag), status=404)

    if request_param is not None and request_param != OGCOperationEnum.GET_CAPABILITIES.value:
        # request not valid
        return HttpResponse(content=PARAMETER_ERROR.format(request_tag), status=404)

    if md.is_catalogue_metadata:
        return md.get_remote_original_capabilities_document(version_param)

    # The current capabilities of the metadata are used if
    #   1) a version is requested, which we have in our database
    #   2) the fallback parameter is set explicitly
    #   3) a subelement is requested, which normally do not have capability documents
    use_current_capabilities = (
        stored_version == version_param
        or use_fallback is True
        or not md.is_root()
    )
    if use_current_capabilities:
        return md.get_current_capability_xml(version_param)

    # Fetch the requested capabilities document from remote - we do not provide this
    # as our default (registered) one
    remote_xml = md.get_remote_original_capabilities_document(version_param)
    if xml_helper.parse_xml(remote_xml) is None:
        raise ValueError("No xml document was retrieved. Content was :'{}'".format(remote_xml))

    # Wrap the remote xml in an unsaved Document, so the document setters can be applied to it
    remote_doc = Document(
        content=remote_xml,
        metadata=md,
        document_type=DocumentEnum.CAPABILITY.value,
        is_original=True
    )
    remote_doc.set_capabilities_secured(auto_save=False)
    if md.use_proxy_uri:
        # The requested version is handed over as force_version
        remote_doc.set_proxy(True, auto_save=False, force_version=version_param)

    return remote_doc.content