class CSWTest(TestCase):
    """Checks of the registry's CSW endpoint, driven through OWSLib."""

    def setUp(self):
        """Create an OWSLib client for the CSW URL configured in settings."""
        endpoint = settings.REGISTRY_PYCSW['server']['url']
        self.csw = CatalogueServiceWeb(endpoint)

    def tearDown(self):
        """Delete every Service object once a test has finished."""
        Service.objects.all().delete()

    def test_capabilities(self):
        """Check advertised URLs, the CSW version and harvestable types."""
        # every method of every advertised operation must use the URL that
        # is defined in settings
        for operation in self.csw.operations:
            for http_method in operation.methods:
                self.assertEqual(
                    settings.REGISTRY_PYCSW['server']['url'],
                    http_method['url'],
                    'Expected URL equality')

        # the client must report OGC:CSW 2.0.2
        self.assertEqual(
            self.csw.version,
            '2.0.2',
            'Expected "2.0.2" as a supported version')

        # look up the Transaction and Harvest operations by name
        transaction = self.csw.get_operation_by_name('Transaction')
        harvest = self.csw.get_operation_by_name('Harvest')

        # each service type must be listed by both operations
        harvestable_types = (
            'http://www.opengis.net/wms',
            'http://www.opengis.net/wmts/1.0',
            'urn:x-esri:serviceType:ArcGIS:MapServer',
            'urn:x-esri:serviceType:ArcGIS:ImageServer',
        )
        for restype in harvestable_types:
            self.assertIn(
                restype, harvest.parameters['ResourceType']['values'])
            self.assertIn(
                restype,
                transaction.parameters['TransactionSchemas']['values'])
def get_datasets_from_csw(csw_endpoint, extended):
    """Fetch up to 1000 full records from a CSW endpoint.

    When ``extended`` is true the records are requested a second time in the
    ``gmd`` output schema; the raw XML of that response is queried with the
    XPath fragments in ``attribute_to_xpath_fragment`` and every value found
    (or an empty string) is set as an attribute on the matching record.

    Returns a dict whose 'datasets' key holds the parsed records.
    """
    print('Retrieving data sets from %s' % csw_endpoint)

    client = CatalogueServiceWeb(csw_endpoint)
    client.getrecords2(esn='full', maxrecords=1000)
    parsed_records = client.records.values()

    if not extended:
        return {'datasets': parsed_records}

    # second request: same records, different encoding; parsed by hand and
    # merged into the records obtained from the first request
    client.getrecords2(esn='full', maxrecords=1000,
                       outputschema=namespaces['gmd'])
    root = etree.XML(client.response)

    for record in parsed_records:
        record_xpath = record_xpath_template.format(record.identifier)
        for attribute, xpath_fragment in attribute_to_xpath_fragment.iteritems():
            matches = root.xpath(record_xpath + xpath_fragment,
                                 namespaces=namespaces)
            # unpack the first match, fall back to an empty string
            value = matches[0] if len(matches) != 0 else ""
            setattr(record, attribute, value)
        print(vars(record))

    return {'datasets': parsed_records}
def send_transaction_request(**kwargs):
    """Render the CSW transaction template and insert it in the catalogue.

    The keyword arguments are handed to the template as rendering context.
    Nothing is raised to the caller: each failing step is reported through
    ``log_error_msg`` and the function returns ``None`` right away, so that
    a later step never runs on missing data.
    """
    pycsw_url = "http://meta.iguess.list.lu/"
    template_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "iguess",
                     "csw_template.xml"))

    try:
        csw = CatalogueServiceWeb(pycsw_url)
    except Exception:
        log_error_msg("Unable to create Catalogue object")
        return

    try:
        with open(template_path, "r") as template_file:
            text = template_file.read()
    except Exception:
        log_error_msg("problem reading the xml template")
        return

    template = Template(text)
    try:
        result = template.render(**kwargs)
    except Exception:
        log_error_msg("error rendering xml transaction template")
        return

    try:
        csw.transaction(ttype='insert', typename='gmd:MD_Metadata',
                        record=result)
    except Exception:
        # NOTE(review): any transaction failure is reported as a duplicate
        # record; the message is kept as-is for log compatibility.
        log_error_msg("catalogue record already present")
def test_GetRecords_summary(self):
    """Request the first five GMD records with the ``summary`` element set."""
    client = CatalogueServiceWeb(service)
    query = {
        "outputschema": GMD,
        "startposition": 1,
        "maxrecords": 5,
        "esn": "summary",
    }
    client.getrecords2(**query)
    # the count is computed but no assertion is made on it
    record_count = len(client.records)
def refresh_harvested_records(context, database, table, url):
    """Refresh / harvest all non-local records in the repository.

    Queries the repository for records whose source is not "local" and, for
    each of them, sends a Harvest request for the record's source to the CSW
    at ``url``.  A failing harvest is logged as a warning and does not stop
    the remaining records from being processed.
    """
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # get all harvested records
    count, records = repos.query(constraint={'where': 'source != "local"'})

    if int(count) > 0:
        LOGGER.info('Refreshing %s harvested records', count)
        csw = CatalogueServiceWeb(url)
        mappings = context.md_core_model['mappings']

        for rec in records:
            source = getattr(rec, mappings['pycsw:Source'])
            schema = getattr(rec, mappings['pycsw:Schema'])
            identifier = getattr(rec, mappings['pycsw:Identifier'])

            LOGGER.info('Harvesting %s (identifier = %s) ...',
                        source, identifier)

            # TODO: find a smarter way of catching this
            if schema == 'http://www.isotc211.org/2005/gmd':
                schema = 'http://www.isotc211.org/schemas/2005/gmd/'

            try:
                csw.harvest(source, schema)
                LOGGER.info(csw.response)
            except Exception as err:
                # Logger.warn is a deprecated alias of Logger.warning
                LOGGER.warning(err)
def test_ows_interfaces_csw():
    """Check that a CSW client exposes the common OWSLib service interface."""
    service = CatalogueServiceWeb(CSW_SERVICE_URL)
    # Check each service instance conforms to OWSLib interface
    service.alias = 'CSW'
    # the isinstance result must be asserted, otherwise nothing is checked
    assert isinstance(service, owslib.catalogue.csw2.CatalogueServiceWeb)
    # URL attribute
    assert service.url == CSW_SERVICE_URL
    # version attribute
    assert service.version == '2.0.2'
    # Identification object
    assert hasattr(service, 'identification')
    # Check all ServiceIdentification attributes
    assert service.identification.type == 'CSW'
    for attribute in ['type', 'version', 'title', 'abstract', 'keywords',
                      'accessconstraints', 'fees']:
        assert hasattr(service.identification, attribute)
    # Check all ServiceProvider attributes
    for attribute in ['name', 'url', 'contact']:
        assert hasattr(service.provider, attribute)
    # Check all operations implement IOperationMetadata
    for op in service.operations:
        for attribute in ['name', 'formatOptions', 'methods']:
            assert hasattr(op, attribute)
    # Check all contents implement IContentMetadata as a dictionary
    # CSW does not work in this way so use dummy
    service.contents = {'dummy': '1'}
    assert isinstance(service.contents, dict)
def refresh_harvested_records(database, table, url):
    """Re-harvest every repository record whose source is not "local".

    Progress, the CSW response of each harvest and any error raised by a
    harvest are printed to standard output.
    """
    from owslib.csw import CatalogueServiceWeb

    # open the repository with the module-level CONTEXT
    repo = repository.Repository(database, CONTEXT, table=table)

    # select the harvested (non-local) records
    total, harvested = repo.query(constraint={'where': 'source != "local"'})

    if int(total) <= 0:
        return

    print('Refreshing %s harvested records' % total)
    client = CatalogueServiceWeb(url)

    for rec in harvested:
        source = getattr(rec, CONTEXT.md_core_model['mappings']['pycsw:Source'])
        schema = getattr(rec, CONTEXT.md_core_model['mappings']['pycsw:Schema'])
        identifier = getattr(
            rec, CONTEXT.md_core_model['mappings']['pycsw:Identifier'])

        print('Harvesting %s (identifier = %s) ...' % (source, identifier))

        # TODO: find a smarter way of catching this
        if schema == 'http://www.isotc211.org/2005/gmd':
            schema = 'http://www.isotc211.org/schemas/2005/gmd/'

        try:
            client.harvest(source, schema)
            print(client.response)
        except Exception as err:
            print(err)
def test_GetRecords_dataset(self):
    """Query the first five ``dataset`` records in the GMD output schema."""
    client = CatalogueServiceWeb(service)
    client.getrecords(
        qtype="dataset",
        outputschema=GMD,
        startposition=1,
        maxrecords=5
    )
    # the count is computed but not asserted on
    fetched = len(client.records)
def test_GetRecords_brief(self):
    """Request the first five GMD records with the ``brief`` element set."""
    client = CatalogueServiceWeb(service)
    client.getrecords(
        outputschema=GMD,
        startposition=1,
        maxrecords=5,
        esn="brief"
    )
    # the count is computed but not asserted on
    fetched = len(client.records)
def test_ows_interfaces_csw():
    """Check that a CSW client exposes the common OWSLib service interface."""
    service = CatalogueServiceWeb(CSW_SERVICE_URL)
    # Check each service instance conforms to OWSLib interface
    service.alias = 'CSW'
    # the isinstance result must be asserted, otherwise nothing is checked
    assert isinstance(service, owslib.csw.CatalogueServiceWeb)
    # URL attribute
    assert service.url == CSW_SERVICE_URL
    # version attribute
    assert service.version == '2.0.2'
    # Identification object
    assert hasattr(service, 'identification')
    # Check all ServiceIdentification attributes
    assert service.identification.type == 'CSW'
    for attribute in ['type', 'version', 'title', 'abstract', 'keywords',
                      'accessconstraints', 'fees']:
        assert hasattr(service.identification, attribute)
    # Check all ServiceProvider attributes
    for attribute in ['name', 'url', 'contact']:
        assert hasattr(service.provider, attribute)
    # Check all operations implement IOperationMetadata
    for op in service.operations:
        for attribute in ['name', 'formatOptions', 'methods']:
            assert hasattr(op, attribute)
    # Check all contents implement IContentMetadata as a dictionary
    # CSW does not work in this way so use dummy
    service.contents = {'dummy': '1'}
    assert isinstance(service.contents, dict)
def refresh_harvested_records(context, database, table, url):
    """Refresh / harvest all non-local records in the repository.

    Queries the repository for records whose ``mdsource`` is not "local"
    and, for each of them, sends a Harvest request for the record's source
    to the CSW at ``url``.  A failing harvest is logged as a warning and
    processing continues; when there is nothing to refresh an informational
    message is logged instead.
    """
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # get all harvested records
    count, records = repos.query(
        constraint={"where": 'mdsource != "local"', "values": []})

    if int(count) > 0:
        LOGGER.info("Refreshing %s harvested records", count)
        csw = CatalogueServiceWeb(url)
        mappings = context.md_core_model["mappings"]

        for rec in records:
            source = getattr(rec, mappings["pycsw:Source"])
            schema = getattr(rec, mappings["pycsw:Schema"])
            identifier = getattr(rec, mappings["pycsw:Identifier"])

            LOGGER.info("Harvesting %s (identifier = %s) ...",
                        source, identifier)

            # TODO: find a smarter way of catching this
            if schema == "http://www.isotc211.org/2005/gmd":
                schema = "http://www.isotc211.org/schemas/2005/gmd/"

            try:
                csw.harvest(source, schema)
                LOGGER.info(csw.response)
            except Exception as err:
                # Logger.warn is a deprecated alias of Logger.warning
                LOGGER.warning(err)
    else:
        LOGGER.info("No harvested records")
def mdcount(self, cswurl, constraints=None, startrecord=0, maxharvest=10):
    """Queries the csw and count md matching constraints

    A GetRecords request with ``resulttype="hits"`` is sent to ``cswurl``
    and the client's ``results`` dictionary is returned.  ``constraints``
    defaults to no constraint at all.
    """
    # a fresh list per call instead of a shared mutable default argument
    if constraints is None:
        constraints = []
    csw = CatalogueServiceWeb(cswurl, skip_caps=True)
    csw.getrecords2(
        esn="brief",
        constraints=constraints,
        startposition=startrecord,
        maxrecords=maxharvest,
        resulttype="hits",
    )
    return csw.results
def test_csw_sends_headers():
    """
    Test that if headers are provided in the CSW class they are sent
    when performing HTTP requests (in this case for GetCapabilities)

    The HTTP layer is patched to raise RuntimeError; the assertions run
    after the constructor call whether or not that error reaches the test,
    so the test cannot pass without the request having been attempted.
    """
    # (CSW version, headers expected on the outgoing request)
    cases = (
        ('2.0.2', {'User-agent': 'my-app/1.0'}),
        ('3.0.0', {'Accept': 'application/xml',
                   'User-agent': 'my-app/1.0'}),
    )
    for version, expected_headers in cases:
        with mock.patch('owslib.util.requests.request',
                        side_effect=RuntimeError) as mock_request:
            try:
                CatalogueServiceWeb('http://example.com/csw',
                                    version=version,
                                    headers={'User-agent': 'my-app/1.0'})
            except RuntimeError:
                # raised by the patched request function
                pass
            assert mock_request.called
            assert mock_request.call_args[1]['headers'] == expected_headers
class GeocatCatalogueServiceWeb(object):
    """Thin wrapper around an OWSLib CSW client for one catalogue URL."""

    def __init__(self, url):
        """Connect to the CSW at ``url`` and select the CHE output schema."""
        self.csw = CatalogueServiceWeb(url)
        self.schema = CHE_SCHEMA

    def get_geocat_id_from_csw(self, cqlquery=CQL_QUERY_DEFAULT,
                               cqlterm=CQL_SEARCH_TERM_DEFAULT):
        """Return the identifiers of all records matching the query.

        Records are requested in pages of 50 using a ``PropertyIsEqualTo``
        filter built from ``cqlquery`` and ``cqlterm``.

        Raises CswNotFoundError when a request yields no response or
        reports zero matches.
        """
        harvest_query = PropertyIsEqualTo(cqlquery, cqlterm)
        nextrecord = 0
        record_ids = []
        while nextrecord is not None:
            self.csw.getrecords2(constraints=[harvest_query],
                                 maxrecords=50,
                                 startposition=nextrecord)
            if self.csw.response is None or self.csw.results['matches'] == 0:
                raise CswNotFoundError(
                    "No dataset found for harvest query {}".format(
                        harvest_query))
            record_ids.extend(self.csw.records.keys())
            # Continue only while the server both returned records and
            # announces a next position; an empty page ends the paging
            # instead of requesting the same position forever.
            if (self.csw.results['returned'] > 0
                    and self.csw.results['nextrecord'] > 0):
                nextrecord = self.csw.results['nextrecord']
            else:
                nextrecord = None
        return record_ids

    def get_record_by_id(self, geocat_id):
        """Return the raw GetRecordById response, or None when it is empty."""
        self.csw.getrecordbyid(id=[geocat_id], outputschema=self.schema)
        csw_record_as_string = self.csw.response
        if csw_record_as_string:
            return csw_record_as_string
        return None
def test_GetRecords_dataset(self):
    """Query five GMD records filtered on ``dc:type`` equal to ``dataset``."""
    client = CatalogueServiceWeb(service)
    dataset_only = PropertyIsEqualTo("dc:type", "dataset")
    client.getrecords2(
        constraints=[dataset_only],
        outputschema=GMD,
        startposition=1,
        maxrecords=5
    )
    # the count is computed but not asserted on
    fetched = len(client.records)
def setRepoParams(self, repoParams):
    """Apply repository parameters and try to open the CSW connection.

    The metadata prefix is set to "csw" before the parent class processes
    ``repoParams``.  Creating the CSW client is best effort: when it fails,
    ``self.cswrepo`` is set to None.
    """
    self.metadataprefix = "csw"
    super(CSWRepository, self).setRepoParams(repoParams)
    try:
        self.cswrepo = CatalogueServiceWeb(self.url)
    except Exception:
        # A bare ``except:`` would also swallow KeyboardInterrupt and
        # SystemExit; only ordinary errors are tolerated here.
        self.cswrepo = None
    self.domain_metadata = []
def test_GetRecords(self):
    """Fetch five GMD records and check each one parses as MD_Metadata.

    Every identifier seen is appended to the shared ``identifiers`` list.
    """
    client = CatalogueServiceWeb(service)
    client.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
    record_count = len(client.records)
    assert record_count == 5, record_count
    for ident in client.records:
        identifiers.append(ident)
        record = client.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
def _harvest_csw(csw, maxrecords=10, totalrecords=float("inf")):
    """
    Step through CSW results, and if one seems to be a WMS or Arc REST
    service then register it

    ``csw`` supplies ``base_url`` and is passed as ``parent`` to the
    registration helpers.  ``maxrecords`` is the page size and
    ``totalrecords`` caps how far into the result set paging goes.
    Registration errors are logged and do not stop the harvest.
    """
    src = CatalogueServiceWeb(csw.base_url)
    # first run starts from 0; later runs continue at the server's
    # "nextrecord" (the original never advanced and looped on page one)
    startposition = 0

    while True:
        src.getrecords(esn="summary", startposition=startposition,
                       maxrecords=maxrecords)

        record_limit = min(src.results["matches"], totalrecords)

        # NOTE(review): this check runs before the page is processed, so the
        # records of the page that ends the loop are not harvested - confirm
        # whether that is intended.
        if (
            src.results["nextrecord"] == 0
            or src.results["returned"] == 0
            or src.results["nextrecord"] > record_limit
        ):
            # end the loop, exhausted all records or max records to process
            break

        # harvest each record to destination CSW
        for record_id in list(src.records):
            record = src.records[record_id]
            known_types = {}
            for ref in record.references:
                unquoted_url = urllib.unquote(ref["url"]).lower()
                if (ref["scheme"] == "OGC:WMS"
                        or "service=wms&request=getcapabilities" in unquoted_url):
                    print("WMS:%s" % ref["url"])
                    known_types["WMS"] = ref["url"]
                if (ref["scheme"] == "OGC:WFS"
                        or "service=wfs&request=getcapabilities" in unquoted_url):
                    print("WFS:%s" % ref["url"])
                    known_types["WFS"] = ref["url"]
                if ref["scheme"] == "ESRI":
                    print("ESRI:%s" % ref["url"])
                    known_types["REST"] = ref["url"]

            if "WMS" in known_types:
                service_type = "OWS" if "WFS" in known_types else "WMS"
                try:
                    _process_wms_service(known_types["WMS"], service_type,
                                         None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["WMS"], str(e)))
            elif "REST" in known_types:
                try:
                    # register the ESRI URL that was recorded, not whichever
                    # reference happened to be iterated last
                    _register_arcgis_url(known_types["REST"], None, None,
                                         None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["REST"], str(e)))

        startposition = src.results["nextrecord"]
def _harvest_csw(csw, maxrecords=10, totalrecords=float('inf')):
    """
    Step through CSW results, and if one seems to be a WMS or Arc REST
    service then register it

    ``csw`` supplies ``base_url`` and is passed as ``parent`` to the
    registration helpers.  ``maxrecords`` is the page size and
    ``totalrecords`` caps how far into the result set paging goes.
    Registration errors are logged and do not stop the harvest.
    """
    src = CatalogueServiceWeb(csw.base_url)

    # The first request starts from 0 and every later one from the
    # "nextrecord" reported by the server.  The original kept a flag that
    # was never switched, so the first page was requested over and over.
    startposition = 0
    finished = False

    while not finished:
        src.getrecords(
            esn='summary', startposition=startposition, maxrecords=maxrecords)

        upper_bound = min(src.results['matches'], totalrecords)

        # NOTE(review): the page that triggers this condition is not
        # harvested because the check comes first - confirm the intent.
        if src.results['nextrecord'] == 0 \
                or src.results['returned'] == 0 \
                or src.results['nextrecord'] > upper_bound:
            # end the loop, exhausted all records or max records to process
            finished = True
            continue

        # harvest each record to destination CSW
        for key in list(src.records):
            record = src.records[key]
            known_types = {}
            for ref in record.references:
                lowered = urllib.unquote(ref["url"]).lower()
                if ref["scheme"] == "OGC:WMS" or \
                        "service=wms&request=getcapabilities" in lowered:
                    print("WMS:%s" % ref["url"])
                    known_types["WMS"] = ref["url"]
                if ref["scheme"] == "OGC:WFS" or \
                        "service=wfs&request=getcapabilities" in lowered:
                    print("WFS:%s" % ref["url"])
                    known_types["WFS"] = ref["url"]
                if ref["scheme"] == "ESRI":
                    print("ESRI:%s" % ref["url"])
                    known_types["REST"] = ref["url"]

            if "WMS" in known_types:
                kind = "OWS" if "WFS" in known_types else "WMS"
                try:
                    _process_wms_service(
                        known_types["WMS"], kind, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["WMS"], str(e)))
            elif "REST" in known_types:
                try:
                    # use the recorded ESRI URL rather than the URL of the
                    # last reference that was looped over
                    _register_arcgis_url(
                        known_types["REST"], None, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["REST"], str(e)))

        startposition = src.results['nextrecord']
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""

    # owslib CSW client; created on demand by setup_csw()
    csw = None

    def __init__(self, url, **kwargs):
        """Store the endpoint URL and the optional bookkeeping settings.

        Keyword arguments read: ``frequency`` (default 'manual'), ``title``
        (default 'No Title Was Given') and ``node_admin_id`` (default None).
        """
        # A URL must be given
        p = urlparse(url)
        # Strip URL to just domain + path
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", ""))
        # frequency should be one of manual|daily|weekly|monthly
        self.frequency = kwargs.get('frequency', 'manual')
        # A title for bookkeeping
        self.title = kwargs.get('title', 'No Title Was Given')
        # Foreign Key to a responsible_party who maintains the remote node
        self.node_admin_id = kwargs.get('node_admin_id', None)

    def setup_csw(self):
        """Create the owslib client used for all CSW requests."""
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw is None:
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        # list() so the ids can be extended on Python 2 and Python 3 alike
        ids = list(self.csw.records.keys())
        print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                       self.csw.results["matches"]))
        # Once next_record > number_matched, we've gotten everything.  The
        # comparison is inclusive: with "<" the final record was skipped
        # whenever nextrecord was equal to matches.
        while (self.csw.results["nextrecord"] <= self.csw.results["matches"]
               and self.csw.results["nextrecord"] != 0):
            # Get another set, starting from next_record of the last response
            self.get_records(self.csw.results["nextrecord"],
                             self.csw.results["returned"])
            ids.extend(self.csw.records.keys())  # Add new ids to the array
            print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                           self.csw.results["matches"]))
        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print("Gathered %s IDs" % str(len(ids)))
        for record_id in ids:
            self.get_record_by_id(record_id)
            # the returned object was never used; only the call is kept
            HarvestedRecord.from_md_metadata(self.csw.records[record_id], self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd"
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
def __init__(self, community, url, schema, fromdate, clean, limit, outdir, verify):
    """Initialise the base harvester, then open the CSW connection.

    Warnings are redirected to the logging system before the client is
    created; ``verify`` is forwarded to the client's authentication
    settings.  The schema type is stored, while the schema and constraints
    start out as None.
    """
    super().__init__(community, url, fromdate, clean, limit, outdir, verify)
    logging.captureWarnings(True)

    tls_settings = Authentication(verify=self.verify)
    self.csw = CatalogueServiceWeb(self.url, auth=tls_settings)

    self._schema_type = schema
    self._schema = None
    self._constraints = None
def __init__(self, url, username=None, password=None):
    """Keep the connection details and create the remote CSW client.

    The client is created with ``skip_caps=True``, a one hour timeout, the
    'fr-FR' language and CSW version 2.0.2.  Any error raised while
    creating it is replaced by ``CswReadError``.
    """
    self.url = url
    self.username = username
    self.password = password

    client_options = {
        'timeout': 3600,
        'lang': 'fr-FR',
        'version': '2.0.2',
        'skip_caps': True,
        'username': self.username,
        'password': self.password,
    }
    try:
        self.remote = CatalogueServiceWeb(self.url, **client_options)
    except Exception:
        raise CswReadError()
def test_GetRecords(self):
    """Skipped: would fetch five GMD records and check their type."""
    # NB: no records have been set up, so this test would fail ...
    raise SkipTest()  # ... therefore skip

    # unreachable because of the skip above
    client = CatalogueServiceWeb(service)
    client.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
    record_count = len(client.records)
    assert record_count == 5, record_count
    for ident in client.records:
        identifiers.append(ident)
        record = client.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
def __init__(self, config):
    """Read the CSW settings from ``config`` and create the client.

    Options are read from the section named after this class: ``csw_url``
    is required, ``csw_usr`` and ``csw_pwd`` are stored as None when the
    option is absent.  The client is created with ``skip_caps=True``.
    """
    section = self.__class__.__name__

    self._csw_url = config.get(section, 'csw_url')

    if config.has_option(section, 'csw_usr'):
        self._csw_usr = config.get(section, 'csw_usr')
    else:
        self._csw_usr = None

    if config.has_option(section, 'csw_pwd'):
        self._csw_pwd = config.get(section, 'csw_pwd')
    else:
        self._csw_pwd = None

    from owslib.csw import CatalogueServiceWeb
    self._csw = CatalogueServiceWeb(url=self._csw_url, skip_caps=True)
def __init__(self):
    """Create the client for the GeoNetwork CSW publication endpoint.

    Credentials, base URL and timeout come from the GEONETWORK_* constants;
    the client is created with ``skip_caps=True``.
    """
    self.username = GEONETWORK_LOGIN
    self.password = GEONETWORK_PASSWORD

    publication_url = urljoin(GEONETWORK_URL, 'srv/fre/csw-publication')
    self.remote = CatalogueServiceWeb(
        publication_url,
        timeout=GEONETWORK_TIMEOUT,
        lang='fr-FR',
        version='2.0.2',
        skip_caps=True,
        username=self.username,
        password=self.password)
def __init__(self, csw_url=None, timeout=None, debug=False):
    '''
    Constructor for CSWUtils class

    @param csw_url: URL of the CSW service; a false value selects
        CSWUtils.DEFAULT_CSW_URL
    @param timeout: timeout passed to the CSW client; a false value selects
        CSWUtils.DEFAULT_TIMEOUT
    @param debug: stored unchanged on the instance
    '''
    if not csw_url:
        csw_url = CSWUtils.DEFAULT_CSW_URL
    if not timeout:
        timeout = CSWUtils.DEFAULT_TIMEOUT

    self.debug = debug
    self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the
    datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search.
    - CSWURL - A base URL for the CSW server to be searched.
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy].
      NOTE: this argument is accepted but not applied to the query.

    Returns
    -------
    A list of ``[title, abstract, urls]`` entries.  The first entry is a
    header row (``['title', 'abstract', ['urls']]``).  URLs containing
    "/dodsC/" have every "http" replaced by "dods".
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)
    # Works with owslib version 0.8.6.
    csw.getrecords(keywords=[anyText],
                   outputschema='http://www.isotc211.org/2005/gmd',
                   esn='full', maxrecords=100)

    dataSetURIs = [['title', 'abstract', ['urls']]]
    for record in csw.records.values():
        title = record.identification.title
        abstract = record.identification.abstract
        urls = []

        # online resources of the distribution section, when present
        try:
            for online in record.distribution.online:
                urls.append(online.url)
        except AttributeError:
            pass

        # connect points of the service operations, when present
        for ident in record.identificationinfo:
            try:
                for operation in ident.operations:
                    # read the current operation; the original always read
                    # operations[0] and so repeated the first URL
                    urls.append(operation['connectpoint'][0].url)
            except AttributeError:
                pass

        dataSetURIs.append([title, abstract, urls])

    for dataset in dataSetURIs:
        dataset[2] = [
            uri.replace("http", "dods") if "/dodsC/" in uri else uri
            for uri in dataset[2]
        ]
    return dataSetURIs
def fetch(self, clean_url=True, timeout=120):
    """Connect to the CSW source and read its information.

    The cleaned URL is used unless ``clean_url`` is false.  A connection
    error is appended to ``self.errors``, logged and re-raised.
    """
    if clean_url:
        url = self.get_cleaned_url()
    else:
        url = self.url

    # connect to csw source
    try:
        self.csw = CatalogueServiceWeb(url, timeout=timeout)
    except Exception as exc:
        error = f'Error connection CSW: {exc}'
        self.errors.append(error)
        logger.error(error)
        raise

    self.read_csw_info()
def check_for_property(self, p):
    """
    Check that the specified dataset is available for the current URL.
    Returns boolean - false if the dataset is not found

    | set service url | ${CSWSERVICE_URL} |
    | ${dataset_exists} | Check for property | free parking |
    | Should be True | ${dataset_exists} |
    """
    csw = CatalogueServiceWeb(self._url)
    csw.getdomain('Title', dtype='property')
    # Read the domain values by key: ``results.values()[0]`` depended on
    # dictionary ordering and is not subscriptable on Python 3.
    return p in csw.results['values']
def test_GetRecords(self):
    """Skipped: would fetch five GMD records and check their type."""
    # NB: no records have been set up, so this test would fail ...
    raise SkipTest()  # ... therefore skip

    # unreachable because of the skip above
    client = CatalogueServiceWeb(service)
    client.getrecords2(outputschema=GMD, startposition=1, maxrecords=5)
    record_count = len(client.records)
    assert record_count == 5, record_count
    for ident in client.records:
        identifiers.append(ident)
        record = client.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
def show_metadata(self):
    """Fetch the selected record again and show its metadata in a dialog."""
    selected = self.treeRecords.selectedItems()
    if not selected:
        return

    current = self.treeRecords.currentItem()
    if not current:
        return

    identifier = get_item_data(current, 'identifier')

    self.disable_ssl_verification = self.disableSSLVerification.isChecked()
    auth = None
    if self.disable_ssl_verification:
        try:
            auth = Authentication(verify=False)
        except NameError:
            # Authentication is not defined; carry on without it
            pass

    try:
        with OverrideCursor(Qt.WaitCursor):
            cat = CatalogueServiceWeb(
                self.catalog_url,
                timeout=self.timeout,  # spellok
                username=self.catalog_username,
                password=self.catalog_password,
                auth=auth)
            record_id = self.catalog.records[identifier].identifier
            cat.getrecordbyid([record_id])
    except ExceptionReport as err:
        QMessageBox.warning(
            self,
            self.tr('GetRecords error'),
            self.tr('Error getting response: {0}').format(err))
        return
    except KeyError:
        QMessageBox.warning(
            self,
            self.tr('Record parsing error'),
            self.tr('Unable to locate record identifier'))
        return

    record = cat.records[identifier]
    record.xml_url = cat.request

    dialog = RecordDialog()
    html = render_template('en', self.context, record,
                           'record_metadata_dc.html')
    stylesheet = QgsApplication.reportStyleSheet()
    dialog.textMetadata.document().setDefaultStyleSheet(stylesheet)
    dialog.textMetadata.setHtml(html)
    dialog.exec_()
def csw_ajax(request, *args, **kwargs):
    """Return one page of CSW search results as JSON.

    The CSW URL, credentials and keywords are read from the session; the
    page offset and size come from the ``offset`` and ``perPage`` GET
    parameters.  A server error response is returned for non-GET requests,
    a missing CSW URL, a service that is not of type 'CSW' and for any
    exception raised while querying (which is logged).
    """
    if request.method != 'GET':
        return HttpResponseServerError()

    session = request.session
    csw_url = session['csw_url']
    user = session['user']
    password = session['password']
    keywords = session['keywords']
    keywords_query = [
        fes.PropertyIsLike('csw:AnyText', '%%%s%%' % keywords)]

    if not csw_url:
        return HttpResponseServerError()

    try:
        csw = CatalogueServiceWeb(csw_url, username=user, password=password)
        if csw.identification.type != 'CSW':
            return HttpResponseServerError()

        offset = int(request.GET['offset'])
        per_page = int(request.GET['perPage'])
        csw.getrecords2(
            typenames='gmd:MD_Metadata',
            esn='full',
            outputschema='http://www.isotc211.org/2005/gmd',
            constraints=keywords_query,
            startposition=offset,
            maxrecords=per_page)

        found = []
        for key in csw.records:
            rec = csw.records[key]
            if not isinstance(rec, MD_Metadata):
                continue
            entry = {}
            entry['id'] = rec.identifier
            entry['title'] = rec.identification.title
            entry['inasafe_keywords'] = \
                rec.identification.supplementalinformation
            if entry['inasafe_keywords']:
                entry['inasafe_layer'] = (
                    '<inasafe_keywords/>' in entry['inasafe_keywords'])
            found.append(entry)

        payload = {
            'records': found,
            'queryRecordCount': csw.results['matches'],
            'totalRecordCount': csw.results['matches']
        }
        return JsonResponse(payload, safe=False)
    except Exception as e:
        LOGGER.exception(e)
        return HttpResponseServerError()
def test_GetRecordById(self):
    """Fetch two known records by id, then an id that does not exist."""
    client = CatalogueServiceWeb(service)

    wanted = identifiers[:2]
    client.getrecordbyid(wanted, outputschema=GMD)
    found = len(client.records)
    assert found == len(wanted), found
    for ident in client.records:
        identifiers.append(ident)
        record = client.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)

    # an unknown identifier must yield no record
    client.getrecordbyid(["nonexistent"], outputschema=GMD)
    found = len(client.records)
    assert found == 0, found
def mdsearch(self, cswurl, esn="summary", constraints=None, startrecord=0,
             maxrecords=10, maxharvest=20):
    """Queries a csw to retrieve md ids matching constraints

    Records are requested page by page (``maxrecords`` per request, in the
    ``self.OUTPUTSCHEMA`` output schema) until the server reports no further
    record or ``maxharvest`` is reached.

    Returns a dict mapping record identifiers to the parsed records.
    """
    # a fresh list per call instead of a shared mutable default argument
    if constraints is None:
        constraints = []
    tstart = datetime.datetime.now()
    records = {}

    logging.info("searching max %s md from %s", maxharvest, cswurl)
    csw = CatalogueServiceWeb(cswurl, skip_caps=True)
    nextrecord = startrecord
    count = 0

    while True:
        if count + maxrecords > maxharvest:
            maxrecords = maxharvest - count  # retrieve exactly maxharvest md

        csw.getrecords2(
            esn=esn,
            constraints=constraints,
            startposition=nextrecord,
            maxrecords=maxrecords,
            outputschema=self.OUTPUTSCHEMA,
        )
        if csw.results["matches"] == 0:
            logging.info("0 md found from %s", cswurl)
            break

        # fetch records
        for rec_id, rec in csw.records.items():
            count += 1
            percent = int(
                float(count) / min(maxharvest, csw.results["matches"]) * 100)
            logging.debug("%s%% %s", percent, rec_id)
            records[rec_id] = rec

        # get out if no records or beyond maxrecords
        if (
            csw.results["nextrecord"] == 0
            or csw.results["returned"] == 0
            or csw.results["nextrecord"] > csw.results["matches"]
            or csw.results["nextrecord"] > maxharvest
        ):
            d = (datetime.datetime.now() - tstart).total_seconds()
            logging.info("%s md found from %s in %d s", count, cswurl, d)
            break

        # the next request continues where the server says it should
        nextrecord = csw.results["nextrecord"]

    return records
def test_bbox(endpoints, bbox):
    """Print, per endpoint, how many records a bounding-box query returns.

    ``endpoints`` maps a title to a CSW URL.  Endpoints that do not list
    "BBOX" among their spatial operators are reported as unsupported; a
    failing query and any other error are reported separately.
    """
    for title, url in endpoints.iteritems():
        try:
            client = CatalogueServiceWeb(url, timeout=40)
            if "BBOX" not in client.filters.spatial_operators:
                print("%s - BBOX Query NOT supported" % title)
                continue

            bbox_filter = [fes.BBox(bbox)]
            try:
                client.getrecords2(constraints=bbox_filter, maxrecords=1000)
                print("%s : Datasets = %d" % (title, len(client.records.keys())))
            except Exception:
                print("%s : BBOX Query FAILS" % title)
        except Exception:
            print("%s - Timed out" % title)
def test_csw_skgeodsy():
    """Check the advertised operations and a basic GetRecords response."""
    client = CatalogueServiceWeb(SERVICE_URL)

    operation_names = sorted(op.name for op in client.operations)
    assert operation_names == [
        'DescribeRecord',
        'GetCapabilities',
        'GetRecordById',
        'GetRecords',
        'Transaction'
    ]

    get_records = client.get_operation_by_name('GetRecords')
    assert get_records.name == 'GetRecords'

    client.getrecords2(typenames='csw:Record gmd:MD_Metadata')
    # all three counters of the response must be positive
    for counter in ('returned', 'nextrecord', 'matches'):
        assert client.results.get(counter) > 0
def test_default_csw_connections():
    """test that the default CSW connections work

    Every ``csw`` entry of the plugin's ``connections-default.xml`` is
    connected to and queried with GetRecords.

    Raises ValueError on the first connection that fails.
    """
    relpath = 'resources%sconnections-default.xml' % os.sep
    csw_connections_xml = options.base.plugin / relpath

    conns = etree.parse(csw_connections_xml)

    for conn in conns.findall('csw'):
        try:
            csw = CatalogueServiceWeb(conn.attrib.get('url'))
            info('Success: %s', csw.identification.title)
            csw.getrecords2()
        except Exception as err:
            # exceptions do not interpolate extra arguments; format the
            # message explicitly so the error text is readable
            raise ValueError('ERROR: %s' % err)
def test_bbox(endpoints, bbox):
    """Run a bounding-box query against every endpoint and print the outcome.

    ``endpoints`` maps a title to a CSW URL.  One line is printed per
    endpoint: the number of datasets returned, a query failure, missing
    BBOX support, or a catch-all message for any other error.
    """
    for title, url in endpoints.iteritems():
        try:
            catalogue = CatalogueServiceWeb(url, timeout=40)
            supports_bbox = "BBOX" in catalogue.filters.spatial_operators
            if supports_bbox:
                constraints = [fes.BBox(bbox)]
                try:
                    catalogue.getrecords2(constraints=constraints,
                                          maxrecords=1000)
                    dataset_count = len(catalogue.records.keys())
                    print("%s : Datasets = %d" % (title, dataset_count))
                except Exception:
                    print("%s : BBOX Query FAILS" % title)
            else:
                print("%s - BBOX Query NOT supported" % title)
        except Exception:
            print("%s - Timed out" % title)
def get_csw_record_by_id(self, csw_url, identifier):
    '''
    Function to return OWSLib CSW record record from specified CSW URL using
    UUID as the search criterion

    @param csw_url: URL of the CSW service to query
    @param identifier: record identifier passed to GetRecordById
    @return: the single record found
    Raises AssertionError when the service is not of type 'CSW' or when the
    number of records found is not exactly one.
    '''
    csw = CatalogueServiceWeb(csw_url)
    assert csw.identification.type == 'CSW', \
        '%s is not a valid CSW service' % csw_url

    csw.getrecordbyid(id=[identifier], esn='full', outputschema='own')

    # Ensure there is exactly one record found
    assert len(csw.records) > 0, \
        'No CSW records found for ID "%s"' % identifier
    assert len(csw.records) == 1, \
        'Multiple CSW records found for ID "%s"' % identifier

    # values() is not subscriptable on Python 3; take the first item through
    # an iterator, which works on Python 2 and Python 3
    return next(iter(csw.records.values()))
def download_csw(db, harvest, csw_url, dest):
    '''
    Download every record of a CSW endpoint into a folder.

    Records stored by a previous run of the same harvest are removed first.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url csw_url: URL to the CSW endpoint
    :param str dest: Folder to download to (created when missing)
    :return: tuple ``(count, errors)`` - records handled and records whose
             parsing was not successful
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    client = CatalogueServiceWeb(csw_url)

    # remove any records from past run
    db.Records.remove({"harvest_id": harvest['_id']})

    count = 0
    errors = 0
    for page in get_records(client):
        for name, raw_rec in page.records.items():
            parsed_ok = parse_csw_record(db, harvest, csw_url, dest, name,
                                         raw_rec)
            count += 1
            if not parsed_ok:
                errors += 1

    return count, errors
def csw_query_metadata_by_id(csw_url, identifier, username=None, password=None):
    """Look up a GMD record by its ``dc:identifier``.

    Returns the last record of the response, or None when the service is
    not of type 'CSW' or nothing matches.
    """
    client = CatalogueServiceWeb(
        csw_url, username=username, password=password)
    if client.identification.type != 'CSW':
        return None

    id_filter = [fes.PropertyIsEqualTo('dc:identifier', identifier)]
    client.getrecords2(
        typenames='gmd:MD_Metadata',
        esn='full',
        outputschema='http://www.isotc211.org/2005/gmd',
        constraints=id_filter)

    match = None
    for key in client.records:
        match = client.records[key]
    return match
def generate_csw_connections_file():
    """generate a CSW connections file from a flat file of CSW URLs

    The path of the input file is read from the ``filename`` option; it
    holds one CSW URL per line (blank lines are ignored).  Every service
    that can be reached is written, with its title, to ``<filename>.xml``.
    Unreachable services are reported and skipped.

    Raises ValueError when the ``filename`` option is missing.
    """
    filename = options.get('filename', False)

    if not filename:
        raise ValueError('path to file of CSW URLs required')

    conns = etree.Element('qgsCSWConnections')
    conns.attrib['version'] = '1.0'

    with open(filename) as connsfh:
        for line in connsfh:
            url = line.strip()
            if not url:  # blank line
                continue
            try:
                csw = CatalogueServiceWeb(url)
                title = str(csw.identification.title)
                etree.SubElement(conns, 'csw', name=title, url=url)
            except Exception as err:
                error('ERROR on CSW %s: %s', url, err)

    # tostring() with an explicit encoding yields encoded bytes, so the
    # output file has to be opened in binary mode
    with open('%s.xml' % filename, 'wb') as connsxmlfh:
        connsxmlfh.write(etree.tostring(conns, encoding='utf-8'))
def run(self, csw=None, offset=0):
    """Crawl one page of a CSW catalogue and queue the follow-up work.

    A dataset crawler is queued for every record of the page.  When the
    server announces a further page, a catalogue crawler is queued for it,
    re-using the same client.

    :param csw: an existing CSW client; created from the configured
        ``endpoint`` when None
    :param offset: start position of the page to request
    """
    if csw is None:
        endpoint = self.config['endpoint']
        csw = CatalogueServiceWeb(endpoint)

    outputschema = self.config.get('outputschema', csw_ns['gmd'])
    typenames = self.config.get('typenames', 'csw:dataset')
    csw.getrecords(esn="full", outputschema=outputschema,
                   typenames=typenames, startposition=offset,
                   maxrecords=PAGE_SIZE)

    for record in csw.records.values():
        self.queue(CSWDatasetCrawler, record=record)

    next_record = csw.results.get('nextrecord')
    matches = csw.results.get('matches')
    # A "nextrecord" of 0 (or a missing value) means there is no further
    # page.  Comparing it with "<=" alone queued the crawl again from
    # offset 0 once the last page had been reached.
    if next_record and matches and next_record <= matches:
        print("OFFSET %s" % offset)
        self.queue(CSWCatalogCrawler, csw=csw, offset=next_record)
def test_good_post(self):
    """The catalogue reports a title and advertises the core record operations."""
    catalogue = CatalogueServiceWeb(service)

    title = catalogue.identification.title
    assert title, title

    # operation name -> methods, as advertised in the capabilities document
    advertised = {operation.name: operation.methods
                  for operation in catalogue.operations}
    for required in ("GetCapabilities", "GetRecords", "GetRecordById"):
        assert required in advertised
def _get_csw(self):
    """connect to the configured catalogue and store the client on self.catalog"""  # spellok

    # Returns True once the connection is established.  On failure a warning
    # dialog describing the error is shown and False is returned.
    self.disable_ssl_verification = self.disableSSLVerification.isChecked()

    auth = None
    if self.disable_ssl_verification:
        try:
            auth = Authentication(verify=False)
        except NameError:
            # Authentication is not available in this environment;
            # fall back to connecting without it
            pass

    failure = None

    # connect to the server
    with OverrideCursor(Qt.WaitCursor):
        try:
            self.catalog = CatalogueServiceWeb(self.catalog_url,  # spellok
                                               timeout=self.timeout,
                                               username=self.catalog_username,
                                               password=self.catalog_password,
                                               auth=auth)
        except ExceptionReport as err:
            failure = self.tr('Error connecting to service: {0}').format(err)
        except ValueError as err:
            failure = self.tr('Value Error: {0}').format(err)
        except Exception as err:
            failure = self.tr('Unknown Error: {0}').format(err)
        else:
            return True

    QMessageBox.warning(self, self.tr('CSW Connection error'), failure)
    return False
def __init__(self, url, schema, version='2.0.2', lang='en-US'):
    """Create the CSW client and an empty metadata mapping.

    :param url: catalogue endpoint URL
    :param schema: stored unchanged on ``self.schema``
    :param version: CSW version handed to the client
    :param lang: language code handed to the client
    """
    self.schema = schema
    # capabilities are not fetched at construction time (skip_caps=True)
    self.catalog = CatalogueServiceWeb(url, lang, version,
                                       timeout=10, skip_caps=True)

    field_names = (
        'id', 'name', 'title', 'url', 'author', 'maintainer',
        'maintainer_email', 'license_url', 'version', 'service_url',
        'service_type', 'notes', 'tags', 'metadata_url', 'metadata_raw',
    )
    # every known field starts out as None
    self.metadata = {field: None for field in field_names}
def show_metadata(self):
    """show record metadata

    Fetches the record selected in the tree from the catalogue with a
    GetRecordById request and displays it, rendered through the
    ``record_metadata_dc.html`` template, in a ``RecordDialog``.  Does nothing
    when no item is selected; shows a warning dialog and returns when the
    request fails or the identifier is unknown.
    """
    if not self.treeRecords.selectedItems():
        return

    item = self.treeRecords.currentItem()
    if not item:
        return

    identifier = get_item_data(item, 'identifier')

    failure = None  # (dialog title, dialog text) when the lookup fails
    QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
    try:
        cat = CatalogueServiceWeb(self.catalog_url, timeout=self.timeout,
                                  username=self.catalog_username,
                                  password=self.catalog_password)
        cat.getrecordbyid(
            [self.catalog.records[identifier].identifier])
    except ExceptionReport as err:
        failure = (self.tr('GetRecords error'),
                   self.tr('Error getting response: {0}').format(err))
    except KeyError:
        failure = (self.tr('Record parsing error'),
                   self.tr('Unable to locate record identifier'))
    finally:
        # always undo the wait cursor, including when an unexpected exception
        # (which still propagates) interrupts the request
        QApplication.restoreOverrideCursor()

    if failure is not None:
        # shown only after the cursor is back to normal
        QMessageBox.warning(self, failure[0], failure[1])
        return

    record = cat.records[identifier]
    record.xml_url = cat.request

    crd = RecordDialog()
    metadata = render_template('en', self.context,
                               record, 'record_metadata_dc.html')

    style = QgsApplication.reportStyleSheet()
    crd.textMetadata.document().setDefaultStyleSheet(style)
    crd.textMetadata.setHtml(metadata)
    crd.exec_()
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search as the
      single keyword.
    - CSWURL - A base URL for the CSW server to be searched.
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy].  It is accepted
      for interface compatibility but is not used by this function: the
      request is not spatially filtered here.

    Returns
    -------
    A list of ``[title, abstract, urls]`` entries, at most 100 records, whose
    first element is the header row ``['title', 'abstract', ['urls']]``.
    ``urls`` holds the record's online distribution URLs followed by the
    connect-point URL of each service operation.  In URLs containing
    ``/dodsC/`` every ``http`` is replaced by ``dods``.
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)
    #Works with owslib version 0.8.6.
    csw.getrecords(keywords=[anyText],
                   outputschema='http://www.isotc211.org/2005/gmd',
                   esn='full', maxrecords=100)

    dataSetURIs = [['title', 'abstract', ['urls']]]
    for record in csw.records.values():
        title = record.identification.title
        abstract = record.identification.abstract

        urls = []
        try:
            for online in record.distribution.online:
                urls.append(online.url)
        except AttributeError:
            # record has no distribution section; nothing to collect here
            pass

        for info in record.identificationinfo:
            try:
                # take each operation's own connect point (the loop used to
                # read operations[0] on every pass, duplicating the first URL)
                for operation in info.operations:
                    urls.append(operation['connectpoint'][0].url)
            except AttributeError:
                # not a service identification, or no usable connect point
                pass

        urls = [uri.replace("http", "dods") if "/dodsC/" in uri else uri
                for uri in urls]
        dataSetURIs.append([title, abstract, urls])

    return dataSetURIs
def __init__(self, *args, **kwargs):
    """Configure the catalogue client from keyword settings.

    Required keywords: ``URL`` and ``ENGINE`` (the last dotted component of
    ``ENGINE`` becomes ``self.type``).  Optional keywords: ``USER``,
    ``PASSWORD`` and ``skip_caps`` (defaults to ``True``).  Positional
    arguments are accepted but not used.
    """
    self.url = kwargs['URL']
    self.user = self.password = None
    self.type = kwargs['ENGINE'].rpartition('.')[2]
    self.local = False
    self._group_ids, self._operation_ids = {}, {}
    self.connected = False

    CatalogueServiceWeb.__init__(self, url=self.url,
                                 skip_caps=kwargs.get('skip_caps', True))

    # the base URL is derived after the parent initialiser has run
    parsed = urlparse(self.url)
    self.base = '%s://%s/' % (parsed.scheme, parsed.netloc)

    # User and Password are optional; assigned last so they are the final
    # values on the instance
    self.user = kwargs.get('USER', self.user)
    self.password = kwargs.get('PASSWORD', self.password)
def __init__(self, *args, **kwargs):
    """Set up the catalogue client from a settings-style keyword mapping.

    ``URL`` and ``ENGINE`` are mandatory; ``skip_caps`` (default ``True``),
    ``USER`` and ``PASSWORD`` are optional.  ``self.type`` is the final
    dotted component of ``ENGINE`` and ``self.base`` is the scheme and host
    of the URL.  Positional arguments are accepted but not used.
    """
    self.url = kwargs["URL"]
    self.user = None
    self.password = None
    engine_parts = kwargs["ENGINE"].split(".")
    self.type = engine_parts[-1]
    self.local = False
    self._group_ids = {}
    self._operation_ids = {}
    self.connected = False

    skip_caps = kwargs["skip_caps"] if "skip_caps" in kwargs else True
    CatalogueServiceWeb.__init__(self, url=self.url, skip_caps=skip_caps)

    location = urlparse(self.url)
    self.base = "%s://%s/" % (location.scheme, location.netloc)

    # User and Password are optional
    for setting, attribute in (("USER", "user"), ("PASSWORD", "password")):
        if setting in kwargs:
            setattr(self, attribute, kwargs[setting])
def get_uuid_from_title(csw_url, title):
    '''
    Function to return the UUID (record identifier) of the record on the
    specified CSW URL whose title matches ``title``.

    The catalogue is queried with the title (underscores replaced by ``%``),
    shortening the search string by one character, up to ten times, while no
    records come back.  The returned records are then compared on their
    alphanumeric characters only, shortening the comparison string until at
    least one record matches.

    Sample UUID: 221dcfd8-03d7-5083-e053-10a3070a64e3

    @param csw_url: URL of the CSW service
    @param title: title to search for
    @return: the identifier of the unique match, or None when nothing was
             found or several records match equally well
    @raise AssertionError: if the endpoint does not identify itself as CSW
    '''
    MAXRECORDS = 200

    uuid = None
    csw = CatalogueServiceWeb(csw_url)
    # Raised explicitly rather than through an ``assert`` statement so the
    # check still runs under ``python -O``; the exception type is unchanged.
    if csw.identification.type != 'CSW':
        raise AssertionError('%s is not a valid CSW service' % csw_url)

    search_title = title.replace('_', '%')
    while search_title and len(title) - len(search_title) < 10 and not uuid:
        # NOTE(review): the value carries '%' wildcards but the filter is
        # PropertyIsEqualTo -- confirm the server treats it as a pattern.
        title_query = PropertyIsEqualTo(
            'csw:Title', '%' + search_title + '%')
        csw.getrecords2(
            constraints=[title_query], esn='summary', maxrecords=MAXRECORDS)

        if not csw.records:  # No records found
            # Broaden search by shortening title
            search_title = search_title[0:-1]
            continue

        # Strip all non-alphanumeric characters from the titles once per query
        record_titles = dict(
            (identifier, re.sub(r'\W', '', record.title or ''))
            for identifier, record in csw.records.items())
        alphanumeric_title = re.sub(r'\W', '', title)

        uuid_list = []
        while not uuid_list:
            uuid_list = [identifier
                         for identifier, record_title in record_titles.items()
                         if alphanumeric_title in record_title]
            if not uuid_list:
                # Broaden search by shortening munged title; the empty string
                # matches every record, so this loop always terminates
                alphanumeric_title = alphanumeric_title[0:-1]

        if len(uuid_list) == 1:  # Unique match found
            uuid = uuid_list[0]
            logger.info('UUID %s found from title characters', uuid)
        else:
            # Several candidates: repeating the same query cannot narrow them
            # down (this used to loop forever), so give up and return None
            logger.warning('%d records match title %s; no unique UUID found',
                           len(uuid_list), title)
            break

    return uuid
def getResource(endpoint='http://www.nodc.noaa.gov/geoportal/csw', bbox=None, keywords=None, maxrecords=1, service_type='opendap', verbose=None):
    """Query a CSW endpoint and collect service URLs of the requested type.

    :param endpoint: URL of the CSW service to query
    :param bbox: bounding box passed through to ``getrecords``
    :param keywords: keywords passed through to ``getrecords``; omitted from
                     the request when ``None``
    :param maxrecords: maximum number of records to request
    :param service_type: ``'opendap'`` or ``'wms'``
    :param verbose: any value other than ``None`` prints the first reference
                    URL of each record and the query parameters
    :return: dict keyed by record identifier; each value maps record title to
             the first reference URL whose scheme matches ``service_type``
    :raises ValueError: if ``service_type`` is not supported
    """
    service_schemes = {
        'opendap': 'urn:x-esri:specification:ServiceType:OPeNDAP',
        'wms': 'urn:x-esri:specification:ServiceType:WMS',
    }
    # validate before any network access; an unknown type used to surface
    # later as an unbound local variable
    if service_type not in service_schemes:
        raise ValueError('unsupported service_type %r; expected one of: %s'
                         % (service_type, ', '.join(sorted(service_schemes))))
    service_string = service_schemes[service_type]

    csw = CatalogueServiceWeb(endpoint, timeout=30)
    if keywords is not None:
        csw.getrecords(keywords=keywords, bbox=bbox, maxrecords=maxrecords)
    else:
        csw.getrecords(bbox=bbox, maxrecords=maxrecords)

    # title -> URL for every record that offers the requested service
    resource = {}
    for rec in csw.records.values():
        url = next((d['url'] for d in rec.references
                    if d['scheme'] == service_string), None)
        if verbose is not None and rec.references:
            print(rec.references[0]['url'])
        if url is not None:
            resource[rec.title] = url

    # NOTE(review): every identifier receives its own copy of the same
    # combined mapping, exactly as before; if a per-record mapping was
    # intended, confirm with the callers before changing the result shape.
    result = dict((identifier, dict(resource)) for identifier in csw.records)

    if verbose is not None:
        print('endpoint: %s\nbbox: %s\nkeywords: %s\nmaxrecords: %s\nservice_type: %s'
              % (endpoint, bbox, keywords, maxrecords, service_type))
    return result
def __init__(self, csw_url=None, timeout=None, debug=False):
    '''
    Constructor for CSWUtils class: opens the CSW client used by the instance

    @param csw_url: URL for CSW service. A false value (None, empty string)
                    selects CSWUtils.DEFAULT_CSW_URL
    @param timeout: Timeout in seconds. A false value (None, 0) selects
                    CSWUtils.DEFAULT_TIMEOUT
    @param debug: stored unchanged on self.debug
    '''
    if not csw_url:
        csw_url = CSWUtils.DEFAULT_CSW_URL
    if not timeout:
        timeout = CSWUtils.DEFAULT_TIMEOUT

    self.debug = debug
    self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)
# # Query `apiso:ServiceType` on pycsw endpoint # In[1]: from owslib.csw import CatalogueServiceWeb from owslib import fes import numpy as np # In[2]: #endpoint = 'http://geoport.whoi.edu/csw' #endpoint = 'http://data.nodc.noaa.gov/geoportal/csw' endpoint = 'http://catalog.data.gov/csw-all' #endpoint = 'http://data.doi.gov/csw' csw = CatalogueServiceWeb(endpoint,timeout=60) print csw.version # In[3]: csw.get_operation_by_name('GetRecords').constraints # In[4]: try: csw.get_operation_by_name('GetDomain') csw.getdomain('apiso:ServiceType', 'property') print(csw.results['values']) except:
# <markdowncell>

# #### Query each CSW catalog for every model_name_filter constructed above

# <codecell>

from owslib.csw import CatalogueServiceWeb

# One entry per (model, catalogue, reference) combination found:
# dict(model=..., scheme=..., url=..., server=...)
model_results = []

for model_name, single_model_filter in zip(known_model_strings, model_name_filters):
    for url in known_csw_servers:
        try:
            csw = CatalogueServiceWeb(url, timeout=20)
            csw.getrecords2(constraints=[single_model_filter], maxrecords=1000, esn='full')
            for item in csw.records.values():
                for d in item.references:
                    result = dict(model=model_name,
                                  scheme=d['scheme'],
                                  url=d['url'],
                                  server=url)
                    model_results.append(result)
        except Exception as e:
            # Best effort: report the failing catalogue and carry on with the
            # next one.  Exception (not BaseException) lets Ctrl-C interrupt
            # the cell, and the exception itself is formatted because most
            # exceptions have no ``msg`` attribute.
            print("- FAILED: %s - %s" % (url, e))

# <markdowncell>

# <div class="error"><strong>Paginating CSW Records</strong> - Some servers have a maximum amount of records you can retrieve at once. See: https://github.com/ioos/system-test/issues/126</div>