def test_GetRecords_summary(self):
    """Issue a GetRecords request with esn='summary' and count the hits."""
    client = CatalogueServiceWeb(service)
    client.getrecords(esn="summary", outputschema=GMD,
                      startposition=1, maxrecords=5)
    nrecords = len(client.records)
def test_GetRecords_dataset(self):
    """Issue a GetRecords request restricted to qtype='dataset'."""
    client = CatalogueServiceWeb(service)
    client.getrecords(qtype="dataset", outputschema=GMD,
                      startposition=1, maxrecords=5)
    nrecords = len(client.records)
def test_GetRecords(self):
    """Fetch five ISO records and check each one parsed to MD_Metadata."""
    client = CatalogueServiceWeb(service)
    client.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
    nrecords = len(client.records)
    assert nrecords == 5, nrecords
    for ident in client.records:
        identifiers.append(ident)
        assert isinstance(client.records[ident], MD_Metadata), \
            (ident, client.records[ident])
def _harvest_csw(csw, maxrecords=10, totalrecords=float('inf')): """ Step through CSW results, and if one seems to be a WMS or Arc REST service then register it """ stop = 0 flag = 0 src = CatalogueServiceWeb(csw.base_url) while stop == 0: if flag == 0: # first run, start from 0 startposition = 0 else: # subsequent run, startposition is now paged startposition = src.results['nextrecord'] src.getrecords( esn='summary', startposition=startposition, maxrecords=maxrecords) max = min(src.results['matches'], totalrecords) if src.results['nextrecord'] == 0 \ or src.results['returned'] == 0 \ or src.results['nextrecord'] > max: # end the loop, exhausted all records or max records to process stop = 1 break # harvest each record to destination CSW for record in list(src.records): record = src.records[record] known_types = {} for ref in record.references: if ref["scheme"] == "OGC:WMS" or \ "service=wms&request=getcapabilities" in urllib.unquote(ref["url"]).lower(): print "WMS:%s" % ref["url"] known_types["WMS"] = ref["url"] if ref["scheme"] == "OGC:WFS" or \ "service=wfs&request=getcapabilities" in urllib.unquote(ref["url"]).lower(): print "WFS:%s" % ref["url"] known_types["WFS"] = ref["url"] if ref["scheme"] == "ESRI": print "ESRI:%s" % ref["url"] known_types["REST"] = ref["url"] if "WMS" in known_types: type = "OWS" if "WFS" in known_types else "WMS" try: _process_wms_service( known_types["WMS"], type, None, None, parent=csw) except Exception, e: logger.error("Error registering %s:%s" % (known_types["WMS"], str(e))) elif "REST" in known_types: try: _register_arcgis_url(ref["url"], None, None, None, parent=csw) except Exception, e: logger.error("Error registering %s:%s" % (known_types["REST"], str(e)))
def _harvest_csw(csw, maxrecords=10, totalrecords=float("inf")): """ Step through CSW results, and if one seems to be a WMS or Arc REST service then register it """ stop = 0 flag = 0 src = CatalogueServiceWeb(csw.base_url) while stop == 0: if flag == 0: # first run, start from 0 startposition = 0 else: # subsequent run, startposition is now paged startposition = src.results["nextrecord"] src.getrecords(esn="summary", startposition=startposition, maxrecords=maxrecords) max = min(src.results["matches"], totalrecords) if ( src.results["nextrecord"] == 0 or src.results["returned"] == 0 or src.results["nextrecord"] > max ): # end the loop, exhausted all records or max records to process stop = 1 break # harvest each record to destination CSW for record in list(src.records): record = src.records[record] known_types = {} for ref in record.references: if ( ref["scheme"] == "OGC:WMS" or "service=wms&request=getcapabilities" in urllib.unquote(ref["url"]).lower() ): print "WMS:%s" % ref["url"] known_types["WMS"] = ref["url"] if ( ref["scheme"] == "OGC:WFS" or "service=wfs&request=getcapabilities" in urllib.unquote(ref["url"]).lower() ): print "WFS:%s" % ref["url"] known_types["WFS"] = ref["url"] if ref["scheme"] == "ESRI": print "ESRI:%s" % ref["url"] known_types["REST"] = ref["url"] if "WMS" in known_types: type = "OWS" if "WFS" in known_types else "WMS" try: _process_wms_service(known_types["WMS"], type, None, None, parent=csw) except Exception, e: logger.error("Error registering %s:%s" % (known_types["WMS"], str(e))) elif "REST" in known_types: try: _register_arcgis_url(ref["url"], None, None, None, parent=csw) except Exception, e: logger.error("Error registering %s:%s" % (known_types["REST"], str(e)))
class HarvestNode(NgdsDataObject): """Stores information about harvest endpoints""" csw = None def __init__(self, url, **kwargs): # A URL must be given p = urlparse(url) self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", "")) # Strip URL to just domain + path self.frequency = kwargs.get('frequency', 'manual') # frequency should be one of manual|daily|weekly|monthly self.title = kwargs.get('title', 'No Title Was Given') # A title for bookkeeping self.node_admin_id = kwargs.get('node_admin_id', None) # Foreign Key to a responsible_party who maintains the remote node #self.csw = CatalogueServiceWeb(self.url) # owslib CSW class provides mechanisms for making CSW requests def setup_csw(self): self.csw = CatalogueServiceWeb(self.url) def do_harvest(self): """Perform a harvest from another CSW server""" if self.csw == None: self.setup_csw() self.get_records() # Do the first GetRecords request ids = self.csw.records.keys() # Start an array to house all of the ids print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]) while self.csw.results["nextrecord"] < self.csw.results["matches"] and self.csw.results["nextrecord"] != 0: # Once next_record > number_matched, we've gotten everything self.get_records(self.csw.results["nextrecord"], self.csw.results["returned"]) # Get another set, starting from next_record from previous response ids += self.csw.records.keys() # Add new ids to the array print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]) self.parse_records(ids) # Gather the records themselves def parse_records(self, ids): """Perform as many GetRecordById requests as needed""" print "Gathered %s IDs" % str(len(ids)) for record_id in ids: self.get_record_by_id(record_id) rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id], self) def get_record_by_id(self, record_id): """Get a single record, by ID""" params = { "id": [ record_id ], "outputschema": 
"http://www.isotc211.org/2005/gmd" } self.csw.getrecordbyid(**params) # Puts response in self.csw.records def get_records(self, start_position=1, max_records=1000): """Perform a GetRecords request""" params = { "typenames": "gmd:MD_Metadata", "outputschema": "http://www.isotc211.org/2005/gmd", "startposition": start_position, "maxrecords": max_records, "esn": "brief" } self.csw.getrecords(**params) # Puts results in self.csw.records
def test_GetRecords(self):
    # No records have been set up yet, so this test cannot pass; skip it.
    raise SkipTest()
    client = CatalogueServiceWeb(service)
    client.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
    count = len(client.records)
    assert count == 5, count
    for ident in client.records:
        identifiers.append(ident)
        assert isinstance(client.records[ident], MD_Metadata), \
            (ident, client.records[ident])
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the
    datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search.
      (Optional, default is empty which will return all records.)
    - CSWURL - A base URL for the CSW server to be searched.
      (Optional, defaults to the CDIA/GDP CSW server.)
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy] that will be
      used to limit results to datasets that at least partially intersect.
      (Optional)
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)  # owslib 0.8.6
    csw.getrecords(keywords=[anyText],
                   outputschema='http://www.isotc211.org/2005/gmd',
                   esn='full', maxrecords=100)
    dataSetURIs = [['title', 'abstract', ['urls']]]
    for rec in csw.records:
        title = csw.records[rec].identification.title
        abstract = csw.records[rec].identification.abstract
        urls = []
        try:
            for onlineresource in csw.records[rec].distribution.online:
                urls.append(onlineresource.url)
        except AttributeError:
            # record has no distribution section; the original's bare
            # `print` placeholder emitted stray blank lines
            pass
        for identinfo in csw.records[rec].identificationinfo:
            try:
                # BUG FIX: the original iterated `operation` but always
                # indexed operations[0], appending the first connect
                # point once per operation
                for operation in identinfo.operations:
                    urls.append(operation['connectpoint'][0].url)
            except AttributeError:
                pass  # identification block has no operations
        dataSetURIs.append([title, abstract, urls])
    # OPeNDAP endpoints are published as http urls; rewrite to dods scheme
    for i, dataset in enumerate(dataSetURIs):
        dataSetURIs[i][2] = [
            uri.replace("http", "dods") if "/dodsC/" in uri else uri
            for uri in dataset[2]
        ]
    return dataSetURIs
def test_GetRecords(self):
    # Skip: the fixture records have not been created yet.
    raise SkipTest()
    conn = CatalogueServiceWeb(service)
    conn.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
    total = len(conn.records)
    assert total == 5, total
    for ident in conn.records:
        identifiers.append(ident)
        record = conn.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
def run(self, csw=None, offset=0): if csw is None: endpoint = self.config['endpoint'] csw = CatalogueServiceWeb(endpoint) outputschema = self.config.get('outputschema', csw_ns['gmd']) typenames = self.config.get('typenames', 'csw:dataset') csw.getrecords(esn="full", outputschema=outputschema, typenames=typenames, startposition=offset, maxrecords=PAGE_SIZE) for record in csw.records.values(): self.queue(CSWDatasetCrawler, record=record) if csw.results.get('nextrecord') <= csw.results.get('matches'): print "OFFSET", offset self.queue(CSWCatalogCrawler, csw=csw, offset=csw.results.get('nextrecord'))
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the
    datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search.
      (Optional, default is empty which will return all records.)
    - CSWURL - A base URL for the CSW server to be searched.
      (Optional, defaults to the CDIA/GDP CSW server.)
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy] that will be
      used to limit results to datasets that at least partially intersect.
      (Optional)
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)  # owslib 0.8.6
    csw.getrecords(keywords=[anyText],
                   outputschema='http://www.isotc211.org/2005/gmd',
                   esn='full', maxrecords=100)
    dataSetURIs = [['title', 'abstract', ['urls']]]
    for rec in csw.records:
        record = csw.records[rec]
        title = record.identification.title
        abstract = record.identification.abstract
        urls = []
        try:
            for onlineresource in record.distribution.online:
                urls.append(onlineresource.url)
        except AttributeError:
            # no distribution section; the original's bare `print`
            # placeholder emitted stray blank lines
            pass
        for identinfo in record.identificationinfo:
            try:
                # BUG FIX: the original iterated `operation` but always
                # indexed operations[0], appending the first connect
                # point once per operation
                for operation in identinfo.operations:
                    urls.append(operation['connectpoint'][0].url)
            except AttributeError:
                pass  # no operations on this identification block
        dataSetURIs.append([title, abstract, urls])
    # OPeNDAP endpoints are published as http urls; rewrite to dods scheme
    for i, dataset in enumerate(dataSetURIs):
        dataSetURIs[i][2] = [
            uri.replace("http", "dods") if "/dodsC/" in uri else uri
            for uri in dataset[2]
        ]
    return dataSetURIs
def harvest(source, dst):
    """Copy every record from the `source` CSW into the `dst` CSW.

    Pages through GetRecords results on the source and asks the
    destination server to harvest each record via its GetRecordById URL.
    """
    maxrecords = options["max"]
    # BUG FIX: `options["max"] == 0 or None` never matched a None value
    # (`or None` is falsy on its own, so the branch was skipped and
    # maxrecords stayed None); test both "unset" sentinels explicitly.
    if options["max"] in (0, None):
        maxrecords = 10
    stop = 0
    flag = 0
    src = CatalogueServiceWeb(source)
    dest = CatalogueServiceWeb(dst)
    while stop == 0:
        if flag == 0:
            # first run, start from 0
            startposition = 0
        else:
            # subsequent run, startposition is now paged
            startposition = src.results["nextrecord"]
        src.getrecords(esn="brief", startposition=startposition,
                       maxrecords=maxrecords)
        print(src.results)
        if (src.results["nextrecord"] == 0
                or src.results["returned"] == 0
                or src.results["nextrecord"] > src.results["matches"]):
            # end the loop, exhausted all records
            stop = 1
            break
        # harvest each record to destination CSW
        for i in list(src.records):
            source = "%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s" % (
                sys.argv[1],
                i,
            )
            dest.harvest(source=source,
                         resourcetype="http://www.isotc211.org/2005/gmd")
        flag = 1
def getResource(endpoint = 'http://www.nodc.noaa.gov/geoportal/csw', bbox=None, keywords=None, maxrecords=1, service_type='opendap', verbose=None): if service_type == 'opendap': service_string='urn:x-esri:specification:ServiceType:OPeNDAP' if service_type == 'wms': service_string='urn:x-esri:specification:ServiceType:WMS' csw = CatalogueServiceWeb(endpoint,timeout=30) if keywords is not None: csw.getrecords(keywords=keywords, bbox=bbox, maxrecords=maxrecords) else : csw.getrecords(bbox=bbox, maxrecords=maxrecords) csw.records.keys() result = {} for i in csw.records.keys(): records=csw.records[i] resource = {} for key,rec in csw.records.iteritems(): url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None) print rec.references[0]['url'] if url is not None: resource[rec.title] = url result[i] = resource if verbose is not None: print 'endpoint: ', endpoint, '\n' , 'bbox: ', bbox, '\n' , 'keywords: ', keywords, '\n', 'maxrecords: ', maxrecords , '\n', 'service_type: ' , service_type return result
def search(self):
    """Search GeoCat for layers matching a keyword and return a
    JSON-ready dict of the form {'results': [layer, ...]}.

    Aborts with 400 on bad parameters and 502 on a bad GeoCat response.
    """
    keyword = request.params.get('keyword')
    if keyword is None:
        abort(400, 'no keyword')
    lang = request.params.get('lang')
    if not lang or lang not in supported_langs:
        abort(400, 'unknown lang')
    self._lang = lang
    try:
        limit = int(request.params.get('limit', 150))
    except ValueError:
        abort(400, 'limit param cannot be parsed to an integer')
    csw = CatalogueServiceWeb(geocat_url, timeout=60)
    # NOTE(review): cql is built by concatenating raw request params --
    # a quote in `keyword`/`query` breaks or injects into the CQL filter;
    # consider escaping single quotes.
    cql = "keyword = '" + keyword + "'"
    query = request.params.get('query')
    if query is not None:
        cql = cql + " AND AnyText = '" + query + "'"
    csw.getrecords(cql=cql, maxrecords=limit,
                   outputschema='http://www.isotc211.org/2005/gmd')
    # IDIOM FIX: dict.has_key() is deprecated; use the `in` operator
    if 'matches' not in csw.results:
        abort(502, 'invalid response from GeoCat')
    # initialize the results object
    results = {'results': []}
    if csw.results['matches'] <= 0:
        return results
    for record in csw.records.values():
        # verify that the record is associated with a supported
        # language, and skip it if it's not
        if record.language not in supported_langs.values():
            log.debug('record language not supported, skip record')
            continue
        # read layer type, layer name, and layer url first. If the record
        # does not represent a supported OGC resource, or if it doesn't
        # have a valid name and url, we skip it
        online_resource = self._read_layer_ogc_online_resource(record)
        if online_resource is None or None in online_resource:
            log.debug('record does not represent a supported OGC '
                      'resource, skip record')
            continue
        layer_type, layer_param, layer_name, layer_url = online_resource
        parsed_url = urlparse(layer_url)
        if parsed_url.scheme is None or \
                not parsed_url.scheme.startswith('http'):
            log.debug('layer url scheme is not HTTP, skip record')
            continue
        # read other properties
        id = record.identifier
        name = self._read_layer_name(record)
        alternate_title = self._read_layer_alternate_title(record)
        extent = self._read_layer_extent(record)
        data_provider, data_provider_link = \
            self._read_layer_data_provider(record)
        abstract = self._read_layer_abstract(record)
        resolution_distance = \
            self._read_layer_resolution_distance(record)
        equivalent_scales = \
            self._read_layer_equivalent_scales(record)
        web_links = self._read_layer_online(record, 'Webaddresse (URL)')
        thematic_geoportals = \
            self._read_layer_online(record, 'specialised geoportal')
        downloads = \
            self._read_layer_online(record, 'Webaddresse zum Download')
        date = self._read_layer_date(record)
        legal_constraints = self._read_legal_constraints(record)
        copyright, copyright_link = \
            self._read_layer_copyright(record)
        # create a dict with the properties and append
        # it to the results list
        layer = dict(id=id, name=name, layertype=layer_type,
                     url=layer_url, alternate_title=alternate_title,
                     extent=extent, data_provider=data_provider,
                     data_provider_link=data_provider_link,
                     abstract=abstract,
                     resolution_distance=resolution_distance,
                     equivalent_scales=equivalent_scales,
                     web_links=web_links,
                     thematic_geoportals=thematic_geoportals,
                     downloads=downloads, date=date,
                     legal_constraints=legal_constraints,
                     copyright=copyright, copyright_link=copyright_link)
        layer[layer_param] = layer_name
        results['results'].append(layer)
    return results
class CkanMetadata(object):
    """Provides general access to CSW for CKAN."""

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        """Create a CSW client for `url` and an empty metadata mapping."""
        self.schema = schema
        self.catalog = CatalogueServiceWeb(url, lang, version,
                                           timeout=10, skip_caps=True)
        keys = ('id', 'name', 'title', 'url', 'author', 'maintainer',
                'maintainer_email', 'license_url', 'version',
                'service_url', 'service_type', 'notes', 'tags',
                'metadata_url', 'metadata_raw')
        self.metadata = {key: None for key in keys}

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """Return the csw datasets found for the given searchterm."""
        self.catalog.getrecords(keywords=[searchterm],
                                propertyname=propertyname)
        no_response = self.catalog.response is None
        if no_response or self.catalog.results['matches'] == 0:
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found"
                % (searchterm, propertyname))
        return self.catalog.records

    def get_by_id(self, id):
        """Return the raw csw response for the dataset with the given id."""
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """Return the id of the dataset with the given name.

        If several datasets share the name, only the first id is returned.
        """
        matches = self.get_by_search(dataset_name, 'title')
        return matches.itervalues().next().identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """Abstract hook mapping a ckan attribute to a csw attribute."""
        raise NotImplementedError

    def get_xml(self, id):
        """Return the dataset XML, raising if the id is unknown."""
        xml_string = self.get_by_id(id)
        if xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """Fill and return self.metadata from the dataset with this id."""
        log.debug("Dataset ID: %s" % id)
        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(xml=dataset_xml,
                                                     lang=language)
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """Return the requested dataset mapped to CKAN attributes."""
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
# -*- coding: utf-8 -*- """ Created on Sun May 19 23:44:09 2013 @author: maurizio napolitano """ import csv from owslib.csw import CatalogueServiceWeb csw = CatalogueServiceWeb('http://www.territorio.provincia.tn.it/geoportlet/srv/eng/csw') csw.getrecords() data = {} records = csw.records for r in records: data[r]=records[r] matches = csw.results['matches'] returned = csw.results['returned'] startposition = 0 while (returned != 0): records = csw.records for r in records: data[r]=records[r] csw.getrecords(startposition=csw.results['nextrecord']) returned = csw.results['returned'] with open('datafromgeocatalogpat.csv', 'wb') as csvfile: csvoutput = csv.writer(csvfile, delimiter=';',quoting=csv.QUOTE_ALL) firstrow = True for d in data: fields = {} fields = data[d].__dict__ if (firstrow):
# <codecell> #This cell examines and provides information on the number of model records # available via each catalog endpoint records1 = [] titles1 = [] lenrecords1 = [] lentitles1 = [] k = 0 for endpoint in endpoints[:3]: csw = CatalogueServiceWeb(endpoint, timeout=60) for model_string in model_strings: try: csw.getrecords(keywords=[model_string], maxrecords=60, esn='full') records1.append(csw.results) except Exception, ex1: records1.append('Error') try: for rec in csw.records: titles1.append(csw.records[rec].title) except Exception, ex1: titles1.append('Error') lentitles1.append(len(titles1[-1])) lenrecords1.append(len(records1[-1])) zipvar1 = zip(endpoints, records1, lenrecords1, titles1, lentitles1) df = DataFrame( data=zipvar1, columns=['endpoints', 'records1', 'lenrecords1', 'titles1', 'lentitles1'])
class CkanMetadata(object):
    """Provides general access to CSW for CKAN."""

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        """Build the owslib catalog client and the empty metadata dict."""
        self.schema = schema
        self.catalog = CatalogueServiceWeb(
            url, lang, version, timeout=10, skip_caps=True
        )
        self.metadata = dict.fromkeys((
            'id', 'name', 'title', 'url', 'author', 'maintainer',
            'maintainer_email', 'license_url', 'version', 'service_url',
            'service_type', 'notes', 'tags', 'metadata_url',
            'metadata_raw',
        ))

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """Return the csw datasets found for the given searchterm."""
        self.catalog.getrecords(
            keywords=[searchterm], propertyname=propertyname
        )
        if self.catalog.response is not None \
                and self.catalog.results['matches'] != 0:
            return self.catalog.records
        raise DatasetNotFoundError(
            "No dataset for the given searchterm '%s' (%s) found"
            % (searchterm, propertyname)
        )

    def get_by_id(self, id):
        """Return the raw csw response for the dataset with the given id."""
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """Return the id of the dataset with the given name.

        If several datasets share the name, only the first id is
        returned.
        """
        found = self.get_by_search(dataset_name, 'title')
        return found.itervalues().next().identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """Abstract hook mapping a ckan attribute to a csw attribute."""
        raise NotImplementedError

    def get_xml(self, id):
        """Return the dataset XML string, raising if the id is unknown."""
        xml_string = self.get_by_id(id)
        if xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """Fill and return self.metadata from the dataset with this id."""
        log.debug("Dataset ID: %s" % id)
        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(
                xml=dataset_xml, lang=language
            )
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """Return the requested dataset mapped to CKAN attributes."""
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW: register the service itself, then page
    through and parse every record it exposes.

    Returns the list of parsed record objects.
    """
    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises error, this might not be a CSW
    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW
    csw_typenames = 'csw:Record'
    for op in md.operations:
        if op.name == 'GetRecords':
            try:
                csw_typenames = ' '.join(op.parameters['typeNames']['values'])
            except Exception:  # narrowed from bare except; stay with default
                pass

    # now get all records
    # get total number of records to loop against
    try:
        md.getrecords(typenames=csw_typenames, resulttype='hits')
        matches = md.results['matches']
    except Exception:  # narrowed from bare except
        # this is a CSW, but server rejects query
        raise RuntimeError(md.response)

    if pagesize > matches:
        pagesize = matches

    LOGGER.debug('Harvesting %d CSW records' % matches)

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        try:
            md.getrecords(typenames=csw_typenames, startposition=r,
                          maxrecords=pagesize)
        except Exception:  # `err` was captured but never used
            # this is a CSW, but server rejects query
            raise RuntimeError(md.response)
        for v in md.records.itervalues():  # keys were unused
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    # BUG FIX: the harvested records were built but never returned
    # (the sibling variant of this function does return them)
    return recobjs
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW: register the service itself (with links),
    then page through and parse every record it exposes.

    Returns the list of parsed record objects.
    """
    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises error, this might not be a CSW
    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    links = [
        '%s,OGC-CSW Catalogue Service for the Web,OGC:CSW,%s' % (identifier, md.url),
        '%s,OGC-CSW Capabilities service (ver 2.0.2),OGC:CSW-2.0.2-http-get-capabilities,%s' % (identifier, md.url),
    ]

    _set(context, serviceobj, 'pycsw:Links', '^'.join(links))

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW
    csw_typenames = 'csw:Record'

    # now get all records
    # get total number of records to loop against
    try:
        md.getrecords(typenames=csw_typenames, resulttype='hits')
        matches = md.results['matches']
    except Exception:  # narrowed from bare except
        # this is a CSW, but server rejects query
        raise RuntimeError(md.response)

    if pagesize > matches:
        pagesize = matches

    LOGGER.debug('Harvesting %d CSW records' % matches)

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        try:
            md.getrecords(typenames=csw_typenames, startposition=r,
                          maxrecords=pagesize)
        except Exception:  # `err` was captured but never used
            # this is a CSW, but server rejects query
            raise RuntimeError(md.response)
        for v in md.records.itervalues():  # keys were unused
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    # BUG FIX: the harvested records were built but never returned
    # (the sibling variant of this function does return them)
    return recobjs
def main():
    # Harvest the ONEMA GeoSource CSW into a CKAN repository.
    # Parses CLI args + config file, pages through the source catalogue
    # (skipping unreadable records), then upserts each record as a CKAN
    # package. Returns 0 on success.
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    # Validate the [Etalab-CKAN-Harvesters] section: api key, site url
    # and user agent are all mandatory.
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)
    harvester = helpers.Harvester(
        supplier_abbreviation = u'onm',
        supplier_title = u"Office national de l'eau et des milieux aquatiques",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://opendata-sie-back.brgm-rec.fr/geosource/srv/eng/csw'  # Recette environment
    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    # Page through the catalogue. When a page fails, drop to limit = 1
    # to isolate the single bad record, remember its index, then resume
    # with the normal page size.
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt;
            # consider `except Exception`.
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            # nextrecord <= index signals the catalogue is exhausted
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    groups = [
        harvester.upsert_group(dict(
            title = u'Environnement',
            )),
        ]
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        # Fetch the full record; response lands in csw.records.
        csw.getrecordbyid(id = [record_id])
        record = csw.records[record_id]
        formats.add(record.format)
        temporals.add(record.temporal)
        types.add(record.type)
        if not args.dry_run:
            package = dict(
                license_id = u'fr-lo',
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        record.abstract,
                        record.source,
                        )
                    if fragment
                    ),
                resources = [
                    dict(
                        description = uri.get('description') or None,
                        # Map source format labels to CKAN format codes.
                        format = {
                            'CSV': 'CSV',
                            'ESRI Shapefile': 'SHP',
                            'MIF / MID': 'MIF / MID',  # TODO?
                            'RDF': 'RDF',
                            'SHP': 'SHP',
                            'Txt': 'TXT',
                            'WMS': 'WMS',
                            }.get(record.format, record.format),
                        name = uri.get('name'),
                        url = uri['url'],
                        )
                    for uri in record.uris
                    ],
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in record.subjects
                    ],
                # territorial_coverage = TODO
                # Datasets have a granularity of either "commune" or "poi". Since the both are indexed the same way, use
                # "poi".
                territorial_coverage_granularity = 'poi',
                title = record.title,
                # url = u'URL TODO',
                )
            log.info(u'Harvested package: {}'.format(package['title']))
            # NOTE(review): the 'url' key is commented out above, so
            # package['url'] below will raise KeyError -- confirm.
            harvester.add_package(package, harvester.supplier, record.title, package['url'], groups = groups)

    if not args.dry_run:
        harvester.update_target()
    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))
    return 0
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW endpoint into repository record objects.

    Registers the CSW service itself as a 'service' record, then pages
    through the remote catalogue with GetRecords and parses every returned
    record via ``_parse_dc``.

    :param context: pycsw context used by ``_set`` to map queryables
    :param repos: repository; ``repos.dataset()`` builds an empty record
    :param record: URL of the remote CSW endpoint
    :param identifier: identifier assigned to the service record
    :param pagesize: maximum records fetched per GetRecords request
    :returns: list of record objects, service record first
    """
    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW
    csw_typenames = 'csw:Record'

    for op in md.operations:
        if op.name == 'GetRecords':
            try:
                csw_typenames = ' '.join(op.parameters['typeNames']['values'])
            except (KeyError, TypeError):  # capabilities advertise no typeNames: stay with default
                pass

    # now get all records
    # get total number of records to loop against
    md.getrecords(typenames=csw_typenames, resulttype='hits')
    matches = md.results['matches']

    # BUGFIX: an empty catalogue used to clamp pagesize to 0, and
    # range(1, 0, 0) raises "range() arg 3 must not be zero".
    if matches == 0:
        return recobjs
    if pagesize > matches:
        pagesize = matches

    # loop over all catalogue records incrementally
    # BUGFIX: upper bound is matches + 1 — range(1, matches, pagesize)
    # skipped the last record whenever (matches - 1) was a multiple of
    # pagesize (e.g. matches=11, pagesize=5 fetched only records 1-10).
    for r in range(1, matches + 1, pagesize):
        md.getrecords(typenames=csw_typenames, startposition=r, maxrecords=pagesize)
        for k, v in md.records.iteritems():
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    return recobjs
def test_GetRecords_summary(self):
    """GetRecords with esn='summary' returns the requested page of 5 ISO records."""
    csw = CatalogueServiceWeb(service)
    csw.getrecords(outputschema=GMD, startposition=1, maxrecords=5, esn="summary")
    nrecords = len(csw.records)
    # BUGFIX: the count was computed but never checked, so this test could
    # never fail; assert the page size like test_GetRecords does.
    assert nrecords == 5, nrecords
csw.results # Search for keywords like ‘birds’ or ‘fowl’ birds_query_like = PropertyIsLike('dc:subject', '%birds%') fowl_query_like = PropertyIsLike('dc:subject', '%fowl%') csw.getrecords2(constraints=[birds_query_like, fowl_query_like]) csw.results # Search for a specific record: csw.getrecordbyid(id=['9250AA67-F3AC-6C12-0CB9-0662231AA181']) c.records['9250AA67-F3AC-6C12-0CB9-0662231AA181'].title # Search with a CQL query csw.getrecords(cql='csw:AnyText like "%birds%"') csw.transaction(ttype='insert', typename='gmd:MD_Metadata', record=open("file.xml").read()) # update ALL records csw.transaction(ttype='update', typename='csw:Record', propertyname='dc:title', propertyvalue='New Title') # update records satisfying keywords filter csw.transaction(ttype='update', typename='csw:Record', propertyname='dc:title', propertyvalue='New Title',
print("Usage: %s <source_catalogue_url> <destination_catalogue_url> [maxrecords]" % sys.argv[0]) sys.exit(1) src = CatalogueServiceWeb(sys.argv[1]) dest = CatalogueServiceWeb(sys.argv[2]) if len(sys.argv) == 4: maxrecords = sys.argv[3] while stop == 0: if flag == 0: # first run, start from 0 startposition = 0 else: # subsequent run, startposition is now paged startposition = src.results["nextrecord"] src.getrecords(esn="brief", startposition=startposition, maxrecords=maxrecords) print(src.results) if ( src.results["nextrecord"] == 0 or src.results["returned"] == 0 or src.results["nextrecord"] > src.results["matches"] ): # end the loop, exhausted all records stop = 1 break # harvest each record to destination CSW for i in list(src.records): source = "%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s" % (sys.argv[1], i) dest.harvest(source=source, resourcetype="http://www.isotc211.org/2005/gmd")
# Interactive walkthrough: query the IBGE GeoNetwork CSW endpoint with OWSLib.
from owslib.csw import CatalogueServiceWeb

csw = CatalogueServiceWeb(
    'http://www.metadados.geo.ibge.gov.br/geonetwork_ibge/srv/por/csw')
csw.identification.type
# echoed value of the previous expression in the interactive session:
'CSW'
[op.name for op in csw.operations]
csw.getdomain('GetRecords.resultType')
# NOTE(review): keywords expects a list of strings; the inner parentheses are
# redundant — ('bcim') is just the string 'bcim'.
csw.getrecords(keywords=[('bcim')], maxrecords=10000)
csw.results
for rec in csw.records:
    print csw.records[rec].title
if schema == 'iso': outputschema = 'http://www.isotc211.org/2005/gmd' # init c = CatalogueServiceWeb(url, lang, version) if request == 'GetCapabilities': pass elif request == 'DescribeRecord': c.describerecord(typename) elif request == 'GetRecordById': c.getrecordbyid([id]) elif request == 'GetDomain': c.getdomain(dname, dtype) elif request == 'GetRecords': c.getrecords(qtype, [keyword], bbox, esn, sortby, schema) if print_request is True: # print the request print c.request if validate is True: # do XML validation print 'Validating request XML' if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True: print 'request is valid XML' else: print 'request is NOT valid XML' # print response print c.response
# In[5]: #This cell examines and provides information on the number of model records # available via each catalog endpoint records1 = [] titles1 = [] lenrecords1 = [] lentitles1 = [] k = 0 for endpoint in endpoints[:3]: csw = CatalogueServiceWeb(endpoint,timeout=60) for model_string in model_strings: try: csw.getrecords(keywords = [model_string], maxrecords = 60, esn = 'full') records1.append(csw.results) except Exception, ex1: records1.append('Error') try: for rec in csw.records: titles1.append(csw.records[rec].title) except Exception, ex1: titles1.append('Error') lentitles1.append(len(titles1[-1])) lenrecords1.append(len(records1[-1])) zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1) df = DataFrame(data = zipvar1, columns = ['endpoints', 'records1','lenrecords1', 'titles1','lentitles1']) df.head()
# Notebook cells: query a CSW endpoint (bound to `endpoint` in an earlier
# cell) for sea_water_temperature records inside a bounding box.
csw = CatalogueServiceWeb(endpoint)
csw.version

# <codecell>

import iris

# <codecell>

[op.name for op in csw.operations]

# <codecell>

bbox=[-141,42,-52,84]  # minx, miny, maxx, maxy (roughly Canada)
#bbox=[-71.5, 39.5, -63.0, 46]
csw.getrecords(keywords=['sea_water_temperature'],bbox=bbox,maxrecords=20)
#csw.getrecords(keywords=['sea_water_temperature'],maxrecords=20)
csw.results

# <codecell>

for rec,item in csw.records.iteritems():
    print rec
    print item.abstract

# <codecell>

# Pick one known record by its identifier.
a=csw.records['data/oceansites/DATA/STATION-M/OS_STATION-M-1_194810_D_CTD.nc']

# <codecell>
# Excerpt of a CSW-to-CSW harvest script (Python 2 variant of the same tool).
# NOTE: this excerpt begins mid-statement — `% sys.argv[0]` below is the tail
# of a usage `print` statement whose beginning (and the guarding `if`) is
# outside this view, as are the initialisations of `stop`, `flag` and the
# default `maxrecords`.
% sys.argv[0]
sys.exit(1)

src = CatalogueServiceWeb(sys.argv[1])
dest = CatalogueServiceWeb(sys.argv[2])

if len(sys.argv) == 4:
    # NOTE(review): sys.argv[3] is a string; confirm getrecords accepts it
    # or cast with int().
    maxrecords = sys.argv[3]

while stop == 0:
    if flag == 0:
        # first run, start from 0
        # NOTE(review): flag is never set to 1 in this excerpt, so
        # startposition would stay 0 every pass — presumably lost in
        # extraction; verify pagination advances.
        startposition = 0
    else:
        # subsequent run, startposition is now paged
        startposition = src.results['nextrecord']
    src.getrecords(esn='brief', startposition=startposition, maxrecords=maxrecords)
    print src.results
    if src.results['nextrecord'] == 0 \
            or src.results['returned'] == 0 \
            or src.results['nextrecord'] > src.results['matches']:
        # end the loop, exhausted all records
        stop = 1
        break
    # harvest each record to destination CSW
    for i in list(src.records):
        source = '%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s' % \
            (sys.argv[1], i)
        dest.harvest(source=source, \
            resourcetype='http://www.isotc211.org/2005/gmd')
# Notebook cells (PEP8-formatted duplicate of the cell above): query a CSW
# endpoint for sea_water_temperature records inside a bounding box.
csw = CatalogueServiceWeb(endpoint)
csw.version

# <codecell>

import iris

# <codecell>

[op.name for op in csw.operations]

# <codecell>

bbox = [-141, 42, -52, 84]  # minx, miny, maxx, maxy (roughly Canada)
#bbox=[-71.5, 39.5, -63.0, 46]
csw.getrecords(keywords=['sea_water_temperature'], bbox=bbox, maxrecords=20)
#csw.getrecords(keywords=['sea_water_temperature'],maxrecords=20)
csw.results

# <codecell>

for rec, item in csw.records.iteritems():
    print rec
    print item.abstract

# <codecell>

# Pick one known record by its identifier.
a = csw.records[
    'data/oceansites/DATA/STATION-M/OS_STATION-M-1_194810_D_CTD.nc']

# <codecell>
if schema == 'iso': outputschema = 'http://www.isotc211.org/2005/gmd' # init c = CatalogueServiceWeb(url, lang, version) if request == 'GetCapabilities': pass elif request == 'DescribeRecord': c.describerecord(typename) elif request == 'GetRecordById': c.getrecordbyid([id]) elif request == 'GetDomain': c.getdomain(dname, dtype) elif request == 'GetRecords': c.getrecords(qtype, [keyword], bbox, esn, sortby, schema) if print_request is True: # print the request print c.request if validate is True: # do XML validation print 'Validating request XML' if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True: print 'request is valid XML' else: print 'request is NOT valid XML' # print response print c.response
# In[10]: #This cell searches the endpoints using the revised list of variable strings from the previous cell and returns basic information on the number of records and the titles of data sets available via each endpoint variables1 = [] records1 = [] titles1 = [] lenrecords1 = [] lentitles1 = [] for endpoint in endpoints[:3]: csw = CatalogueServiceWeb(endpoint,timeout=60) for v in RecVarsMerg[:2]: try: csw.getrecords(keywords = [v], maxrecords = 60, esn = 'full') records1.append(csw.results) except Exception, ex1: records1.append('Error') try: for rec in csw.records: titles1.append(csw.records[rec].title) except Exception, ex1: titles1.append('Error') lentitles1.append(len(titles1[-1])) lenrecords1.append(len(records1[-1])) zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1) df = DataFrame(data = zipvar1, columns = ['endpoints','records1','lenrecords1', 'titles1','lentitles1']) df.head()
# In[11]: #This cell examines and provides information on the number of model records # available via each catalog endpoint records1 = [] records2 = [] titles1 = [] titles2=[] lenrecords1 = [] lentitles1 = [] for endpoint in endpoints: csw = CatalogueServiceWeb(endpoint,timeout=100) for model_string in model_strings: try: csw.getrecords(keywords = [model_string], maxrecords = 100) records1.append(csw.results) except Exception, ex1: records1.append('Error') try: for rec in csw.records: titles1.append(csw.records[rec].title) except Exception, ex1: titles1.append('Error') lentitles1.append(len(titles1[-1])) lenrecords1.append(len(records1[-1])) zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1) df = DataFrame(data = zipvar1, columns = ['endpoints', 'records1','lenrecords1', 'titles1','lentitles1']) df.head()
def test_GetRecords_dataset(self):
    """GetRecords with qtype='dataset' returns the requested page of 5 ISO records."""
    csw = CatalogueServiceWeb(service)
    csw.getrecords(qtype="dataset", outputschema=GMD, startposition=1, maxrecords=5)
    nrecords = len(csw.records)
    # BUGFIX: the count was computed but never checked, so this test could
    # never fail; assert the page size like test_GetRecords does.
    assert nrecords == 5, nrecords
args = parser.parse_args() # optional proxy handler for urllib2 and owslib if bool(args.http_proxy): logging.debug('using proxy %s'%args.http_proxy) proxyHandler = urllib2.ProxyHandler({"http" : args.http_proxy}) opener = urllib2.build_opener(proxyHandler) urllib2.install_opener(opener) # csw connection bbox = [float(x) for x in args.extent.split(',')] logging.debug('connecting to %s'%args.csw) csw = CatalogueServiceWeb(args.csw) logging.info('retrieving %s datasets, bbox=%s'%(args.maxrecords, bbox)) csw.getrecords(qtype="dataset", keywords=[], esn="full", maxrecords=args.maxrecords, bbox=bbox) logging.info('retrieved %s records'%len(csw.records)) if len(csw.records)>=args.maxrecords: logging.warn('maxrecords reached') # shapefile preparation shp = shapefile.Writer(shapefile.POLYGON) shp.autoBalance = 1 shp.field('id') shp.field('title') shp.field('layerurl') shp.field('layername') shp.field('layertitle') shp.field('subjects')
class HarvestNode(NgdsDataObject): """Stores information about harvest endpoints""" csw = None def __init__(self, url, **kwargs): # A URL must be given p = urlparse(url) self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", "")) # Strip URL to just domain + path self.frequency = kwargs.get( 'frequency', 'manual') # frequency should be one of manual|daily|weekly|monthly self.title = kwargs.get( 'title', 'No Title Was Given') # A title for bookkeeping self.node_admin_id = kwargs.get( 'node_admin_id', None ) # Foreign Key to a responsible_party who maintains the remote node #self.csw = CatalogueServiceWeb(self.url) # owslib CSW class provides mechanisms for making CSW requests def setup_csw(self): self.csw = CatalogueServiceWeb(self.url) def do_harvest(self): """Perform a harvest from another CSW server""" if self.csw == None: self.setup_csw() self.get_records() # Do the first GetRecords request ids = self.csw.records.keys() # Start an array to house all of the ids print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]) while self.csw.results["nextrecord"] < self.csw.results[ "matches"] and self.csw.results[ "nextrecord"] != 0: # Once next_record > number_matched, we've gotten everything self.get_records( self.csw.results["nextrecord"], self.csw.results["returned"] ) # Get another set, starting from next_record from previous response ids += self.csw.records.keys() # Add new ids to the array print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]) self.parse_records(ids) # Gather the records themselves def parse_records(self, ids): """Perform as many GetRecordById requests as needed""" print "Gathered %s IDs" % str(len(ids)) for record_id in ids: self.get_record_by_id(record_id) rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id], self) def get_record_by_id(self, record_id): """Get a single record, by ID""" params = { "id": [record_id], "outputschema": 
"http://www.isotc211.org/2005/gmd" } self.csw.getrecordbyid(**params) # Puts response in self.csw.records def get_records(self, start_position=1, max_records=1000): """Perform a GetRecords request""" params = { "typenames": "gmd:MD_Metadata", "outputschema": "http://www.isotc211.org/2005/gmd", "startposition": start_position, "maxrecords": max_records, "esn": "brief" } self.csw.getrecords(**params) # Puts results in self.csw.records
def main():
    """Harvest the Grand Lyon CSW catalogue into a CKAN repository.

    Reads the Etalab-CKAN-Harvesters configuration, pages through the source
    CSW (skipping unparsable records one by one), then fetches each record in
    both Dublin Core and ISO 19139 (gmd) form and registers it as a CKAN
    package unless --dry-run is given.  Returns 0 on success.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    # Validate the configuration section; conv.check raises on invalid input.
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'gl',
        supplier_title = u"Grand Lyon",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://catalogue.data.grandlyon.com/geosource/srv/fr/csw'
    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        # NOTE(review): bare except is deliberate best-effort here (a failing
        # page triggers a one-by-one retry), but it also swallows
        # KeyboardInterrupt — consider narrowing to Exception.
        except:
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    licenses_url = set()
    protocols = set()
    rights = set()
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        # Fetch the record twice: once as Dublin Core, once as ISO 19139.
        csw.getrecordbyid(id = [record_id])
        dc_record = csw.records[record_id]
        csw.getrecordbyid(id = [record_id], outputschema = 'http://www.isotc211.org/2005/gmd')
        gmd_record = csw.records.get(record_id)

        # Keep only the base format name, e.g. "CSV (...)" -> "CSV".
        format = dc_record.format
        if format is not None:
            format = format.split(u' (', 1)[0]
        formats.add(format)

        # rights may be a list; make it hashable for the `rights` set.
        copyright = dc_record.rights
        if copyright and isinstance(copyright, list):
            copyright = tuple(copyright)
        rights.add(copyright)

        # Extract the update frequency (an ISO 8601 duration) from the gmd XML.
        if gmd_record is None:
            frequency = None
        else:
            for frequency_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
                    '/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation'
                    '/gmd:userDefinedMaintenanceFrequency/gts:TM_PeriodDuration',
                    namespaces = namespaces):
                frequency = frequency_xml.text
                break
            else:
                # for/else: no matching element found.
                frequency = None
        if frequency is not None:
            assert frequency in frequency_by_code, 'Unknown frequency: {}'.format(frequency)
            frequency = frequency_by_code[frequency]

        for uri in dc_record.uris:
            if uri['url'].startswith('http://opendata.data.grandlyon.com/Licence'):
                licenses_url.add(uri['url'])
            protocols.add(uri['protocol'])

        subjects = [
            subject
            for subject in dc_record.subjects
            if subject != 'OpenData'
            ]
        # NOTE(review): `groups` and `tags` below are computed but never passed
        # to add_package (the package dict rebuilds tags from all subjects) —
        # confirm whether they should be wired in.
        groups = [
            harvester.upsert_group(dict(
                title = subjects[0],
                )),
            ] if subjects else []
        groups.append(harvester.upsert_group(dict(
            title = u'Territoires et Transports',
            )))
        tags = [
            dict(name = strings.slugify(subject))
            for subject in subjects
            ]

        related = []
        if gmd_record is None:
            # No ISO record: build resources from the Dublin Core URIs.
            resources = [
                dict(
                    description = uri.get('description') or None,
                    format = {
                        'application/pdf': 'PDF',
                        'application/zip': 'ZIP',
                        'pdf': 'PDF',
                        'text/csv': 'CSV',
                        'text/plain': 'TXT',
                        }.get(format, format),
                    name = uri.get('name') or None,
                    url = uri['url'],
                    )
                for uri in dc_record.uris
                if uri.get('protocol') in ('WWW:DOWNLOAD-1.0-http--download', 'WWW:LINK-1.0-http--link')
                    and uri['url'].startswith('http://opendata.data.grandlyon.com/')
                    and uri['url'] != 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf'
                ]
        else:
            # ISO record available: build resources from distribution/online.
            kml_resource = False
            resources = []
            for online in gmd_record.distribution.online:
                # Skip internal, non-HTTP and licence links.
                if online.url.startswith((
                        'http://catalogue.data.grandlyon.com/geosource/srv/en/resources.get?id=',
                        'file:',
                        'jdbc:',
                        )) \
                        or online.url == 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf':
                    continue
                if online.protocol == 'OGC:WFS':
                    # Add one KML listing resource per dataset at most.
                    if not kml_resource:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'KML',
                            name = online.name or None,
                            url = 'http://kml.data.grandlyon.com/grandlyon/?request=list&typename={}'.format(
                                online.name),
                            ))
                        kml_resource = True
                    if '?' not in online.url:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'GML',
                            name = online.name or None,
                            url = u'{}?SERVICE={}&REQUEST=GetFeature&VERSION=1.1.0&typename={}'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name),
                            ))
                elif online.protocol == 'OGC:WMS':
                    if '?' not in online.url:
                        # Build a GetMap thumbnail from the dataset's bounding box.
                        bounding_box = gmd_record.identification.extent.boundingBox
                        related.append(dict(
                            image_url = u'{}?SERVICE={}&REQUEST=GetMap&VERSION=1.1.1&LAYERS={}&FORMAT=image/png'
                                u'&SRS=EPSG:4326&BBOX={},{},{},{}&WIDTH=400&HEIGHT=300'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name, bounding_box.minx, bounding_box.miny,
                                bounding_box.maxx, bounding_box.maxy),
                            title = u'Vignette',
                            type = u'visualization',
                            # url = None,
                            ))
                # Every surviving online entry also becomes a resource;
                # a protocol missing from this dict raises KeyError.
                resources.append(dict(
                    description = online.description or None,
                    format = {
                        'DB:POSTGIS': 'POSTGIS',
                        'FILE:RASTER': 'RASTER',
                        'OGC:WCS': 'WCS',
                        'OGC:WFS': 'WFS',
                        'OGC:WMS': 'WMS',
                        'WWW:DOWNLOAD-1.0-http--download': None,
                        'WWW:LINK-1.0-http--link': None,
                        }[online.protocol],
                    name = online.name or None,
                    url = online.url,
                    ))

        temporals.add(dc_record.temporal)
        types.add(dc_record.type)
        if args.dry_run:
            log.info(u'Harvested package: {}'.format(dc_record.title))
        else:
            package = dict(
                frequency = {
                    'P0Y0M0DT0H1M0S': u"ponctuelle",
                    }.get(frequency),
                license_id = {
                    'copyright': None,
                    ('Licence ODbL GRAND LYON', u"Pas de restriction d'accès public"): u'odc-odbl',
                    'license': None,
                    }.get(copyright),
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        dc_record.abstract,
                        dc_record.source,
                        )
                    if fragment
                    ),
                resources = resources,
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in dc_record.subjects
                    ],
                # territorial_coverage = TODO
                title = dc_record.title,
                # TODO: Use this URL once Grand Lyon is ready to use it. Before end of year.
                # url = u'http://smartdata.grandlyon.com/single/{}'.format(record_id),
                url = u'http://smartdata.grandlyon.com/',
                )
            # if gmd_record is not None:
            #     for graphic_filename_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
            #             '/gmd:MD_DataIdentification/gmd:graphicOverview'
            #             '/gmd:MD_BrowseGraphic[gmd:fileDescription/gco:CharacterString="large_thumbnail"]'
            #             '/gmd:fileName/gco:CharacterString',
            #             namespaces = namespaces):
            #         related.append(dict(
            #             image_url = urlparse.urljoin(base_url, unicode(graphic_filename_xml.text)),
            #             title = u'Vignette',
            #             type = u'visualization',
            #             # url = TODO,
            #             ))
            log.info(u'Harvested package: {}'.format(package['title']))
            harvester.add_package(package, harvester.supplier, dc_record.title, package['url'],
                related = related or None)

    if not args.dry_run:
        harvester.update_target()
    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Licenses: {}'.format(sorted(licenses_url)))
    log.info(u'Protocols: {}'.format(sorted(protocols)))
    log.info(u'Rights: {}'.format(sorted(rights)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))
    return 0