class GeocatCatalogueServiceWeb(object):
    """Thin wrapper around an OWSLib ``CatalogueServiceWeb`` connection.

    Supports paged harvesting of geocat record identifiers and retrieval
    of a single record serialized in the CHE schema.
    """

    def __init__(self, url):
        # Connection is established eagerly; the output schema is fixed
        # to the module-level CHE profile.
        self.csw = CatalogueServiceWeb(url)
        self.schema = CHE_SCHEMA

    def get_geocat_id_from_csw(self, cqlquery=CQL_QUERY_DEFAULT,
                               cqlterm=CQL_SEARCH_TERM_DEFAULT):
        """Return all record identifiers matching ``cqlquery = cqlterm``.

        Pages through GetRecords responses 50 records at a time.

        :raises CswNotFoundError: when the server returns no response or
            zero matches for the harvest query.
        """
        harvest_query = PropertyIsEqualTo(cqlquery, cqlterm)
        nextrecord = 0
        record_ids = []
        while nextrecord is not None:
            self.csw.getrecords2(constraints=[harvest_query],
                                 maxrecords=50,
                                 startposition=nextrecord)
            if self.csw.response is None or self.csw.results['matches'] == 0:
                raise CswNotFoundError(
                    "No dataset found for harvest query {}".format(
                        harvest_query))
            if self.csw.results['returned'] > 0:
                if self.csw.results['nextrecord'] > 0:
                    nextrecord = self.csw.results['nextrecord']
                else:
                    # nextrecord == 0 signals the last page
                    nextrecord = None
            else:
                # Fix: a page with zero returned records previously left
                # ``nextrecord`` unchanged, repeating the same request forever.
                nextrecord = None
            record_ids.extend(self.csw.records.keys())
        return record_ids

    def get_record_by_id(self, geocat_id):
        """Fetch a single record by id; return the raw XML response,
        or ``None`` when the server response is empty."""
        self.csw.getrecordbyid(id=[geocat_id], outputschema=self.schema)
        return self.csw.response or None
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""
    csw = None  # lazily created owslib client, see setup_csw()

    def __init__(self, url, **kwargs):
        # A URL must be given; strip it down to scheme + domain + path.
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", ""))
        # frequency should be one of manual|daily|weekly|monthly
        self.frequency = kwargs.get('frequency', 'manual')
        # A title for bookkeeping
        self.title = kwargs.get('title', 'No Title Was Given')
        # Foreign Key to a responsible_party who maintains the remote node
        self.node_admin_id = kwargs.get('node_admin_id', None)

    def setup_csw(self):
        """Create the owslib CSW client lazily (avoids I/O at construction)."""
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server."""
        if self.csw is None:  # was ``== None``; identity test is the idiom
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        # dict.keys() is a view in Python 3; materialize it so it can grow
        ids = list(self.csw.records.keys())
        print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                       self.csw.results["matches"]))
        # Once nextrecord >= matches (or 0), we've gotten everything
        while (self.csw.results["nextrecord"] < self.csw.results["matches"]
                and self.csw.results["nextrecord"] != 0):
            # Get another set, starting from nextrecord of previous response
            self.get_records(self.csw.results["nextrecord"],
                             self.csw.results["returned"])
            ids.extend(self.csw.records.keys())  # Add new ids to the array
            print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                           self.csw.results["matches"]))
        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed."""
        print("Gathered %s IDs" % str(len(ids)))
        for record_id in ids:
            self.get_record_by_id(record_id)
            # Side effect only; the previous unused ``rec =`` binding is gone
            HarvestedRecord.from_md_metadata(self.csw.records[record_id], self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID."""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd"
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request."""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
def show_metadata(self):
    """Display the metadata of the currently selected record in a dialog."""
    if not self.treeRecords.selectedItems():
        return
    selected = self.treeRecords.currentItem()
    if not selected:
        return
    identifier = get_item_data(selected, 'identifier')
    self.disable_ssl_verification = self.disableSSLVerification.isChecked()
    auth = None
    if self.disable_ssl_verification:
        try:
            auth = Authentication(verify=False)
        except NameError:
            # Authentication is unavailable in older OWSLib versions
            pass
    try:
        with OverrideCursor(Qt.WaitCursor):
            connection = CatalogueServiceWeb(  # spellok
                self.catalog_url,
                timeout=self.timeout,
                username=self.catalog_username,
                password=self.catalog_password,
                auth=auth)
            connection.getrecordbyid(
                [self.catalog.records[identifier].identifier])
    except ExceptionReport as err:
        QMessageBox.warning(
            self, self.tr('GetRecords error'),
            self.tr('Error getting response: {0}').format(err))
        return
    except KeyError:
        QMessageBox.warning(self, self.tr('Record parsing error'),
                            self.tr('Unable to locate record identifier'))
        return
    record = connection.records[identifier]
    record.xml_url = connection.request
    dialog = RecordDialog()
    html = render_template('en', self.context, record,
                           'record_metadata_dc.html')
    dialog.textMetadata.document().setDefaultStyleSheet(
        QgsApplication.reportStyleSheet())
    dialog.textMetadata.setHtml(html)
    dialog.exec_()
def test_GetRecordById(self):
    """GetRecordById returns exactly the requested ISO records."""
    csw = CatalogueServiceWeb(service)
    wanted = identifiers[:2]
    csw.getrecordbyid(wanted, outputschema=GMD)
    assert len(csw.records) == len(wanted), len(csw.records)
    for ident in csw.records:
        identifiers.append(ident)
        assert isinstance(csw.records[ident], MD_Metadata), \
            (ident, csw.records[ident])
    # An unknown identifier must produce an empty result set
    csw.getrecordbyid(["nonexistent"], outputschema=GMD)
    assert len(csw.records) == 0, len(csw.records)
def test_GetRecordById(self):
    """Known ids yield MD_Metadata objects; unknown ids yield nothing."""
    catalogue = CatalogueServiceWeb(service)
    to_fetch = identifiers[:2]
    catalogue.getrecordbyid(to_fetch, outputschema=GMD)
    record_count = len(catalogue.records)
    assert record_count == len(to_fetch), record_count
    for ident in list(catalogue.records):
        identifiers.append(ident)
        record = catalogue.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
    catalogue.getrecordbyid(["nonexistent"], outputschema=GMD)
    record_count = len(catalogue.records)
    assert record_count == 0, record_count
def get_csw_record_by_id(self, csw_url, identifier):
    '''
    Return the single OWSLib CSW record identified by ``identifier``
    (a UUID) from the CSW endpoint at ``csw_url``.

    :raises AssertionError: if the endpoint is not a CSW service or the
        lookup does not yield exactly one record.
    '''
    csw = CatalogueServiceWeb(csw_url)
    assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url
    csw.getrecordbyid(id=[identifier], esn='full', outputschema='own')
    # Ensure there is exactly one record found
    assert len(
        csw.records) > 0, 'No CSW records found for ID "%s"' % identifier
    assert len(
        csw.records) == 1, 'Multiple CSW records found for ID "%s"' % identifier
    # Fix: dict.values() is a view on Python 3 and cannot be indexed;
    # the original ``csw.records.values()[0]`` only worked on Python 2.
    return next(iter(csw.records.values()))
def get_csw_record_by_id(self, csw_url, identifier):
    '''
    Return the single OWSLib CSW record from ``csw_url`` whose UUID is
    ``identifier``.

    :raises AssertionError: if the endpoint is not a CSW service or the
        lookup does not yield exactly one record.
    '''
    csw = CatalogueServiceWeb(csw_url)
    assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url
    csw.getrecordbyid(id=[identifier], esn='full', outputschema='own')
    # Ensure there is exactly one record found
    assert len(
        csw.records) > 0, 'No CSW records found for ID "%s"' % identifier
    assert len(
        csw.records) == 1, 'Multiple CSW records found for ID "%s"' % identifier
    # Fix: ``csw.records.values()[0]`` is Python-2-only (dict views are
    # not subscriptable on Python 3); take the first value via iteration.
    return next(iter(csw.records.values()))
def show_metadata(self):
    """show record metadata"""
    # Nothing to do unless a record row is selected in the results tree.
    if not self.treeRecords.selectedItems():
        return
    item = self.treeRecords.currentItem()
    if not item:
        return
    identifier = get_item_data(item, 'identifier')
    try:
        # Busy cursor while the (possibly slow) remote GetRecordById runs.
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
        cat = CatalogueServiceWeb(self.catalog_url, timeout=self.timeout,
                                  username=self.catalog_username,
                                  password=self.catalog_password)
        cat.getrecordbyid([self.catalog.records[identifier].identifier])
    except ExceptionReport as err:
        QApplication.restoreOverrideCursor()
        QMessageBox.warning(
            self, self.tr('GetRecords error'),
            self.tr('Error getting response: {0}').format(err))
        return
    except KeyError as err:
        # NOTE(review): the cursor is restored only after the warning in
        # this branch, unlike the ExceptionReport branch above — confirm
        # the ordering is intentional. ``err`` is unused here.
        QMessageBox.warning(self, self.tr('Record parsing error'),
                            self.tr('Unable to locate record identifier'))
        QApplication.restoreOverrideCursor()
        return
    QApplication.restoreOverrideCursor()
    record = cat.records[identifier]
    # Keep the raw request URL so the template can link back to the XML.
    record.xml_url = cat.request
    crd = RecordDialog()
    metadata = render_template('en', self.context, record,
                               'record_metadata_dc.html')
    style = QgsApplication.reportStyleSheet()
    crd.textMetadata.document().setDefaultStyleSheet(style)
    crd.textMetadata.setHtml(metadata)
    crd.exec_()
def show_metadata(self):
    """show record metadata"""
    # Bail out early when no record is selected in the results tree.
    if not self.treeRecords.selectedItems():
        return
    item = self.treeRecords.currentItem()
    if not item:
        return
    identifier = get_item_data(item, 'identifier')
    try:
        # Busy cursor while the remote GetRecordById request runs.
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
        cat = CatalogueServiceWeb(self.catalog_url, timeout=self.timeout,
                                  username=self.catalog_username,
                                  password=self.catalog_password)
        cat.getrecordbyid(
            [self.catalog.records[identifier].identifier])
    except ExceptionReport as err:
        QApplication.restoreOverrideCursor()
        QMessageBox.warning(self, self.tr('GetRecords error'),
                            self.tr('Error getting response: {0}').format(err))
        return
    except KeyError as err:
        # NOTE(review): cursor restored after the warning in this branch,
        # unlike the branch above — confirm the ordering is intentional.
        # ``err`` is unused here.
        QMessageBox.warning(self, self.tr('Record parsing error'),
                            self.tr('Unable to locate record identifier'))
        QApplication.restoreOverrideCursor()
        return
    QApplication.restoreOverrideCursor()
    record = cat.records[identifier]
    # Keep the raw request URL so the template can link back to the XML.
    record.xml_url = cat.request
    crd = RecordDialog()
    metadata = render_template('en', self.context, record,
                               'record_metadata_dc.html')
    style = QgsApplication.reportStyleSheet()
    crd.textMetadata.document().setDefaultStyleSheet(style)
    crd.textMetadata.setHtml(metadata)
    crd.exec_()
def get_service_url(result):
    """Resolve the GetCapabilities URL of the service referenced by *result*."""
    md_id = result["mdId"]
    csw = CatalogueServiceWeb(CSW_URL)
    csw.getrecordbyid(id=[md_id])
    uris = csw.records[md_id].uris
    service_url = ""
    if not uris:
        logging.error(
            f"expected at least 1 service url in service record {md_id}, found 0"
        )
    else:
        base = uris[0]["url"].split("?")[0]
        protocol = result["protocol"].split(":")[1]
        # PDOK tile services always live at the canonical WMTS endpoint
        if "https://geodata.nationaalgeoregister.nl/tiles/service/wmts" in base:
            base = "https://geodata.nationaalgeoregister.nl/tiles/service/wmts"
        service_url = f"{base}?request=GetCapabilities&service={protocol}"
    return {"mdId": md_id, "url": service_url}
def harvest_csw(src_url, dest_url):
    """Copy every ISO record from the CSW at ``src_url`` into the CSW at
    ``dest_url``.

    Pages through the source with GetRecords2, then pushes each record to
    the destination via a CSW-T insert transaction.
    """
    maxrecords = 10
    src = CatalogueServiceWeb(src_url)
    dest = CatalogueServiceWeb(dest_url)
    startposition = 0  # first run starts from 0; afterwards we follow paging
    while True:
        src.getrecords2(esn='full', startposition=startposition,
                        maxrecords=maxrecords)
        print(src.results)
        if src.results['nextrecord'] == 0 \
                or src.results['returned'] == 0 \
                or src.results['nextrecord'] > src.results['matches']:
            # exhausted all records
            break
        # harvest each record to destination CSW
        for i in list(src.records):
            print("insert", i)
            src.getrecordbyid(
                id=[i], outputschema='http://www.isotc211.org/2005/gmd')
            md = src._exml.find(
                '{http://www.isotc211.org/2005/gmd}MD_Metadata')
            # Fix: close both file handles deterministically (the original
            # leaked the read handle) and write bytes in binary mode, since
            # etree.tostring() returns bytes on Python 3.
            with open('/tmp/a.xml', 'wb') as f:
                f.write(etree.tostring(md))
            with open('/tmp/a.xml') as f:
                dest.transaction(ttype='insert',
                                 typename='gmd:MD_Metadata',
                                 record=f.read())
        # subsequent runs page from the server-reported next record
        startposition = src.results['nextrecord']
def get_record_by_id(mdId):
    """Fetch a single metadata record from the catalogue by its identifier."""
    catalogue = CatalogueServiceWeb(CSW_URL)
    catalogue.getrecordbyid(id=[mdId])
    return catalogue.records[mdId]
class CSW202Search(SearchBase):
    """Search client for CSW 2.0.2 catalogues."""

    def __init__(self, url, timeout, username, password, auth):
        super().__init__(url, timeout, username, password, auth)
        self.type = CATALOG_TYPES[0]
        self.format = 'xml'
        self.service_info_template = 'csw_service_metadata.html'
        self.record_info_template = 'record_metadata_dc.html'
        self.constraints = []
        self.conn = CatalogueServiceWeb(  # spellok
            self.url,
            timeout=self.timeout,
            username=self.username,
            password=self.password,
            auth=self.auth)
        self.request = self.conn.request
        self.response = self.conn.response

    def query_records(self, bbox=None, keywords=None, limit=10, offset=1):
        """Run a GetRecords query.

        :param bbox: optional [minx, miny, maxx, maxy] list of strings.
            ``None``/empty means no spatial filter (the former mutable
            default ``[]`` behaved identically but was an anti-pattern).
        :param keywords: free-text filter applied to csw:AnyText
        :param limit: maximum records per page
        :param offset: 1-based start position
        """
        self.constraints = []

        # only apply spatial filter if bbox is not global
        # even for a global bbox, if a spatial filter is applied, then
        # the CSW server will skip records without a bbox
        if bbox and bbox != ['-180', '-90', '180', '90']:
            minx, miny, maxx, maxy = bbox
            self.constraints.append(
                BBox([miny, minx, maxy, maxx],
                     crs='urn:ogc:def:crs:EPSG::4326'))

        # keywords
        if keywords:
            # TODO: handle multiple word searches
            self.constraints.append(PropertyIsLike('csw:AnyText', keywords))

        if len(self.constraints) > 1:  # exclusive search (a && b)
            self.constraints = [self.constraints]

        self.conn.getrecords2(constraints=self.constraints, maxrecords=limit,
                              startposition=offset, esn='full')
        self.matches = self.conn.results['matches']
        self.returned = self.conn.results['returned']
        self.request = self.conn.request
        self.response = self.conn.response

    def records(self):
        """Return the last response's records as plain dicts."""
        recs = []
        for key in self.conn.records:
            record = self.conn.records[key]  # hoist the repeated lookups
            rec = {
                'identifier': None,
                'type': None,
                'title': None,
                'bbox': None
            }
            if record.identifier:
                rec['identifier'] = record.identifier
            if record.type:
                rec['type'] = record.type
            if record.title:
                rec['title'] = record.title
            if record.bbox:
                rec['bbox'] = bbox_list_to_dict(record.bbox)
            rec['links'] = record.uris + record.references
            recs.append(rec)
        return recs

    def get_record(self, identifier):
        """Fetch a single record by identifier and return it."""
        self.conn.getrecordbyid([identifier])
        return self.conn.records[identifier]
# Search for bird data in Canada
bbox_query = BBox([-141, 42, -52, 84])
csw.getrecords2(constraints=[birds_query, bbox_query])
csw.results

# Search for keywords like 'birds' or 'fowl'
birds_query_like = PropertyIsLike('dc:subject', '%birds%')
fowl_query_like = PropertyIsLike('dc:subject', '%fowl%')
csw.getrecords2(constraints=[birds_query_like, fowl_query_like])
csw.results

# Search for a specific record:
csw.getrecordbyid(id=['9250AA67-F3AC-6C12-0CB9-0662231AA181'])
# Fix: the catalogue object is named ``csw`` (was mistakenly ``c``)
csw.records['9250AA67-F3AC-6C12-0CB9-0662231AA181'].title

# Search with a CQL query
csw.getrecords(cql='csw:AnyText like "%birds%"')

# Insert a metadata record via a CSW-T transaction
csw.transaction(ttype='insert', typename='gmd:MD_Metadata',
                record=open("file.xml").read())

# update ALL records
csw.transaction(ttype='update', typename='csw:Record',
                propertyname='dc:title', propertyvalue='New Title')
class CswBaseHandler(object):
    """Context-manager handler that reads dataset metadata from a CSW
    endpoint and maps ISO 19139 records to CKAN-style package dicts."""

    def __init__(self, url, username=None, password=None):
        self.url = url
        self.username = username
        self.password = password
        try:
            self.remote = CatalogueServiceWeb(
                self.url, timeout=3600, lang='fr-FR', version='2.0.2',
                skip_caps=True, username=self.username,
                password=self.password)
        except Exception as e:
            # keep the original cause attached for debugging
            raise CswReadError() from e

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def close(self):
        # Fake close: the underlying OWSLib client holds no open connection.
        logger.info('Close CSW connection')

    @CswExceptionsHandler()
    def get_packages(self, *args, **kwargs):
        """Run a GetRecords query and return every record that maps to a
        package; records that fail to map are logged and skipped."""
        self.remote.getrecords2(**kwargs)
        records = self.remote.records.copy()
        res = []
        for k in list(records.keys()):
            try:
                package = self.get_package(k)
            except CswBaseError as e:
                logger.warning(e)
            else:
                res.append(package)
        return res

    @CswExceptionsHandler()
    def get_package(self, id, *args, **kwargs):
        """Fetch one ISO record by id and map it to a CKAN package dict.

        :raises CswBaseError: wrong output schema, or record is not a
            dataset.
        """
        self.remote.getrecordbyid(
            [id], outputschema='http://www.isotc211.org/2005/gmd')
        records = self.remote.records.copy()
        rec = records[id]
        xml = rec.xml
        if not rec.__class__.__name__ == 'MD_Metadata':
            raise CswBaseError('outputschema error')
        if rec.hierarchy and not rec.hierarchy == 'dataset':  # 7218
            raise CswBaseError(
                'MD {id} is not a Dataset'.format(id=rec.identifier))

        id = rec.identifier  # rebind to the record's own identifier
        title = rec.identification.title
        name = slugify(title)
        notes = description = rec.identification.abstract
        thumbnail = None

        keywords, themes = [], []
        for item in rec.identification.keywords2:
            if not item.__class__.__name__ == 'MD_Keywords':
                continue
            if item.type == 'theme':
                themes += item.keywords
            keywords += item.keywords

        # Fix: compile once (and as a raw string) instead of recompiling a
        # non-raw pattern for every keyword inside the loop.
        keyword_pattern = re.compile(r'[\w\s\-.]*$', re.UNICODE)
        tags = [{'display_name': keyword} for keyword in keywords
                if keyword and keyword_pattern.match(keyword)]

        groups = [{'name': topic}
                  for topic in rec.identification.topiccategory]
        if themes:
            groups += [{'name': theme} for theme in themes]

        dataset_creation_date = None
        dataset_modification_date = None
        dataset_publication_date = None
        if rec.identification.date:
            for item in rec.identification.date:
                if not item.__class__.__name__ == 'CI_Date':
                    continue
                if item.type == 'creation':
                    dataset_creation_date = item.date
                elif item.type == 'publication':
                    dataset_publication_date = item.date
                elif item.type in ('modification', 'revision'):
                    dataset_modification_date = item.date

        frequency = None
        geocover = None
        granularity = None
        organisation = {
            'id': None,
            'name': None,
            'title': None,
            'description': None,
            'created': None,
            'is_organization': True,
            'state': 'active',
            'image_url': None,
            'type': 'organization',
            'approval_status': 'approved',
        }
        license_titles = rec.identification.uselimitation or []
        support = None
        data_type = None
        author = None
        author_email = None
        maintainer = None
        maintainer_email = None

        bbox = None
        spatial = None
        if hasattr(rec.identification, 'bbox'):
            xmin = rec.identification.bbox.minx
            ymin = rec.identification.bbox.miny
            xmax = rec.identification.bbox.maxx
            ymax = rec.identification.bbox.maxy
            bbox = transform(bounds_to_wkt(xmin, ymin, xmax, ymax), '4326')
            spatial = {
                'type': 'Polygon',
                'coordinates': [[[xmin, ymin], [xmax, ymin], [xmax, ymax],
                                 [xmin, ymax], [xmin, ymin]]]
            }

        resources = []
        # Guard: some records carry no distribution section at all.
        online = rec.distribution.online if rec.distribution is not None else []
        for item in online:
            # Fix: the original reused the locals ``name``/``description``
            # here, so the returned package 'name' was silently replaced by
            # the LAST resource's name instead of the slugified title.
            resources.append({
                'name': getattr(item, 'name', '') or '',
                'description': getattr(item, 'description', '') or '',
                'protocol': getattr(item, 'protocol', '') or '',
                'mimetype': getattr(item, 'mimetype', '') or '',
                'url': getattr(item, 'url', '') or '',
            })

        return {
            'state': 'active',
            'type': 'dataset',
            'id': id,
            'name': name,
            'title': title,
            'notes': notes,
            'thumbnail': thumbnail,
            'num_tags': len(tags),
            'tags': tags,
            'groups': groups,
            'metadata_created': dataset_creation_date,
            'metadata_modified': dataset_modification_date,
            'dataset_creation_date': dataset_creation_date,
            'dataset_modification_date': dataset_modification_date,
            'dataset_publication_date': dataset_publication_date,
            'frequency': frequency,
            'geocover': geocover,
            'granularity': granularity,
            'organization': organisation,
            'license_titles': license_titles,
            'support': support,
            'datatype': data_type,
            'author': author,
            'author_email': author_email,
            'maintainer': maintainer,
            'maintainer_email': maintainer_email,
            'num_resources': len(resources),
            'resources': resources,
            'spatial': spatial,
            'bbox': bbox,
            'xml': xml,
        }
class CSWRepository(HarvestRepository):
    """ CSW Repository """

    def setRepoParams(self, repoParams):
        # Record the metadata prefix, then let the base class apply the
        # shared repository parameters.
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except:
            # Connection failures leave cswrepo as None; _crawl() and
            # _update_record() check for that before doing any work.
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        # Register/refresh this repository in the DB, then walk the remote
        # catalogue writing one record header per item.
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)
        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return
        item_count = 0
        while True:
            try:
                # On the first pass results['nextrecord'] does not exist
                # yet, so the except branch issues the initial request.
                self.cswrepo.getrecords2(
                    startposition=self.cswrepo.results['nextrecord'])
            except:
                self.cswrepo.getrecords2()
            for rec in self.cswrepo.records:
                result = self.db.write_header(
                    self.cswrepo.records[rec].identifier,
                    self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    # +0.1 avoids a divide-by-zero on very fast runs
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} item headers after {} ({:.1f} items/sec)".
                        format(item_count, self.formatter.humanize(tdelta),
                               item_count / tdelta))
            if item_count >= self.cswrepo.results['matches']:
                break
        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        # Map an OWSLib CSW record onto the harvester's OAI-style dict.
        # Only "eng" and "fre" are handled; any other language leaves the
        # title/tags keys unset.
        record = {}
        if csw_record.language == "eng":
            record["title"] = csw_record.title
            record["title"] = record["title"].strip()
            record["title_fr"] = ""
            record["tags"] = csw_record.subjects
        elif csw_record.language == "fre":
            record["title_fr"] = csw_record.title
            record["title_fr"] = record["title_fr"].strip()
            record["title"] = ""
            record["tags_fr"] = csw_record.subjects
        record["description"] = csw_record.abstract
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["series"] = ""
        if csw_record.bbox:
            if float(csw_record.bbox.minx) > float(csw_record.bbox.maxx):
                # longitude values (minx and maxx) are switched by oswlib;
                # switch them back
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.miny,
                    "westLon": csw_record.bbox.maxx,
                    "northLat": csw_record.bbox.maxy,
                    "eastLon": csw_record.bbox.minx
                }]
            elif float(csw_record.bbox.miny) > float(csw_record.bbox.maxy):
                # sometimes x and y values are switched, so the lats are
                # longs and vice versa; we can look for the same discrepancy
                # that happens in the longs, except it's in the y values now
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.minx,
                    "westLon": csw_record.bbox.maxy,
                    "northLat": csw_record.bbox.maxx,
                    "eastLon": csw_record.bbox.miny
                }]
            else:
                # default if nothing is wrong (this code isn't executed
                # currently)
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.miny,
                    "westLon": csw_record.bbox.minx,
                    "northLat": csw_record.bbox.maxy,
                    "eastLon": csw_record.bbox.maxx
                }]
        return record

    @rate_limited(5)
    def _update_record(self, record):
        # Refresh one record from the remote CSW and persist it.
        # Returns True when the record was written or deleted, False when
        # nothing could be done, None when there is no CSW connection.
        if self.cswrepo is None:
            return
        try:
            self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        except:
            self.logger.error("Unable to update record: {}".format(
                record['local_identifier']))
            self.db.delete_record(record)
            return False
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record,
                                                record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea
            # if issue is Hakai-specific
            self.cswrepo.getrecordbyid(
                id=[record['local_identifier']],
                outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[
                record['local_identifier']].datestamp
            # strip any trailing time-of-day portion from the datestamp
            oai_record["pub_date"] = re.sub(
                "[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                oai_record["pub_date"])
            if oai_record:
                try:
                    self.db.write_record(oai_record, self)
                except:
                    if self.dump_on_failure == True:
                        try:
                            print(csw_record)
                        except:
                            pass
                return True
            else:
                # This record was deleted
                self.db.delete_record(record)
                return True
        return False
# Dispatch the requested CSW operation from the parsed CLI options.
if request is None or url is None:
    usage()
    sys.exit(3)

if schema == 'iso':
    outputschema = 'http://www.isotc211.org/2005/gmd'

# init
c = CatalogueServiceWeb(url, lang, version)

if request == 'GetCapabilities':
    pass  # capabilities are fetched on construction
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

# Fix: Python 2 print statements converted to the print() function.
if print_request is True:  # print the request
    print(c.request)

if validate is True:  # do XML validation
    print('Validating request XML')
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print('request is valid XML')
    else:
        print('request is NOT valid XML')
class CkanMetadata(object):
    """ Provides general access to CSW for CKAN """

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        self.catalog = CatalogueServiceWeb(
            url,
            lang,
            version,
            timeout=10,
            skip_caps=True
        )
        # Keys of the CKAN attributes this handler can map; the values are
        # filled in by get_ckan_metadata_by_id().
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """ Returns the found csw dataset with the given searchterm """
        self.catalog.getrecords(
            keywords=[searchterm],
            propertyname=propertyname
        )
        if (self.catalog.response is None or
                self.catalog.results['matches'] == 0):
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found" %
                (searchterm, propertyname)
            )
        return self.catalog.records

    def get_by_id(self, id):
        """ Returns the csw dataset with the given id """
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """ Returns the id of a dataset identified by it's name.
        If there are multiple datasets with the given name,
        only the id of the first one is returned.
        """
        dataset_list = self.get_by_search(dataset_name, 'title')
        # Fix: dict views have no .itervalues()/.next() on Python 3; take
        # the first value via the iterator protocol instead.
        return next(iter(dataset_list.values())).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """ Abstract method to define the mapping of
        a ckan attribute to a csw attribute
        """
        raise NotImplementedError

    def get_xml(self, id):
        """Return the raw XML of the dataset or raise DatasetNotFoundError."""
        dataset_xml_string = self.get_by_id(id)
        if dataset_xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return dataset_xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """Map all configured metadata keys for the dataset with ``id``."""
        log.debug("Dataset ID: %s" % id)
        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(
                xml=dataset_xml,
                lang=language
            )
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
class CSWSource:
    """ A CSW Harvest Source """

    def __init__(self, url):
        self.url = url
        # Fix: these used to be class-level attributes, so every CSWSource
        # instance shared the same dicts/lists and mutations leaked across
        # instances. They are now per-instance state.
        self.csw = None
        self.csw_info = {}
        self.errors = []
        self.datasets = []  # all datasets included
        self.validation_errors = []
        self.duplicates = []  # list of datasets with the same identifier

    def get_cleaned_url(self):
        # remove all URL params
        parts = urlparse(self.url)
        return urlunparse(
            (parts.scheme, parts.netloc, parts.path, None, None, None))

    def connect_csw(self, clean_url=True, timeout=120):
        """Connect to the CSW source.

        Returns True on success; on failure records the error in
        ``self.errors`` and returns False.
        """
        url = self.get_cleaned_url() if clean_url else self.url
        try:
            self.csw = CatalogueServiceWeb(url, timeout=timeout)
        except Exception as e:
            error = f'Error connection CSW: {e}'
            self.errors.append(error)
            return False
        self.read_csw_info()
        return True

    def as_json(self):
        self.read_csw_info()
        return self.csw_info

    def get_records(self, page=10, outputschema='gmd', esn='brief'):
        """Iterate pages of GetRecords results, yielding one dict per record.

        :param page: page size (maxrecords per request)
        :param outputschema: 'gmd' (ISO MD_Metadata) is the only supported
            schema; 'csw' raises deliberately
        :param esn: the ElementSetName: 'full', 'brief' or 'summary'
        """
        self.csw_info['records'] = {}
        self.csw_info['pages'] = 0
        # TODO get filters fom harvest source
        # https://github.com/GSA/ckanext-spatial/blob/datagov/ckanext/spatial/harvesters/csw.py#L90
        cql = None
        startposition = 0
        kwa = {
            "constraints": [],
            "typenames": 'csw:Record',
            "esn": esn,
            "startposition": startposition,
            "maxrecords": page,
            "outputschema": namespaces[outputschema],
            "cql": cql,
        }
        matches = 0
        while True:
            try:
                self.csw.getrecords2(**kwa)
            except Exception as e:
                error = f'Error getting records(2): {e}'
                self.errors.append(error)
                break
            if self.csw.exceptionreport:
                exceptions = self.csw.exceptionreport.exceptions
                error = 'Error getting records: {}'.format(exceptions)
                self.errors.append(error)
                break
            self.csw_info['pages'] += 1
            if matches == 0:
                matches = self.csw.results['matches']
            records = self.csw.records.items()
            for key, csw_record in records:
                if outputschema == 'gmd':
                    # an owslib MD_Metadata object
                    value = self.md_metadata_to_dict(csw_record)
                elif outputschema == 'csw':
                    # plain CSWRecords are not supported
                    raise Exception('Not using CSWRecords')
                value['esn'] = esn
                self.csw_info['records'][key] = value
                yield value
            if len(records) == 0:
                break
            startposition += page
            if startposition > matches:
                break
            kwa["startposition"] = startposition
        self.csw_info['total_records'] = len(self.csw_info['records'].keys())

    def get_record(self, identifier, esn='full', outputschema='gmd'):
        """Get full info for a single record; returns None on error."""
        try:
            self.csw.getrecordbyid(
                [identifier], outputschema=namespaces[outputschema])
        except ExceptionReport as e:
            # e.g. 'Invalid parameter value: locator=outputSchema'
            self.errors.append(f'Error getting record {e}')
            return None
        csw_record = self.csw.records[identifier]
        dict_csw_record = self.md_metadata_to_dict(csw_record)
        record = self.csw_info['records'].get(identifier, {})
        record.update(dict_csw_record)
        record['esn'] = esn
        record['outputschema'] = outputschema
        self.csw_info['records'][identifier] = record
        return record

    def md_metadata_to_dict(self, mdm):
        """Flatten an owslib MD_Metadata object into a plain dict."""
        ret = {}
        # mdm.xml is bytes, so it is deliberately not included
        ret['identifier'] = mdm.identifier
        ret['parentidentifier'] = mdm.parentidentifier
        ret['language'] = mdm.language
        ret['dataseturi'] = mdm.dataseturi
        ret['languagecode'] = mdm.languagecode
        ret['datestamp'] = mdm.datestamp
        ret['charset'] = mdm.charset
        ret['hierarchy'] = mdm.hierarchy
        ret['contact'] = []
        for ctc in mdm.contact:
            ret['contact'].append({
                'name': ctc.name,
                'organization': ctc.organization,
                'city': ctc.city,
                'email': ctc.email,
                'country': ctc.country
            })
        ret['datetimestamp'] = mdm.datetimestamp
        ret['stdname'] = mdm.stdname
        ret['stdver'] = mdm.stdver
        ret['locales'] = []
        for lo in mdm.locales:
            ret['locales'].append({
                'id': lo.id,
                'languagecode': lo.languagecode,
                'charset': lo.charset
            })
        # identificationinfo supersedes the deprecated identification /
        # serviceidentification attributes
        ret['identificationinfo'] = []
        for ii in mdm.identificationinfo:
            # there is much more info available on each object
            ret['identificationinfo'].append({
                'title': ii.title,
                'abstract': ii.abstract
            })
        ret['contentinfo'] = []
        for ci in mdm.contentinfo:
            ret['contentinfo'].append({'xml': ci.xml})
        ret['distribution'] = {}
        if mdm.distribution is not None:
            ret['distribution'] = {
                'format': mdm.distribution.format,
                'version': mdm.distribution.version
            }
        # TODO ret['dataquality'] = mdm.dataquality
        return ret

    def read_csw_info(self):
        """Collect service identification/provider/operations metadata."""
        csw_info = {}
        service = self.csw
        # Check each service instance conforms to OWSLib interface
        service.alias = 'CSW'
        csw_info['version'] = service.version
        csw_info['identification'] = {}
        csw_info['identification']['type'] = service.identification.type
        csw_info['identification']['version'] = service.identification.version
        csw_info['identification']['title'] = service.identification.title
        csw_info['identification'][
            'abstract'] = service.identification.abstract
        csw_info['identification'][
            'keywords'] = service.identification.keywords
        csw_info['identification'][
            'accessconstraints'] = service.identification.accessconstraints
        csw_info['identification']['fees'] = service.identification.fees
        csw_info['provider'] = {}
        csw_info['provider']['name'] = service.provider.name
        csw_info['provider']['url'] = service.provider.url
        ctc = service.provider.contact
        csw_info['provider']['contact'] = {
            'name': ctc.name,
            'organization': ctc.organization,
            'site': ctc.site,
            'instructions': ctc.instructions,
            'email': ctc.email,
            'country': ctc.country
        }
        csw_info['operations'] = []
        for op in service.operations:
            methods = op.methods
            for method in methods:
                if isinstance(method, dict):
                    constraints = []
                    for k, v in method.items():
                        if k == 'constraints':
                            for c in v:
                                if isinstance(c, dict):
                                    constraints.append(c)
                                else:
                                    constraints.append(
                                        {'name': c.name, 'values': c.values})
                    method['constraints'] = constraints
            csw_info['operations'].append({
                'name': op.name,
                'formatOptions': op.formatOptions,
                'methods': methods
            })
        self.csw_info.update(csw_info)
        return self.csw_info

    def get_original_url(self, harvest_id=None):
        """Build a GetRecordById URL for this source (adds the required
        CSW query parameters to the base URL)."""
        parts = urlparse(self.url)
        params = {
            'SERVICE': 'CSW',
            'VERSION': '2.0.2',
            'REQUEST': 'GetRecordById',
            'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
            'OUTPUTFORMAT': 'application/xml',
        }
        if harvest_id is not None:
            params['ID'] = harvest_id
        return urlunparse((parts.scheme, parts.netloc, parts.path,
                           None, urlencode(params), None))

    def validate(self):
        errors = []  # to return list of validation errors
        # return False, errors
        return True, None

    def remove_duplicated_identifiers(self):
        """Drop datasets whose identifier was already seen.

        Fix: the original removed items from ``self.datasets`` while
        iterating it, which skips the element following every removal;
        the list is rebuilt instead.
        """
        unique_identifiers = []
        kept = []
        for dataset in self.datasets:
            idf = dataset['identifier']
            if idf not in unique_identifiers:
                unique_identifiers.append(idf)
                kept.append(dataset)
            else:
                self.duplicates.append(idf)
        self.datasets[:] = kept
        return self.duplicates

    def count_resources(self):
        """ read all datasets and count resources """
        total = 0
        for dataset in self.datasets:
            pass  # TODO
        return total

    def save_validation_errors(self, path):
        # files are closed deterministically via the context manager
        with open(path, 'w') as f:
            f.write(json.dumps(self.validation_errors, indent=2))

    def save_duplicates(self, path):
        with open(path, 'w') as f:
            f.write(json.dumps(self.duplicates, indent=2))

    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()
            # TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  # adds "name": "inline"
            # FIXME identifier uses incompables characthers as paths (e.g. /).
            # could exist duplicates paths from different resources
            # use BASE64 or hashes
            idf = slugify(dataset['identifier'])
            resource_path = os.path.join(
                folder_path, f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')
            resource.save(resource_path)
            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(
                folder_path, f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
class CkanMetadata(object):
    """Maps CSW catalogue records onto CKAN dataset attributes."""

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        self.catalog = CatalogueServiceWeb(url, lang, version, timeout=10,
                                           skip_caps=True)
        # Placeholder dict covering every CKAN attribute we know how to map.
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """Run a CSW GetRecords query and return the matching records."""
        self.catalog.getrecords(keywords=[searchterm],
                                propertyname=propertyname)
        no_response = self.catalog.response is None
        if no_response or self.catalog.results['matches'] == 0:
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found"
                % (searchterm, propertyname))
        return self.catalog.records

    def get_by_id(self, id):
        """Fetch a single record by id and return the raw response."""
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """Return the id of the first dataset whose title matches *dataset_name*."""
        matches = self.get_by_search(dataset_name, 'title')
        return next(matches.itervalues()).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """Mapping hook: subclasses translate a CKAN attribute to a CSW one."""
        raise NotImplementedError

    def get_xml(self, id):
        """Return the record XML for *id*, raising when nothing was found."""
        raw_xml = self.get_by_id(id)
        if raw_xml is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return raw_xml

    def get_ckan_metadata_by_id(self, id, language='de'):
        """Fill and return the CKAN metadata dict for the record *id*."""
        log.debug("Dataset ID: %s" % id)
        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            self.metadata[key] = self.get_attribute(key).get_value(
                xml=dataset_xml, lang=language)
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        return self.get_ckan_metadata_by_id(
            self.get_id_by_dataset_name(dataset_name), language)
def main():
    """Harvest the SIE CSW catalogue and push the datasets to CKAN."""
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'onm',
        supplier_title = u"Office national de l'eau et des milieux aquatiques",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://opendata-sie-back.brgm-rec.fr/geosource/srv/eng/csw'  # Recette environment

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except Exception:  # narrowed from bare "except:"; still skips unparsable records
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    groups = [
        harvester.upsert_group(dict(
            title = u'Environnement',
            )),
        ]
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        csw.getrecordbyid(id = [record_id])
        record = csw.records[record_id]
        formats.add(record.format)
        temporals.add(record.temporal)
        types.add(record.type)
        if not args.dry_run:
            package = dict(
                license_id = u'fr-lo',
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        record.abstract,
                        record.source,
                        )
                    if fragment
                    ),
                resources = [
                    dict(
                        description = uri.get('description') or None,
                        format = {
                            'CSV': 'CSV',
                            'ESRI Shapefile': 'SHP',
                            'MIF / MID': 'MIF / MID',  # TODO?
                            'RDF': 'RDF',
                            'SHP': 'SHP',
                            'Txt': 'TXT',
                            'WMS': 'WMS',
                            }.get(record.format, record.format),
                        name = uri.get('name'),
                        url = uri['url'],
                        )
                    for uri in record.uris
                    ],
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in record.subjects
                    ],
                # territorial_coverage = TODO
                # Datasets have a granularity of either "commune" or "poi". Since the both are indexed the same way, use
                # "poi".
                territorial_coverage_granularity = 'poi',
                title = record.title,
                # url = u'URL TODO',
                )
            log.info(u'Harvested package: {}'.format(package['title']))
            # FIX: the "url" key is commented out of the dict above, so
            # package['url'] always raised KeyError here; use .get() so a
            # missing URL is passed through as None instead of crashing.
            harvester.add_package(package, harvester.supplier, record.title, package.get('url'), groups = groups)

    if not args.dry_run:
        harvester.update_target()

    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))

    return 0
# NOTE(review): this fragment reads `request`, `url`, `schema`, and the other
# option variables from earlier in the script (outside this view) -- presumably
# parsed from the command line; verify against the full file.
if request is None or url is None:
    usage()
    sys.exit(3)

if schema == 'iso':
    # `outputschema` is not used in this fragment -- presumably consumed by
    # later request code; confirm in the full script.
    outputschema = 'http://www.isotc211.org/2005/gmd'

# init
c = CatalogueServiceWeb(url, lang, version)

# Dispatch the requested CSW operation. GetCapabilities needs no extra call:
# the constructor already fetched the capabilities document.
if request == 'GetCapabilities':
    pass
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

if print_request is True:  # print the request
    print c.request

if validate is True:  # do XML validation
    print 'Validating request XML'
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print 'request is valid XML'
    else:
        print 'request is NOT valid XML'
def get_by_uuid(self, uuid):
    """Fetch the metadata record identified by *uuid*, or None when absent."""
    service = CatalogueServiceWeb(self.base + "srv/en/csw")
    service.getrecordbyid([uuid], outputschema=namespaces["gmd"])
    found = service.records
    if not found:
        return None
    return found.values()[0]
class CswScanner: def __init__(self): self.napids = [] self.start_pos = 0 # Get the CSW URL, Username and Password ini_config = ConfigParser() ini_config.read('harvester.ini') csw_url = ini_config.get('csw', 'csw.url') csw_user = ini_config.get('csw', 'csw.username') csw_passwd = ini_config.get('csw', 'csw.password') if csw_user and csw_passwd: self.csw = CatalogueServiceWeb(csw_url, username=csw_user, password=csw_passwd, timeout=20) else: self.csw = CatalogueServiceWeb(csw_url, timeout=20) #### The since date is currently being ignored. def get_all_ids(self, since=None): while True: if since is not None: scan_date = since.strftime('%Y-%m-%d') since_query = PropertyIsGreaterThanOrEqualTo( 'Modified', scan_date) self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata', constraints=[since_query]) else: #self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata') self.csw.getrecords2( xml= '<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" service="CSW" version="2.0.2" resultType="results" outputSchema="csw:IsoRecord"><csw:Query typeNames="gmd:MD_Metadata"><csw:Constraint version="1.1.0"><Filter xmlns="http://www.opengis.net/ogc" xmlns:gml="http://www.opengis.net/gml"/></csw:Constraint></csw:Query></csw:GetRecords>' ) if self.csw.results['returned'] == 0: break print '{0}Found {1}{2}{3} records'.format( Fore.GREEN, Fore.BLUE, self.csw.results['matches'], Fore.GREEN) print '{0}Next record: {1}{2}'.format( Fore.GREEN, Fore.BLUE, self.csw.results['nextrecord']) self.start_pos = self.csw.results['nextrecord'] for rec in self.csw.records: try: print u'{0}{1}{2}: {3}'.format( Fore.RED, rec, Fore.CYAN, self.csw.records[rec].title.decode('utf-8')) except UnicodeEncodeError: print u'{0}Unprintable title for {1}{2}'.format( Fore.GREEN, Fore.RED, rec) self.napids.append(rec) def load_naps(self): ns = Namespaces() gmd = ns.get_namespace('gmd') session = connect_to_database() for napid in 
self.napids: print '{0}Full NAP Record for {1}{2}'.format( Fore.GREEN, Fore.CYAN, napid) self.csw.getrecordbyid(id=[napid], outputschema=gmd) ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord) if ec_rec is None: ec_rec = ECRecord( uuid=self.csw.records[napid].identifier, title=self.csw.records[napid].identification.title, state='active', nap_record=self.csw.records[napid].xml, csw_scanned=datetime.now().isoformat()) else: ec_rec.title = self.csw.records[napid].identification.title, ec_rec.state = 'active', ec_rec.nap_record = self.csw.records[napid].xml, ec_rec.csw_scanned = datetime.now().isoformat() add_record(session, ec_rec) session.close_all()
class GeonetUserHandler(metaclass=Singleton):
    """Thin authenticated client for a GeoNetwork CSW publication endpoint."""

    def __init__(self):
        self.username = GEONETWORK_LOGIN
        self.password = GEONETWORK_PASSWORD
        self.remote = CatalogueServiceWeb(
            urljoin(GEONETWORK_URL, 'srv/fre/csw-publication'),
            timeout=GEONETWORK_TIMEOUT,
            lang='fr-FR',
            version='2.0.2',
            skip_caps=True,
            username=self.username,
            password=self.password)

    def _get(self, url, params):
        """Authenticated GET that raises for non-2xx responses."""
        response = requests.get(url, params=params,
                                auth=(self.username, self.password))
        response.raise_for_status()
        return response

    def _q(self, identifier):
        """Resolve a metadata uuid to GeoNetwork's internal id."""
        response = self._get(urljoin(GEONETWORK_URL, 'srv/fre/q'),
                             {'uuid': identifier, '_content_type': 'json'})
        metadata = response.json().get('metadata')
        if metadata \
                and len(metadata) == 1 \
                and metadata[0]['uuid'] == identifier:
            return metadata[0]['id']
        # Sinon error ?

    def _md_publish(self, identifier):
        return self._get(urljoin(GEONETWORK_URL, 'srv/fre/md.publish'),
                         {'ids': identifier})

    def _transaction(self, ttype, identifier, record=None):
        return self.remote.transaction(ttype=ttype,
                                       typename='gmd:MD_Metadata',
                                       identifier=identifier,
                                       record=record)

    def is_record_exists(self, id):
        return self.get_record(id) and True or False

    def get_record(self, id):
        """Return the MD record for *id*; a 404 is swallowed and yields None."""
        try:
            self.remote.getrecordbyid(
                id=[id], outputschema='http://www.isotc211.org/2005/gmd')
        except requests.exceptions.HTTPError as e:
            logger.exception(e)
            if (e.response.status_code == 404):
                logger.warning("Error 404 was ignored.")
                return None
            raise e
        else:
            logger.debug("Get MD record with dc:identifier = '%s'" % id)
            return self.remote.records.get(id)

    def create_record(self, id, record):
        logger.debug("Create MD record with dc:identifier = '%s'" % id)
        return self._transaction('insert', id, record=record)

    def update_record(self, id, record):
        logger.debug("Update MD record with dc:identifier = '%s'" % id)
        return self._transaction('update', id, record=record)

    def delete_record(self, id):
        logger.debug("Delete MD record with dc:identifier = '%s'" % id)
        return self._transaction('delete', id)

    def publish(self, id):
        return self._md_publish(self._q(id))
class CSWRepository(HarvestRepository):
    """ CSW Repository """

    def setRepoParams(self, repoParams):
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        self.cswrepo = CatalogueServiceWeb(self.url)
        self.domain_metadata = []

    def _crawl(self):
        """Harvest all record headers from the CSW endpoint into the database."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)

        item_count = 0
        while True:
            try:
                self.cswrepo.getrecords2(
                    startposition=self.cswrepo.results['nextrecord'])
            except Exception:
                # On the first pass self.cswrepo.results does not exist yet,
                # so fall back to an initial request (narrowed from bare except).
                self.cswrepo.getrecords2()
            for rec in self.cswrepo.records:
                result = self.db.write_header(
                    self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} item headers after {} ({:.1f} items/sec)".
                        format(item_count, self.formatter.humanize(tdelta),
                               item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break
        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map a CSW record onto the flat OAI-style dict used by the writer."""
        record = {}
        record["title"] = csw_record.title
        record["description"] = csw_record.abstract
        record["tags"] = csw_record.subjects
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["contact"] = self.contact
        record["series"] = ""
        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                # FIX: time.clock() was removed in Python 3.8;
                # time.perf_counter() is the documented replacement.
                elapsed = time.perf_counter() - last_time_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
                lock.release()
                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret
            return rate_limited_function
        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Refresh one record: fetch metadata, add dates, write to the DB."""
        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record,
                                                record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(
                id=[record['local_identifier']],
                outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[
                record['local_identifier']].datestamp
            oai_record["pub_date"] = re.sub(
                "[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                oai_record["pub_date"])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id,
                                     self.metadataprefix.lower(),
                                     self.domain_metadata)
                return True
        else:
            # This record was deleted
            self.db.delete_record(record)
            return True
        return False
class GeonetUserHandler(metaclass=Singleton):
    """Authenticated helper around a GeoNetwork CSW publication service."""

    def __init__(self):
        self.username = GEONET_USERNAME
        self.password = GEONET_PASSWORD
        self.remote = CatalogueServiceWeb(
            urljoin(GEONET_URL, 'srv/fre/csw-publication'),
            timeout=GEONET_TIMEOUT,
            lang='fr-FR',
            version='2.0.2',
            skip_caps=True,
            username=self.username,
            password=self.password)

    def _get(self, url, params):
        """HTTP GET with basic auth; raises for non-2xx answers."""
        response = requests.get(url, params=params,
                                auth=(self.username, self.password))
        response.raise_for_status()
        return response

    def _q(self, identifier):
        """Look up GeoNetwork's internal id for a metadata uuid."""
        response = self._get(urljoin(GEONET_URL, 'srv/fre/q'),
                             {'uuid': identifier, '_content_type': 'json'})
        metadata = response.json().get('metadata')
        found_single = (metadata
                        and len(metadata) == 1
                        and metadata[0]['uuid'] == identifier)
        if found_single:
            return metadata[0]['id']
        # Sinon error ?

    def _md_publish(self, identifier):
        return self._get(urljoin(GEONET_URL, 'srv/fre/md.publish'),
                         {'ids': identifier})

    def _transaction(self, ttype, identifier, record):
        return self.remote.transaction(identifier=identifier,
                                       record=record,
                                       ttype=ttype,
                                       typename='gmd:MD_Metadata')

    def is_record_exists(self, id):
        return self.get_record(id) and True or False

    def get_record(self, id):
        self.remote.getrecordbyid(
            id=[id], outputschema='http://www.isotc211.org/2005/gmd')
        return self.remote.records.get(id)

    def create_record(self, id, record):
        return self._transaction('insert', id, record)

    def update_record(self, id, record):
        return self._transaction('update', id, record)

    # def delete_record(self, id):
    #     return self.remote.transaction('delete', id)

    def publish(self, id):
        return self._md_publish(self._q(id))
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
       harvest source.'''

    ckan_fb_mapping = {}

    # first, do checks that can be done without connection to FIS-Broker
    for package_id in package_ids:
        package = Package.get(package_id)

        if not package:
            raise PackageIdDoesNotExistError(package_id)

        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)

        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if not harvester_type == HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)

        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)

        ckan_fb_mapping[package.id] = fb_guid

    # get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)

    # Create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.gather_started = datetime.datetime.utcnow()
    assert harvest_job

    # instatiate the CSW connector (on the reasonable assumption that harvester_url is
    # the same for all package_ids)
    # NOTE(review): harvester_url is the value from the LAST package in the
    # loop above -- confirm all packages really share one source URL.
    package_id = None
    reimported_packages = []
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # query connector to get resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

            # show resource document
            record = csw.records.get(fb_guid, None)
            if record:
                # Wrap the fetched XML in a HarvestObject so the regular
                # import pipeline can process it as a forced re-import.
                obj = HarvestObject(guid=fb_guid,
                                    job=harvest_job,
                                    content=record.xml,
                                    package_id=package_id,
                                    extras=[
                                        HarvestObjectExtra(key='status', value='change'),
                                        HarvestObjectExtra(key='type', value='reimport'),
                                    ])
                obj.save()
                assert obj, obj.content

                harvester = FisbrokerPlugin()
                harvester.force_import = True
                harvester.import_stage(obj)
                rejection_reason = self._dataset_rejected(obj)
                if rejection_reason:
                    raise FBImportError(package_id, rejection_reason)

                harvester.force_import = False
                Session.refresh(obj)

                reimported_packages.append(record)
            else:
                raise NotFoundInFisbrokerError(package_id, fb_guid)

    except RequestException as error:
        # package_id identifies the package being processed when the
        # connection failed (None if the failure was the initial connect).
        raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))

    # successfully finish harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()

    return reimported_packages
class CSWSource(HarvesterBaseSource):
    """ A CSW Harvest Source """

    csw = None
    csw_info = {}

    def __init__(self, url):
        super().__init__()
        self.url = url

    def get_cleaned_url(self):
        # remove all URL params
        parts = urlparse(self.url)
        return urlunparse(
            (parts.scheme, parts.netloc, parts.path, None, None, None))

    def fetch(self, clean_url=True, timeout=120):
        """Connect to the CSW source and cache its service-level info."""
        # connect to csw source
        url = self.get_cleaned_url() if clean_url else self.url
        try:
            self.csw = CatalogueServiceWeb(url, timeout=timeout)
        except Exception as e:
            error = f'Error connection CSW: {e}'
            self.errors.append(error)
            logger.error(error)
            raise
        self.read_csw_info()

    def as_json(self):
        self.read_csw_info()
        return self.csw_info

    def get_records(self, page=10, outputschema='gmd', esn='brief'):
        """Iterate pages of GetRecords results, yielding one dict per record."""
        self.csw_info['records'] = {}
        self.csw_info['pages'] = 0
        # TODO get filters fom harvest source
        # https://github.com/GSA/ckanext-spatial/blob/datagov/ckanext/spatial/harvesters/csw.py#L90
        cql = None

        # output schema
        # outputschema: the outputSchema (default is 'http://www.opengis.net/cat/csw/2.0.2')
        #   "csw" at GeoDataGovGeoportalHarvester
        #   "gmd" at CSWHarvester
        # outputschema = 'gmd'
        # https://github.com/geopython/OWSLib/blob/master/owslib/csw.py#L551

        startposition = 0
        kwa = {
            "constraints": [],
            "typenames": 'csw:Record',
            # esn: the ElementSetName 'full', 'brief' or 'summary' (default is 'full')
            "esn": esn,
            "startposition": startposition,
            "maxrecords": page,
            "outputschema": namespaces[outputschema],
            "cql": cql,
        }

        matches = 0
        self.csw_info['records'] = {}
        while True:
            try:
                self.csw.getrecords2(**kwa)
            except Exception as e:
                error = f'Error getting records(2): {e}'
                self.errors.append(error)
                break
            if self.csw.exceptionreport:
                exceptions = self.csw.exceptionreport.exceptions
                error = 'Error getting records: {}'.format(exceptions)
                self.errors.append(error)
                # raise Exception(error)
                break

            self.csw_info['pages'] += 1
            if matches == 0:
                matches = self.csw.results['matches']

            records = self.csw.records.items()
            for record in records:
                key, csw_record = record
                # FIX: "value" was only assigned in the 'gmd' branch, so the
                # 'csw' branch referenced it before assignment; start empty.
                value = {}
                if outputschema == 'gmd':
                    # it's a MD_Metadata object
                    # https://github.com/geopython/OWSLib/blob/3338340e6a9c19dd3388240815d35d60a0d0cf4c/owslib/iso.py#L31
                    value = self.md_metadata_to_dict(csw_record)
                elif outputschema == 'csw':
                    # it's a CSWResource
                    error = 'Not using CSW schema, we require GMD'
                    value['error'] = error

                try:
                    value['iso_values'] = self.read_values_from_xml(
                        xml_data=value['content'])
                except Exception as e:
                    error = f'Error reading ISO values {e}'
                    value['error'] = error
                    raise  # Exception(error)

                value['esn'] = esn
                self.csw_info['records'][key] = value
                yield value

            if len(records) == 0:
                break
            startposition += page
            if startposition > matches:
                break
            kwa["startposition"] = startposition

        self.csw_info['total_records'] = len(self.csw_info['records'].keys())

    def get_record(self, identifier, esn='full', outputschema='gmd'):
        """Fetch the full metadata for one record and merge it into csw_info."""
        # Get Full record info
        try:
            self.csw.getrecordbyid(
                [identifier], outputschema=namespaces[outputschema])
        except ExceptionReport as e:
            self.errors.append(f'Error getting record {e}')
            # 'Invalid parameter value: locator=outputSchema' is an XML error
            return None

        csw_record = self.csw.records[identifier]
        dict_csw_record = self.md_metadata_to_dict(csw_record)

        record = self.csw_info['records'].get(identifier, {})
        record.update(dict_csw_record)
        record['esn'] = esn
        record['outputschema'] = outputschema
        self.csw_info['records'][identifier] = record
        return record

    def read_values_from_xml(self, xml_data):
        # transform the XML in a dict as ISODocument class
        # (https://github.com/GSA/ckanext-spatial/blob/2a25f8d60c31add77e155c4136f2c0d4e3b86385/ckanext/spatial/model/harvested_metadata.py#L461)
        # did with its read_values function.
        iso_parser = ISODocument(xml_str=xml_data)
        return iso_parser.read_values()

    def process_xml(self, raw_xml):
        """Extract the MD_Metadata/MI_Metadata element from raw XML bytes."""
        # get the XML part we need
        # check samples at /samples folder
        try:
            str_xml = raw_xml.decode('utf-8')
        except Exception as e:
            error = f'Unable to decode bytes as UTF-8: {e}'
            raise Exception(error)
        str_xml = str_xml.replace('\\n', '\n').replace('\\t', '\t')

        try:
            mdtree = xet.fromstring(str_xml)
        except Exception as e:
            error = f'{e}\n\n - Unable to parse string. \n\n: \t{str_xml[:350]} \n\n'
            raise Exception(error)

        # check if root IS what we are looking for
        needed = [
            '{http://www.isotc211.org/2005/gmd}MD_Metadata',
            '{http://www.isotc211.org/2005/gmi}MI_Metadata'
        ]
        gm1 = gm2 = None
        if mdtree.tag in needed:
            gm = mdtree
        else:
            # https://docs.python.org/3/library/xml.etree.elementtree.html#parsing-xml-with-namespaces
            ns = {
                'gmd': 'http://www.isotc211.org/2005/gmd',
                'gmi': 'http://www.isotc211.org/2005/gmi'
            }
            gm1 = mdtree.find('gmd:MD_Metadata', ns)
            gm2 = mdtree.find('gmi:MI_Metadata', ns)
            # FIX: "gm1 or gm2" is unreliable -- ElementTree elements with no
            # children are falsy, so a valid match could be skipped.
            gm = gm1 if gm1 is not None else gm2

        # if we have not a xmlns reference the search fails
        if gm is None:
            gm1 = mdtree.find('MD_Metadata')
            gm2 = mdtree.find('MI_Metadata')
            # FIX: the fallback results were found but never assigned to gm.
            gm = gm1 if gm1 is not None else gm2
        if gm is None:
            # removed undefined tg:"{tg}"
            error = f'Unable to find MD_Metadata. \n\n: \t{str_xml[:150]} \n\n mdtree.root: {mdtree.tag}'
            raise Exception(error)

        try:
            res = xet.tostring(gm)
        except Exception as e:
            error = f'{e}\n\n - gm1:{gm1} gm2:{gm2}\n\n Unable to string. \n\n: \t{str_xml[:150]} \n\n mdtree.root: {mdtree.tag}'
            raise Exception(error)

        if type(res) != str:
            res = res.decode('utf-8')
        return res

    def md_metadata_to_dict(self, mdm):
        """Flatten an owslib MD_Metadata object into a plain dict."""
        # analyze an md_metadata object
        ret = {}
        ret['content'] = self.process_xml(raw_xml=mdm.xml)
        res = '<?xml version="1.0" encoding="UTF-8"?>\n{}'.format(
            ret['content'])
        ret['xml'] = res

        ret['identifier'] = mdm.identifier
        ret['parentidentifier'] = mdm.parentidentifier
        ret['language'] = mdm.language
        ret['dataseturi'] = mdm.dataseturi
        ret['languagecode'] = mdm.languagecode
        ret['datestamp'] = mdm.datestamp
        ret['charset'] = mdm.charset
        ret['hierarchy'] = mdm.hierarchy
        ret['contact'] = []
        for ctc in mdm.contact:
            contact = {
                'name': ctc.name,
                'organization': ctc.organization,
                'city': ctc.city,
                'email': ctc.email,
                'country': ctc.country
            }
            ret['contact'].append(contact)
        ret['datetimestamp'] = mdm.datetimestamp
        ret['stdname'] = mdm.stdname
        ret['stdver'] = mdm.stdver
        ret['locales'] = []
        for lo in mdm.locales:
            ret['locales'].append({
                'id': lo.id,
                'languagecode': lo.languagecode,
                'charset': lo.charset
            })
        # ret['referencesystem'] = mdm.referencesystem
        # this two will be reemplaced by "identificationinfo"
        # ret['identification'] = mdm.identification
        # ret['serviceidentification'] = mdm.serviceidentification
        ret['identificationinfo'] = []
        for ii in mdm.identificationinfo:
            iid = {
                'title': ii.title,
                'abstract': ii.abstract
            }
            # there are much more info
            ret['identificationinfo'].append(iid)
        ret['contentinfo'] = []
        for ci in mdm.contentinfo:
            cid = {'xml': ci.xml}
            # there are much more info
            ret['contentinfo'].append(cid)
        ret['distribution'] = {}
        if mdm.distribution is not None:
            dd = {
                'format': mdm.distribution.format,
                'version': mdm.distribution.version
            }
            # there are much more info
            ret['distribution'] = dd
        # TODO ret['dataquality'] = mdm.dataquality
        return ret

    def read_csw_info(self):
        """Read service-level metadata from the connected CSW into csw_info."""
        csw_info = {}
        service = self.csw
        # Check each service instance conforms to OWSLib interface
        service.alias = 'CSW'
        csw_info['version'] = service.version
        csw_info['identification'] = {}  # service.identification
        csw_info['identification']['type'] = service.identification.type
        csw_info['identification']['version'] = service.identification.version
        csw_info['identification']['title'] = service.identification.title
        csw_info['identification'][
            'abstract'] = service.identification.abstract
        csw_info['identification'][
            'keywords'] = service.identification.keywords
        csw_info['identification'][
            'accessconstraints'] = service.identification.accessconstraints
        csw_info['identification']['fees'] = service.identification.fees
        csw_info['provider'] = {}
        csw_info['provider']['name'] = service.provider.name
        csw_info['provider']['url'] = service.provider.url
        ctc = service.provider.contact
        contact = {
            'name': ctc.name,
            'organization': ctc.organization,
            'site': ctc.site,
            'instructions': ctc.instructions,
            'email': ctc.email,
            'country': ctc.country
        }
        csw_info['provider']['contact'] = contact

        csw_info['operations'] = []
        for op in service.operations:
            methods = op.methods
            for method in methods:
                if type(method) == dict:
                    constraints = []
                    for k, v in method.items():
                        if k == 'constraints':
                            for c in v:
                                if type(c) == dict:
                                    constraints.append(c)
                                else:
                                    mc = {'name': c.name, 'values': c.values}
                                    constraints.append(mc)
                            method['constraints'] = constraints
            operation = {
                'name': op.name,
                'formatOptions': op.formatOptions,
                'methods': methods
            }
            csw_info['operations'].append(operation)

        self.csw_info.update(csw_info)
        return self.csw_info

    def get_original_url(self, harvest_id=None):
        """Build the GetRecordById URL for this source (optionally for one id)."""
        # take the URL and add required params
        parts = urlparse(self.url)
        params = {
            'SERVICE': 'CSW',
            'VERSION': '2.0.2',
            'REQUEST': 'GetRecordById',
            'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
            'OUTPUTFORMAT': 'application/xml',
        }
        if harvest_id is not None:
            params['ID'] = harvest_id

        url = urlunparse((parts.scheme, parts.netloc, parts.path,
                          None, urlencode(params), None))
        return url

    def validate(self):
        # TODO
        return True

    def remove_duplicated_identifiers(self):
        """Drop datasets whose identifier was already seen; return duplicates."""
        unique_identifiers = []
        # FIX: iterate over a copy -- removing items from the list being
        # iterated skips the element that follows each removal.
        for dataset in list(self.datasets):
            idf = dataset['identifier']
            if idf not in unique_identifiers:
                unique_identifiers.append(idf)
            else:
                self.duplicates.append(idf)
                self.datasets.remove(dataset)
        return self.duplicates

    def count_resources(self):
        """ read all datasets and count resources """
        total = 0
        for dataset in self.datasets:
            pass  # TODO
        return total
def get_by_uuid(self, uuid):
    """Return the first catalogue record matching *uuid*, else None."""
    endpoint = CatalogueServiceWeb(self.base + "srv/en/csw")
    endpoint.getrecordbyid([uuid], outputschema=namespaces["gmd"])
    matches = endpoint.records
    return matches.values()[0] if len(matches) > 0 else None
class CSWRepository(HarvestRepository):
    """ CSW Repository """

    def setRepoParams(self, repoParams):
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except Exception:
            # Endpoint unreachable or invalid; callers check for None before
            # using the connection (narrowed from a bare "except:").
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        """Harvest all record headers from the CSW endpoint into the database."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)
        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return

        item_count = 0
        while True:
            try:
                self.cswrepo.getrecords2(startposition=self.cswrepo.results['nextrecord'])
            except Exception:
                # On the first pass self.cswrepo.results does not exist yet,
                # so fall back to an initial request (narrowed from bare except).
                self.cswrepo.getrecords2()
            for rec in self.cswrepo.records:
                result = self.db.write_header(self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count,
                                     self.formatter.humanize(tdelta), item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break
        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map a CSW record onto the flat OAI-style dict used by the writer."""
        record = {}
        record["title"] = csw_record.title
        record["description"] = csw_record.abstract
        record["tags"] = csw_record.subjects
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["contact"] = self.contact
        record["series"] = ""
        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                # FIX: time.clock() was removed in Python 3.8;
                # time.perf_counter() is the documented replacement.
                elapsed = time.perf_counter() - last_time_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
                lock.release()
                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret
            return rate_limited_function
        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Refresh one record: fetch metadata, add dates, write to the DB."""
        if self.cswrepo is None:
            return
        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record, record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(id=[record['local_identifier']],
                                       outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[record['local_identifier']].datestamp
            oai_record["pub_date"] = re.sub("[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                                            oai_record["pub_date"])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id,
                                     self.metadataprefix.lower(),
                                     self.domain_metadata)
                return True
        else:
            # This record was deleted
            self.db.delete_record(record)
            return True
        return False
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""

    csw = None  # lazily-created owslib CatalogueServiceWeb client

    def __init__(self, url, **kwargs):
        """
        url -- endpoint of the remote CSW server (required).
        Optional keyword args: frequency, title, node_admin_id.
        """
        # A URL must be given; strip it down to scheme + domain + path.
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", ""))
        # frequency should be one of manual|daily|weekly|monthly
        self.frequency = kwargs.get('frequency', 'manual')
        # A title for bookkeeping
        self.title = kwargs.get('title', 'No Title Was Given')
        # Foreign Key to a responsible_party who maintains the remote node
        self.node_admin_id = kwargs.get('node_admin_id', None)
        # NOTE: the CSW client is created lazily via setup_csw() so that
        # constructing a HarvestNode does not hit the network.

    def setup_csw(self):
        """Instantiate the owslib CSW client for this node's URL."""
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw is None:  # FIX: identity check instead of '== None'
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        # FIX: materialize a list — dict.keys() is a view on Python 3 and
        # does not support '+='; list + extend works on both 2 and 3.
        ids = list(self.csw.records.keys())
        # FIX: print as a function call (valid on both Python 2 and 3).
        print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                       self.csw.results["matches"]))
        # Once nextrecord >= matches (or the server reports 0), we have everything.
        while self.csw.results["nextrecord"] < self.csw.results["matches"] \
                and self.csw.results["nextrecord"] != 0:
            # Get another set, starting from next_record from previous response
            self.get_records(self.csw.results["nextrecord"],
                             self.csw.results["returned"])
            ids.extend(self.csw.records.keys())  # Add new ids to the array
            print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                           self.csw.results["matches"]))
        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print("Gathered %s IDs" % str(len(ids)))
        for record_id in ids:
            self.get_record_by_id(record_id)
            # FIX: dropped the unused "rec =" binding; from_md_metadata is
            # called for its side effects (it builds/persists the record).
            HarvestedRecord.from_md_metadata(self.csw.records[record_id], self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd",
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief",
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
def main():
    """Harvest the Grand Lyon CSW catalogue into a CKAN repository.

    Python 2 script (iteritems/iterkeys, u'' literals, SafeConfigParser).
    Reads a config file, pages through the remote CSW endpoint, converts
    each record into a CKAN package dict, and (unless --dry-run) pushes
    the packages through the helpers.Harvester wrapper.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args  # module-level so other helpers in this file can read the flags
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)
    # "here" lets the config file use %(here)s interpolation for relative paths.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    # Validate/convert the [Etalab-CKAN-Harvesters] section; every key is required.
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)
    harvester = helpers.Harvester(
        supplier_abbreviation = u'gl',
        supplier_title = u"Grand Lyon",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://catalogue.data.grandlyon.com/geosource/srv/fr/csw'
    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    # Page through GetRecords; when a page fails, narrow to one record at a
    # time to locate and skip the single bad record, then widen again.
    # NOTE(review): the bare "except:" also catches KeyboardInterrupt — confirm intended.
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except:
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            # A non-advancing cursor means the server has no more pages.
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source. The sets below only accumulate values
    # for the summary log lines at the end of the run.
    formats = set()
    licenses_url = set()
    protocols = set()
    rights = set()
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        # Fetch each record twice: Dublin Core (default), then ISO 19139 (gmd)
        # for distribution/maintenance details. Both land in csw.records.
        csw.getrecordbyid(id = [record_id])
        dc_record = csw.records[record_id]
        csw.getrecordbyid(id = [record_id], outputschema = 'http://www.isotc211.org/2005/gmd')
        gmd_record = csw.records.get(record_id)
        format = dc_record.format  # NOTE(review): shadows the builtin "format"
        if format is not None:
            # Keep only the MIME/format name, dropping a parenthesized suffix.
            format = format.split(u' (', 1)[0]
        formats.add(format)
        copyright = dc_record.rights  # NOTE(review): shadows the builtin "copyright"
        if copyright and isinstance(copyright, list):
            # Lists are unhashable; tuple form also serves as a license_id key below.
            copyright = tuple(copyright)
        rights.add(copyright)
        # Extract the update frequency (ISO TM_PeriodDuration) from the gmd XML.
        if gmd_record is None:
            frequency = None
        else:
            for frequency_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
                    '/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation'
                    '/gmd:userDefinedMaintenanceFrequency/gts:TM_PeriodDuration',
                    namespaces = namespaces):
                frequency = frequency_xml.text
                break
            else:
                # for/else: no matching element found.
                frequency = None
        if frequency is not None:
            assert frequency in frequency_by_code, 'Unknown frequency: {}'.format(frequency)
            frequency = frequency_by_code[frequency]
        for uri in dc_record.uris:
            if uri['url'].startswith('http://opendata.data.grandlyon.com/Licence'):
                licenses_url.add(uri['url'])
            protocols.add(uri['protocol'])
        # 'OpenData' is a catch-all subject on every record; drop it.
        subjects = [
            subject
            for subject in dc_record.subjects
            if subject != 'OpenData'
            ]
        # NOTE(review): "groups" and "tags" are built (and upsert_group has
        # side effects on the target CKAN) but neither variable is referenced
        # again — the package dict below rebuilds its own tags. Confirm intended.
        groups = [
            harvester.upsert_group(dict(
                title = subjects[0],
                )),
            ] if subjects else []
        groups.append(harvester.upsert_group(dict(
            title = u'Territoires et Transports',
            )))
        tags = [
            dict(name = strings.slugify(subject))
            for subject in subjects
            ]
        related = []
        # Build the resource list: from Dublin Core URIs when no gmd record
        # exists, otherwise from the gmd distribution/online entries.
        if gmd_record is None:
            resources = [
                dict(
                    description = uri.get('description') or None,
                    format = {
                        'application/pdf': 'PDF',
                        'application/zip': 'ZIP',
                        'pdf': 'PDF',
                        'text/csv': 'CSV',
                        'text/plain': 'TXT',
                        }.get(format, format),
                    name = uri.get('name') or None,
                    url = uri['url'],
                    )
                for uri in dc_record.uris
                if uri.get('protocol') in ('WWW:DOWNLOAD-1.0-http--download', 'WWW:LINK-1.0-http--link')
                and uri['url'].startswith('http://opendata.data.grandlyon.com/')
                and uri['url'] != 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf'
                ]
        else:
            kml_resource = False
            resources = []
            for online in gmd_record.distribution.online:
                # Skip internal/unusable links and the license PDF.
                if online.url.startswith((
                        'http://catalogue.data.grandlyon.com/geosource/srv/en/resources.get?id=',
                        'file:',
                        'jdbc:',
                        )) \
                        or online.url == 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf':
                    continue
                if online.protocol == 'OGC:WFS':
                    # Add a single KML listing resource per dataset.
                    if not kml_resource:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'KML',
                            name = online.name or None,
                            url = 'http://kml.data.grandlyon.com/grandlyon/?request=list&typename={}'.format(
                                online.name),
                            ))
                        kml_resource = True
                    # Bare endpoint (no query string): derive a GetFeature URL.
                    if '?' not in online.url:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'GML',
                            name = online.name or None,
                            url = u'{}?SERVICE={}&REQUEST=GetFeature&VERSION=1.1.0&typename={}'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name),
                            ))
                elif online.protocol == 'OGC:WMS':
                    # Bare WMS endpoint: derive a GetMap thumbnail as "related".
                    if '?' not in online.url:
                        bounding_box = gmd_record.identification.extent.boundingBox
                        related.append(dict(
                            image_url = u'{}?SERVICE={}&REQUEST=GetMap&VERSION=1.1.1&LAYERS={}&FORMAT=image/png'
                                u'&SRS=EPSG:4326&BBOX={},{},{},{}&WIDTH=400&HEIGHT=300'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name, bounding_box.minx, bounding_box.miny,
                                bounding_box.maxx, bounding_box.maxy),
                            title = u'Vignette',
                            type = u'visualization',
                            # url = None,
                            ))
                # Every surviving link also becomes a plain resource; the dict
                # raises KeyError on protocols missing from this mapping.
                resources.append(dict(
                    description = online.description or None,
                    format = {
                        'DB:POSTGIS': 'POSTGIS',
                        'FILE:RASTER': 'RASTER',
                        'OGC:WCS': 'WCS',
                        'OGC:WFS': 'WFS',
                        'OGC:WMS': 'WMS',
                        'WWW:DOWNLOAD-1.0-http--download': None,
                        'WWW:LINK-1.0-http--link': None,
                        }[online.protocol],
                    name = online.name or None,
                    url = online.url,
                    ))
        temporals.add(dc_record.temporal)
        types.add(dc_record.type)
        if args.dry_run:
            log.info(u'Harvested package: {}'.format(dc_record.title))
        else:
            package = dict(
                frequency = {
                    'P0Y0M0DT0H1M0S': u"ponctuelle",
                    }.get(frequency),
                license_id = {
                    'copyright': None,
                    ('Licence ODbL GRAND LYON', u"Pas de restriction d'accès public"): u'odc-odbl',
                    'license': None,
                    }.get(copyright),
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        dc_record.abstract,
                        dc_record.source,
                        )
                    if fragment
                    ),
                resources = resources,
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in dc_record.subjects
                    ],
                # territorial_coverage = TODO
                title = dc_record.title,
                # TODO: Use this URL once Grand Lyon is ready to use it. Before end of year.
                # url = u'http://smartdata.grandlyon.com/single/{}'.format(record_id),
                url = u'http://smartdata.grandlyon.com/',
                )
#            if gmd_record is not None:
#                for graphic_filename_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
#                        '/gmd:MD_DataIdentification/gmd:graphicOverview'
#                        '/gmd:MD_BrowseGraphic[gmd:fileDescription/gco:CharacterString="large_thumbnail"]'
#                        '/gmd:fileName/gco:CharacterString',
#                        namespaces = namespaces):
#                    related.append(dict(
#                        image_url = urlparse.urljoin(base_url, unicode(graphic_filename_xml.text)),
#                        title = u'Vignette',
#                        type = u'visualization',
#                        # url = TODO,
#                        ))
            log.info(u'Harvested package: {}'.format(package['title']))
            harvester.add_package(package, harvester.supplier, dc_record.title, package['url'],
                related = related or None)

    if not args.dry_run:
        harvester.update_target()

    # Summary of everything seen during the run, for eyeballing new values.
    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Licenses: {}'.format(sorted(licenses_url)))
    log.info(u'Protocols: {}'.format(sorted(protocols)))
    log.info(u'Rights: {}'.format(sorted(rights)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))
    return 0