class GeocatCatalogueServiceWeb(object):
    """Thin wrapper around an OWSLib CSW client for the geocat catalogue."""

    def __init__(self, url):
        self.csw = CatalogueServiceWeb(url)
        self.schema = CHE_SCHEMA

    def get_geocat_id_from_csw(self, cqlquery=CQL_QUERY_DEFAULT,
                               cqlterm=CQL_SEARCH_TERM_DEFAULT):
        """Return all record identifiers matching the harvest query.

        Pages through the catalogue 50 records at a time until the server
        stops reporting a next page.

        :raises CswNotFoundError: when the server returned no response or
            the query matched nothing.
        """
        harvest_query = PropertyIsEqualTo(cqlquery, cqlterm)
        nextrecord = 0
        record_ids = []
        while nextrecord is not None:
            self.csw.getrecords2(constraints=[harvest_query], maxrecords=50,
                                 startposition=nextrecord)
            if self.csw.response is None or self.csw.results['matches'] == 0:
                raise CswNotFoundError(
                    "No dataset found for harvest query {}".format(
                        harvest_query))
            # Fix: when a page comes back empty ('returned' == 0) the
            # original never updated nextrecord and looped forever.
            if (self.csw.results['returned'] > 0
                    and self.csw.results['nextrecord'] > 0):
                nextrecord = self.csw.results['nextrecord']
            else:
                nextrecord = None
            record_ids.extend(self.csw.records.keys())
        return record_ids

    def get_record_by_id(self, geocat_id):
        """Return the raw XML response for one record, or None if empty."""
        self.csw.getrecordbyid(id=[geocat_id], outputschema=self.schema)
        # Normalise an empty/None response to None.
        return self.csw.response or None
def mdcount(self, cswurl, constraints=None, startrecord=0, maxharvest=10):
    """Queries the csw and count md matching constraints.

    :param cswurl: CSW endpoint URL
    :param constraints: optional list of FES filters (defaults to no filter)
    :param startrecord: start position for the query
    :param maxharvest: maximum number of records to ask for
    :return: the OWSLib ``results`` dict ('matches', 'returned', ...)
    """
    # Fix: avoid a mutable default argument.
    if constraints is None:
        constraints = []
    csw = CatalogueServiceWeb(cswurl, skip_caps=True)
    # resulttype="hits" asks the server for counts only, no record bodies.
    csw.getrecords2(
        esn="brief",
        constraints=constraints,
        startposition=startrecord,
        maxrecords=maxharvest,
        resulttype="hits"
    )
    return csw.results
def test_GetRecords_summary(self):
    """Request five records in the GMD schema with the summary element set."""
    client = CatalogueServiceWeb(service)
    client.getrecords2(outputschema=GMD, startposition=1,
                       maxrecords=5, esn="summary")
    nrecords = len(client.records)
def get_datasets_from_csw(csw_endpoint, extended):
    # Fetch up to 1000 full records from the CSW endpoint. When `extended`
    # is true, re-query in the ISO (gmd) schema and graft extra attributes
    # onto the already-parsed records via XPath lookups.
    print 'Retrieving data sets from %s' % csw_endpoint
    csw = CatalogueServiceWeb(csw_endpoint)
    csw.getrecords2(esn='full', maxrecords=1000)
    parsed_records = csw.records.values()
    if extended:
        #request the data a second time in an encoding with different information
        #manually parse the second response and join the information in to the existing records
        csw.getrecords2(esn='full', maxrecords=1000,
                        outputschema=namespaces['gmd'])
        unparsed_response = csw.response
        root = etree.XML(unparsed_response)
        for record in parsed_records:
            record_id = record.identifier
            # Build the XPath prefix locating this record in the gmd response.
            xpath_record_fragment = record_xpath_template.format(record_id)
            for attribute, xpath_fragment in attribute_to_xpath_fragment.iteritems():
                xpath_attribute_fragment = xpath_record_fragment + xpath_fragment
                value = root.xpath(xpath_attribute_fragment,
                                   namespaces=namespaces)
                if len(value) != 0:
                    #unpack list
                    value = value[0]
                else:
                    value = ""
                # Attach the harvested value directly onto the record object.
                setattr(record, attribute, value)
            print(vars(record))
    return {
        'datasets' : parsed_records,
    }
def test_GetRecords_dataset(self):
    """Request five GMD records restricted to dc:type == 'dataset'."""
    client = CatalogueServiceWeb(service)
    dataset_only = [PropertyIsEqualTo("dc:type", "dataset")]
    client.getrecords2(constraints=dataset_only, outputschema=GMD,
                       startposition=1, maxrecords=5)
    nrecords = len(client.records)
def deleteMetadata(name, login, password, url, workspace, verbose=False):
    #takes a keyword and delete metadata associated
    url_csw = url + "/geonetwork/srv/fre/csw-publication"
    keyword = workspace + ":" + name
    # keyword = geosync-restreint:baies_metadata__baies_metadata
    # Connect to a CSW, and inspect its properties:
    from owslib.csw import CatalogueServiceWeb
    csw = CatalogueServiceWeb(url_csw, skip_caps=True,
                              username=login, password=password)
    # delete the metadata records related to "keyword" / the geoserver layer
    # TODO do this without owslib, through the REST API instead
    # e.g. https://georchestra-mshe.univ-fcomte.fr/geonetwork/srv/fre/xml.search?any=geosync-ouvert:fouilles_chailluz__mobilier_pros
    # this search is strict (exact match): searching for 'myworkspace:foo'
    # does not find 'myworkspace:foobar'
    from owslib.fes import PropertyIsEqualTo, PropertyIsLike
    myquery = PropertyIsEqualTo('csw:AnyText', keyword)
    # TODO check that the search really is strict, otherwise unwanted results
    # could be returned (e.g. 'myworkspace:foobar' when searching 'myworkspace:foo')
    if verbose:
        print "INFO CatalogueServiceWeb(" + url_csw + ",...) PropertyIsEqualTo('csw:AnyText'," + keyword + ")"
    csw.getrecords2(constraints=[myquery], maxrecords=10)
    resultat = csw.results
    if verbose:
        print "INFO " + str(csw.results['matches']) + " fiche(s) de metadata trouvée(s)"
    if csw.results['matches'] > 1:
        print "WARNING plus d'1 fiche de metadata trouvée pour " + keyword
    result = True  # if at least 1 deletion fails, False is returned
    for rec in csw.records:
        try:
            csw.transaction(
                ttype='delete',
                typename='MD_Metadata',
                identifier=csw.records[rec].identifier
            )  # apparently works for metadata records of type gmd:MD_Metadata
            result = True and result
        except Exception as e:
            sys.stderr.write("ERROR suppression de metadata pour " + keyword + " échouée : " + e.message + "\n")
            print "ERROR suppression de metadata pour " + keyword + " échouée"
            result = False
        try:
            identifier = csw.records[rec].identifier  # raises if the record has no identifier
            # titre=csw.records[rec].title
            print "OK suppression de metadata " + keyword + " (" + identifier + ") réussie"
        except Exception as e:
            print "OK suppression de metadata réussie"
            print "WARNING " + e.message
    return result
def test_GetRecords(self):
    """Paged GetRecords in the GMD schema; currently skipped (no fixtures)."""
    # NB: This test fails because no records have been setup...
    raise SkipTest()  # therefore skip
    # Everything below is unreachable until fixture records exist.
    conn = CatalogueServiceWeb(service)
    conn.getrecords2(outputschema=GMD, startposition=1, maxrecords=5)
    nrecords = len(conn.records)
    assert nrecords == 5, nrecords
    for ident in conn.records:
        identifiers.append(ident)
        assert isinstance(conn.records[ident], MD_Metadata), \
            (ident, conn.records[ident])
def test_GetRecords(self):
    # No fixture records exist on the test server yet, so bail out early.
    raise SkipTest()  # therefore skip
    csw_client = CatalogueServiceWeb(service)
    csw_client.getrecords2(outputschema=GMD, startposition=1, maxrecords=5)
    record_count = len(csw_client.records)
    assert record_count == 5, record_count
    for ident in csw_client.records:
        identifiers.append(ident)
        record = csw_client.records[ident]
        assert isinstance(record, MD_Metadata), (ident, record)
def csw_ajax(request, *args, **kwargs):
    """AJAX endpoint: page through a CSW catalogue and return JSON records.

    Reads the CSW connection details (URL, credentials, keyword filter) from
    the Django session, runs a paged GetRecords query in the ISO schema and
    returns the matched records plus total counts as JSON.
    """
    if request.method == 'GET':
        csw_url = request.session['csw_url']
        user = request.session['user']
        password = request.session['password']
        keywords = request.session['keywords']
        # Substring match on any text field: '%keyword%'.
        keywords_query = [fes.PropertyIsLike(
            'csw:AnyText', '%%%s%%' % keywords)]
        if not csw_url:
            return HttpResponseServerError()
        try:
            csw = CatalogueServiceWeb(
                csw_url, username=user, password=password)
            result = csw.identification.type
            if result == 'CSW':
                # Pagination parameters supplied by the client.
                offset = int(request.GET['offset'])
                per_page = int(request.GET['perPage'])
                csw.getrecords2(
                    typenames='gmd:MD_Metadata',
                    esn='full',
                    outputschema='http://www.isotc211.org/2005/gmd',
                    constraints=keywords_query,
                    startposition=offset,
                    maxrecords=per_page)
                result = []
                for key in csw.records:
                    rec = csw.records[key]
                    res = {}
                    # Only ISO metadata records are exposed to the client.
                    if isinstance(rec, MD_Metadata):
                        res['id'] = rec.identifier
                        res['title'] = rec.identification.title
                        res['inasafe_keywords'] = rec.identification.\
                            supplementalinformation
                        if res['inasafe_keywords']:
                            # presumably marks layers published with InaSAFE
                            # keyword metadata — TODO confirm with publisher
                            res['inasafe_layer'] = (
                                '<inasafe_keywords/>' in
                                res['inasafe_keywords'])
                        result.append(res)
                json_result = {
                    'records': result,
                    'queryRecordCount': csw.results['matches'],
                    'totalRecordCount': csw.results['matches']
                }
                return JsonResponse(json_result, safe=False)
        except Exception as e:
            LOGGER.exception(e)
            return HttpResponseServerError()
    return HttpResponseServerError()
def mdsearch(self, cswurl, esn="summary", constraints=None, startrecord=0, maxrecords=10, maxharvest=20):
    """Queries a csw to retrieve md ids matching constraints.

    :param cswurl: CSW endpoint URL
    :param esn: element-set name of the returned records
    :param constraints: optional list of FES filters
    :param startrecord: first record position
    :param maxrecords: page size per request
    :param maxharvest: hard cap on the number of records fetched
    :return: dict mapping record id -> record object
    """
    # Fixes: mutable default argument replaced by a None sentinel, and the
    # docstring moved to the top (it previously sat after the first
    # statement and was a no-op string expression).
    if constraints is None:
        constraints = []
    tstart = datetime.datetime.now()
    records = {}
    logging.info("searching max %s md from %s" % (maxharvest, cswurl))
    csw = CatalogueServiceWeb(cswurl, skip_caps=True)
    first = True
    nextrecord = startrecord
    count = 0
    while True:
        if not first:
            nextrecord = csw.results["nextrecord"]
        if count + maxrecords > maxharvest:
            maxrecords = maxharvest - count  # retrieve exactly maxharvest md
        csw.getrecords2(
            esn=esn,
            constraints=constraints,
            startposition=nextrecord,
            maxrecords=maxrecords,
            outputschema=self.OUTPUTSCHEMA,
        )
        if csw.results["matches"] == 0:
            logging.info("0 md found from %s" % cswurl)
            break
        else:
            first = False
        # fetch records
        for rec_id, rec in csw.records.iteritems():
            count += 1
            percent = int(float(count) / min(maxharvest, csw.results["matches"]) * 100)
            logging.debug("%s%% %s" % (percent, rec_id))
            records[rec_id] = rec
        # get out if no records or beyond maxrecords
        if (
            csw.results["nextrecord"] == 0
            or csw.results["returned"] == 0
            or csw.results["nextrecord"] > csw.results["matches"]
            or csw.results["nextrecord"] > maxharvest
        ):
            d = (datetime.datetime.now() - tstart).total_seconds()
            logging.info("%s md found from %s in %d s" % (count, cswurl, d))
            break
    return records
def test_default_csw_connections():
    """test that the default CSW connections work"""
    relpath = 'resources%sconnections-default.xml' % os.sep
    csw_connections_xml = options.base.plugin / relpath
    conns = etree.parse(csw_connections_xml)
    for conn in conns.findall('csw'):
        try:
            csw = CatalogueServiceWeb(conn.attrib.get('url'))
            info('Success: %s', csw.identification.title)
            csw.getrecords2()
        except Exception as err:
            # Fix: ValueError('ERROR: %s', err) passed logging-style args
            # that were never interpolated into the message.
            raise ValueError('ERROR: %s' % err)
def test_bbox(endpoints,bbox):
    # Probe each CSW endpoint for spatial (BBOX) filter support and report
    # how many datasets fall inside `bbox`.
    for title,url in endpoints.iteritems():
        try:
            csw = CatalogueServiceWeb(url, timeout=40)
            if "BBOX" in csw.filters.spatial_operators:
                filter_list = [fes.BBox(bbox)]
                try:
                    csw.getrecords2(constraints=filter_list, maxrecords=1000)
                    print("%s : Datasets = %d" % (title,len(csw.records.keys())))
                except Exception:
                    print "%s : BBOX Query FAILS" % title
            else:
                print "%s - BBOX Query NOT supported" % title
        except Exception:
            # NOTE(review): any connection failure lands here, not only
            # timeouts, despite the message.
            print "%s - Timed out" % title
def test_default_csw_connections():
    """test that the default CSW connections work"""
    relpath = 'resources%sconnections-default.xml' % os.sep
    csw_connections_xml = options.base.plugin / relpath
    conns = etree.parse(csw_connections_xml)
    for conn in conns.findall('csw'):
        try:
            csw = CatalogueServiceWeb(conn.attrib.get('url'))
            info('Success: %s', csw.identification.title)
            csw.getrecords2()
        except Exception as err:
            # Fix: the tuple-style ValueError('ERROR: %s', err) never
            # interpolated err into the message.
            raise ValueError('ERROR: %s' % err)
def test_csw_skgeodsy():
    """Smoke-test the SK geodesy CSW: capabilities plus a basic GetRecords."""
    client = CatalogueServiceWeb(SERVICE_URL)
    expected_ops = ['DescribeRecord', 'GetCapabilities', 'GetRecordById',
                    'GetRecords', 'Transaction']
    assert sorted(op.name for op in client.operations) == expected_ops
    records_op = client.get_operation_by_name('GetRecords')
    assert records_op.name == 'GetRecords'
    client.getrecords2(typenames='csw:Record gmd:MD_Metadata')
    for counter in ('returned', 'nextrecord', 'matches'):
        assert client.results.get(counter) > 0
def test_bbox(endpoints, bbox):
    # Probe each CSW endpoint for spatial (BBOX) filter support and report
    # how many datasets fall inside `bbox`.
    for title, url in endpoints.iteritems():
        try:
            csw = CatalogueServiceWeb(url, timeout=40)
            if "BBOX" in csw.filters.spatial_operators:
                filter_list = [fes.BBox(bbox)]
                try:
                    csw.getrecords2(constraints=filter_list, maxrecords=1000)
                    print("%s : Datasets = %d" % (title, len(csw.records.keys())))
                except Exception:
                    print "%s : BBOX Query FAILS" % title
            else:
                print "%s - BBOX Query NOT supported" % title
        except Exception:
            # NOTE(review): any connection failure lands here, not only
            # timeouts, despite the message.
            print "%s - Timed out" % title
def csw_query_metadata_by_id(csw_url, identifier, username=None, password=None):
    """Look up a single ISO metadata record by its dc:identifier.

    Returns the last matching record object, or None when the endpoint is
    not a CSW or nothing matches.
    """
    catalogue = CatalogueServiceWeb(csw_url, username=username,
                                    password=password)
    record = None
    if catalogue.identification.type == 'CSW':
        id_filter = [fes.PropertyIsEqualTo('dc:identifier', identifier)]
        catalogue.getrecords2(
            typenames='gmd:MD_Metadata',
            esn='full',
            outputschema='http://www.isotc211.org/2005/gmd',
            constraints=id_filter)
        for key in catalogue.records:
            record = catalogue.records[key]
    return record
def mdsearch(self, cswurl, esn='summary', constraints=None, startrecord=0, maxrecords=10, maxharvest=20):
    """Queries a csw to retrieve md ids matching constraints.

    :param cswurl: CSW endpoint URL
    :param esn: element-set name of the returned records
    :param constraints: optional list of FES filters
    :param startrecord: first record position
    :param maxrecords: page size per request
    :param maxharvest: hard cap on the number of records fetched
    :return: dict mapping record id -> record object
    """
    # Fixes: mutable default argument replaced by a None sentinel, and the
    # docstring moved to the top (it previously sat after the first
    # statement and was a no-op string expression).
    if constraints is None:
        constraints = []
    tstart = datetime.datetime.now()
    records = {}
    logging.info('searching max %s md from %s' % (maxharvest, cswurl))
    csw = CatalogueServiceWeb(cswurl, skip_caps=True)
    first = True
    nextrecord = startrecord
    count = 0
    while True:
        if not first:
            nextrecord = csw.results['nextrecord']
        if count + maxrecords > maxharvest:
            maxrecords = maxharvest - count  # retrieve exactly maxharvest md
        csw.getrecords2(
            esn=esn,
            constraints=constraints,
            startposition=nextrecord,
            maxrecords=maxrecords,
            outputschema=self.OUTPUTSCHEMA)
        if csw.results['matches'] == 0:
            logging.info('0 md found from %s' % cswurl)
            break
        else:
            first = False
        # fetch records
        for rec_id, rec in csw.records.iteritems():
            count += 1
            percent = int(float(count)/min(maxharvest, csw.results['matches'])*100)
            logging.debug('%s%% %s' % (percent, rec_id))
            records[rec_id] = rec
        # get out if no records or beyond maxrecords
        if csw.results['nextrecord'] == 0 \
                or csw.results['returned'] == 0 \
                or csw.results['nextrecord'] > csw.results['matches'] \
                or csw.results['nextrecord'] > maxharvest:
            d = (datetime.datetime.now() - tstart).total_seconds()
            logging.info('%s md found from %s in %d s' % (count, cswurl, d))
            break
    return records
def get_uuid_from_title(csw_url, title):
    '''
    Function to return OWSLib CSW record record from specified CSW URL using title as the search criterion
    Sample UUID: 221dcfd8-03d7-5083-e053-10a3070a64e3
    '''
    MAXRECORDS = 200
    uuid = None
    csw = CatalogueServiceWeb(csw_url)
    assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url
    search_title = title.replace('_', '%')
    while search_title and len(title) - len(search_title) < 10 and not uuid:
        title_query = PropertyIsEqualTo('csw:Title', '%' + search_title + '%')
        csw.getrecords2(constraints=[title_query],
                        esn='summary',
                        maxrecords=MAXRECORDS)
        if not csw.records:
            # No records found - broaden search by shortening title
            search_title = search_title[0:-1]
            continue
        # Strip all non-alphanumeric characters from title
        # (fix: raw strings for the regex patterns)
        alphanumeric_title = re.sub(r'\W', '', title)
        while True:
            uuid_list = [
                identifier for identifier in csw.records.keys()
                if alphanumeric_title in re.sub(r'\W', '',
                                                csw.records[identifier].title)
            ]
            if len(uuid_list) == 1:  # Unique match found
                uuid = uuid_list[0]
                logger.info('UUID %s found from title characters', uuid)
                break
            if uuid_list or not alphanumeric_title:
                # Fix: multiple candidates can never become unique by further
                # shortening (shortening only broadens the match), and an
                # empty pattern matches every record — both cases previously
                # looped forever. Give up instead.
                return None
            # Broaden search by shortening munged_title
            alphanumeric_title = alphanumeric_title[0:-1]
    return uuid
def get_csw_results(protocol, maxresults=0):
    """Collect metadata ids for PDOK services with the given protocol.

    :param protocol: service protocol to filter on (e.g. 'OGC:WMS')
    :param maxresults: maximum number of ids to return; 0 means "no limit"
    :return: list of dicts with 'mdId' and 'protocol' keys
    """
    csw = CatalogueServiceWeb(CSW_URL)
    svc_owner = "Beheer PDOK"
    query = (
        f"type='service' AND organisationName='{svc_owner}' AND protocol='{protocol}'"
    )
    md_ids = []
    start = 1
    # Page size: at most 100, but don't over-fetch small requests.
    maxrecord = maxresults if (maxresults < 100 and maxresults != 0) else 100
    while True:
        csw.getrecords2(maxrecords=maxrecord, cql=query, startposition=start)
        md_ids.extend([{"mdId": rec, "protocol": protocol} for rec in csw.records])
        # Fix: with maxresults == 0 ("unlimited") the original condition
        # `len(md_ids) >= maxresults` was always true, so only the first
        # page was ever returned. Also trim to the requested cap.
        if maxresults and len(md_ids) >= maxresults:
            return md_ids[:maxresults]
        if csw.results["nextrecord"] != 0:
            start = csw.results["nextrecord"]
            continue
        break
    return md_ids
def initialize(self):
    """Harvest every record from the source CSW and register it as an item.

    Asks for a single record first to learn the total number of matches,
    then pages through the catalogue 100 records at a time.
    """
    csw = CatalogueServiceWeb(self.source.url)
    csw.getrecords2(maxrecords=1)
    matches = csw.results.get("matches")
    startposition = 0
    while startposition <= matches:
        csw.getrecords2(maxrecords=100, startposition=startposition)
        for rec in csw.records:
            record = csw.records[rec]
            item = {
                "id": record.identifier,
                "title": record.title,
                "description": record.abstract,
                "url": record.references[0].get('url'),
                "type": record.type,
            }
            self.add_item(record.identifier, title=record.title, date=None,
                          item=item)
        startposition = csw.results.get('nextrecord')
        # Fix: the server reports nextrecord == 0 (or None) once the last
        # page is served; the original restarted from position 0 and
        # looped forever.
        if not startposition:
            break
def _from_csw(url):
    """Harvest all ISO records from a CSW endpoint into GeoJSON-like dicts.

    Pages through the catalogue five records at a time; each harvested
    record is converted and tagged with its origin URL and raw XML.
    """
    iso_schema = 'http://www.isotc211.org/2005/gmd'
    client = CatalogueServiceWeb(url)
    page_size = 5
    harvested = []
    first_pass = True
    while True:
        # First pass starts at 0; afterwards follow the server's paging.
        position = 0 if first_pass else client.results['nextrecord']
        client.getrecords2(esn='full', startposition=position,
                           maxrecords=page_size, outputschema=iso_schema)
        # Stop once the server signals exhaustion (checked before
        # harvesting, matching the original behaviour).
        if (client.results['nextrecord'] == 0
                or client.results['returned'] == 0
                or client.results['nextrecord'] > client.results['matches']):
            break
        for _, value in client.records.items():
            entry = convert_from_iso(value)
            entry['properties']['harvested_from'] = url
            entry['properties']['harvest_json'] = value.xml.decode()
            harvested.append(entry)
        first_pass = False
    return harvested
def get_filepaths(self, collection_id: str, spatial_extent: List[float], temporal_extent: List[str]) -> List[str]:
    """Retrieve a file list from the a CSW server according to the specified parameters.

    Arguments:
        collecion_id {str} -- identifier of the collection
        spatial_extent {List[float]} -- bounding box [ymin, xmin, ymax, xmax]
        temporal_extent {List[str]} -- e.g. ["2018-06-04", "2018-06-23"]

    Returns:
        list -- list of filepaths
    """
    catalogue = CatalogueServiceWeb(self.csw_server_uri, timeout=300)
    # Collection + spatial + temporal filters, AND-combined by nesting
    # them in a single inner list.
    query_parts = [
        PropertyIsLike(self.group_property, collection_id),
        BBox(spatial_extent),
        PropertyIsGreaterThan('apiso:TempExtent_begin', temporal_extent[0]),
        PropertyIsLessThan('apiso:TempExtent_end', temporal_extent[1]),
    ]
    catalogue.getrecords2(constraints=[query_parts], maxrecords=100)
    found = catalogue.records
    # Extract each record's first reference URL and return them sorted.
    return sorted(found[name].references[0]['url'] for name in found)
def get_uuid_from_title(csw_url, title):
    '''
    Function to return OWSLib CSW record record from specified CSW URL using title as the search criterion
    Sample UUID: 221dcfd8-03d7-5083-e053-10a3070a64e3
    '''
    MAXRECORDS = 200
    uuid = None
    csw = CatalogueServiceWeb(csw_url)
    assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url
    search_title = title.replace('_', '%')
    while search_title and len(
            title) - len(search_title) < 10 and not uuid:
        title_query = PropertyIsEqualTo(
            'csw:Title', '%' + search_title + '%')
        csw.getrecords2(
            constraints=[title_query],
            esn='summary',
            maxrecords=MAXRECORDS)
        if not csw.records:
            # No records found - broaden search by shortening title
            search_title = search_title[0:-1]
            continue
        # Strip all non-alphanumeric characters from title
        # (fix: raw strings for the regex patterns)
        alphanumeric_title = re.sub(r'\W', '', title)
        while True:
            uuid_list = [identifier for identifier in csw.records.keys()
                         if alphanumeric_title in re.sub(
                             r'\W', '', csw.records[identifier].title)]
            if len(uuid_list) == 1:  # Unique match found
                uuid = uuid_list[0]
                logger.info(
                    'UUID %s found from title characters', uuid)
                break
            if uuid_list or not alphanumeric_title:
                # Fix: multiple candidates can never become unique by further
                # shortening (shortening only broadens the match), and an
                # empty pattern matches every record — both cases previously
                # looped forever. Give up instead.
                return None
            # Broaden search by shortening munged_title
            alphanumeric_title = alphanumeric_title[0:-1]
    return uuid
def sync():
    """
    Sync a bunch of csw layers with Solr.
    """
    class cswLayer(object):
        # Plain attribute bag; fields are assigned ad hoc below.
        pass

    csw = CatalogueServiceWeb(
        'http://hh.worldmap.harvard.edu/registry/hypermap/csw')
    count = 0
    # Page through up to 2000 records, 10 at a time.
    for startposition in range(0, 2000, 10):
        csw.getrecords2(maxrecords=10, startposition=startposition)
        print csw.results
        for uuid in csw.records:
            record = csw.records[uuid]
            # Only unprojected WGS84 (EPSG:4326) records are indexed.
            if record.bbox.crs.code == 4326:
                print record.bbox.minx, record.bbox.miny, record.bbox.maxx, record.bbox.maxy
                layer = cswLayer()
                layer.bbox_x0 = float(record.bbox.minx)
                layer.bbox_y0 = float(record.bbox.miny)
                layer.bbox_x1 = float(record.bbox.maxx)
                layer.bbox_y1 = float(record.bbox.maxy)
                # Skip records whose bbox is outside valid lon/lat ranges.
                if layer.bbox_x0 >= -180 and layer.bbox_x1 <= 180 and layer.bbox_y0 >= -90 and layer.bbox_y1 <= 90:
                    layer.uuid = uuid
                    layer.name = uuid
                    layer.title = record.title
                    layer.abstract = record.abstract
                    layer.url = record.source
                    # Fields missing from CSW are filled with random values.
                    layer.date = get_random_date()
                    layer.category = get_random_category()
                    layer.regions = add_random_regions()
                    layer.keywords = add_random_keywords()
                    layer_to_solr(layer)
                    print 'Imported record %s' % count
                    count += 1
    print 'Done.'
def harvest_csw(src_url, dest_url):
    # Copy every ISO record from the source CSW into the destination CSW,
    # paging through the source `maxrecords` records at a time.
    stop = 0
    flag = 0
    maxrecords = 10
    src = CatalogueServiceWeb(src_url)
    dest = CatalogueServiceWeb(dest_url)
    while stop == 0:
        if flag == 0:
            # first run, start from 0
            startposition = 0
        else:
            # subsequent run, startposition is now paged
            startposition = src.results['nextrecord']
        src.getrecords2(esn='full', startposition=startposition, maxrecords=maxrecords)
        print(src.results)
        if src.results['nextrecord'] == 0 \
                or src.results['returned'] == 0 \
                or src.results['nextrecord'] > src.results['matches']:
            # end the loop, exhausted all records
            stop = 1
            break
        # harvest each record to destination CSW
        for i in list(src.records):
            print "insert", i
            src.getrecordbyid(id=[i], outputschema='http://www.isotc211.org/2005/gmd')
            # Pull the raw ISO element out of the parsed response tree.
            md = src._exml.find('{http://www.isotc211.org/2005/gmd}MD_Metadata')
            # NOTE(review): round-trips each record through /tmp/a.xml; the
            # read handle opened below is never closed explicitly.
            f = open('/tmp/a.xml', 'w')
            f.write(etree.tostring(md))
            f.close()
            dest.transaction(ttype='insert', typename='gmd:MD_Metadata', record=open("/tmp/a.xml").read())
        flag = 1
def show_add_layer_dialog(request, *args, **kwargs):
    """Render the 'add layer' modal for a CSW record.

    Looks the record up by dc:identifier, decides whether it is served via
    WCS or WFS from its references, extracts the service id/version from
    the OGC reference URL query string and hands the enriched record to
    the template.
    """
    if request.method == 'POST':
        csw_url = request.session['csw_url']
        user = request.session['user']
        password = request.session['password']
        layer_id = request.POST['layer_id']
        try:
            csw = CatalogueServiceWeb(csw_url, username=user,
                                      password=password)
            constraints = [fes.PropertyIsEqualTo('dc:identifier', layer_id)]
            result = csw.identification.type
            if result == 'CSW':
                csw.getrecords2(constraints=constraints)
                record = None
                # Pick the first record carrying a WCS or WFS reference.
                for key in csw.records:
                    rec = csw.records[key]
                    for ref in rec.references:
                        if 'OGC:WCS' in ref['scheme']:
                            rec.type = 'WCS'
                            rec.endpoint = ref['url']
                            record = rec
                            break
                        if 'OGC:WFS' in ref['scheme']:
                            rec.type = 'WFS'
                            rec.endpoint = ref['url']
                            record = rec
                            break
                    if record:
                        break
                if record.type == 'WCS':
                    # get describe coverage
                    # find coverage id from references
                    coverage_id = None
                    version = None
                    for ref in record.references:
                        if 'service=WCS' in ref['url']:
                            url = ref['url']
                            parse_result = urlparse.urlparse(url)
                            query = parse_result.query
                            query_dict = urlparse.parse_qs(query)
                            coverage_id = query_dict['coverageid'][0]
                            version = query_dict['version'][0]
                        if coverage_id and version:
                            break
                    record.service_id = coverage_id
                    record.service_version = version
                elif record.type == 'WFS':
                    # find the feature typename from references
                    typename = None
                    version = None
                    for ref in record.references:
                        if 'service=WFS' in ref['url']:
                            url = ref['url']
                            parse_result = urlparse.urlparse(url)
                            query = parse_result.query
                            query_dict = urlparse.parse_qs(query)
                            typename = query_dict['typename'][0]
                            version = query_dict['version'][0]
                        if typename and version:
                            break
                    record.service_id = typename
                    record.service_version = version
                # wcs = WebCoverageService(record.endpoint)
                # result = wcs.getDescribeCoverage(coverage_id)
                context = {'record': record}
                return render(request,
                              'geosafe/metasearch/modal/add_layer.html',
                              context)
        except:
            # NOTE(review): bare except hides every error, including the
            # AttributeError raised when no matching record was found.
            return HttpResponseServerError()
    return HttpResponseServerError()
def main():
    """Harvest updated ISO metadata records from the open.canada.ca CSW.

    Connection and proxy settings come from config/harvester.ini when it
    exists; each harvested page of raw GetRecords XML is printed to stdout.
    """
    # Connection variables
    csw_url = 'csw.open.canada.ca/geonetwork/srv/csw'
    csw_user = None
    csw_passwd = None
    proxy_protocol = None
    proxy_url = None
    proxy_user = None
    proxy_passwd = None

    # Or read from a .ini file
    harvester_file = 'config/harvester.ini'
    if os.path.isfile(harvester_file):
        from ConfigParser import ConfigParser
        ini_config = ConfigParser()
        ini_config.read(harvester_file)
        csw_url = ini_config.get(
            'csw', 'url')
        if ini_config.has_option('csw', 'username'):
            csw_user = ini_config.get(
                'csw', 'username')
            csw_passwd = ini_config.get(
                'csw', 'password')
        proxy_protocol = ini_config.get(
            'proxy', 'protocol')
        proxy_url = ini_config.get(
            'proxy', 'url')
        if ini_config.has_option('proxy', 'username'):
            proxy_user = ini_config.get(
                'proxy', 'username')
            proxy_passwd = ini_config.get(
                'proxy', 'password')
        records_per_request = int(ini_config.get(
            'processing', 'records_per_request'))
        start_date = ini_config.get('processing', 'start_date')

    # If your supplying a proxy
    if proxy_url:
        # And your using authentication
        if proxy_user and proxy_passwd:
            password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(
                None, proxy_url, proxy_user, proxy_passwd)
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler(password_mgr)
        # or even if your not
        else:
            proxy_auth_handler = urllib2.ProxyHandler(
                {proxy_protocol: proxy_url})
        opener = urllib2.build_opener(proxy_auth_handler)
        urllib2.install_opener(opener)

    # Fetch the data
    # csw = CatalogueServiceWeb(
    #     'http://*****:*****@csw_url/geonetwork/srv/csw')
    if csw_user and csw_passwd:
        csw = CatalogueServiceWeb(
            'http://'+csw_url,
            username=csw_user,
            password=csw_passwd,
            timeout=20)
    else:
        csw = CatalogueServiceWeb('http://'+csw_url, timeout=20)

    # Hand-built GetRecords request: full ISO records modified on or after
    # the %s start date, paged via maxRecords/startPosition.
    request_template = """<?xml version="1.0"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    service="CSW"
    version="2.0.2"
    resultType="results"
    outputSchema="csw:IsoRecord"
    maxRecords="%d"
    startPosition="%d"
>
    <csw:Query typeNames="gmd:MD_Metadata">
        <csw:ElementSetName>full</csw:ElementSetName>
        <csw:Constraint version="1.1.0">
            <Filter
                xmlns="http://www.opengis.net/ogc"
                xmlns:gml="http://www.opengis.net/gml"
            >
                <PropertyIsGreaterThanOrEqualTo>
                    <PropertyName>Modified</PropertyName>
                    <Literal>%s</Literal>
                </PropertyIsGreaterThanOrEqualTo>
            </Filter>
        </csw:Constraint>
    </csw:Query>
</csw:GetRecords>
"""

    # Is there a specified start date
    if arguments['-f']:
        start_date = arguments['-f']

    active_page = 0
    next_record = 1
    request_another = True

    while request_another:
        request_another = False

        # Filter records into latest updates
        #
        # Sorry Tom K., we'll be more modern ASAWC.
        # For now it's good ol' Kitchen Sink
        #
        # from owslib.fes import PropertyIsGreaterThanOrEqualTo
        # modified = PropertyIsGreaterThanOrEqualTo(
        #     'apiso:Modified',
        #     '2015-04-04'
        # )
        # csw.getrecords2(constraints=[modified])
        #
        # Kitchen Sink is the valid HNAP, we need HNAP for R1 to debug issues
        # This filter was supplied by EC, the CSW service technical lead
        current_request = request_template % (
            records_per_request,
            next_record,
            start_date
        )
        # (active_page*records_per_request)+1
        csw.getrecords2(format='xml', xml=current_request)
        active_page += 1

        # Identify if we need to continue this.
        records_root = ("/csw:GetRecordsResponse")
        # Read the file, should be a streamed input in the future
        root = etree.XML(csw.response)
        # Parse the root and itterate over each record
        records = fetchXMLArray(root, records_root)
        timestamp = fetchXMLAttribute(
            records[0], "csw:SearchStatus", "timestamp")[0]
        number_of_records_matched = int(fetchXMLAttribute(
            records[0], "csw:SearchResults", "numberOfRecordsMatched")[0]
        )
        number_of_records_returned = int(fetchXMLAttribute(
            records[0], "csw:SearchResults", "numberOfRecordsReturned")[0]
        )
        next_record = int(fetchXMLAttribute(
            records[0], "csw:SearchResults", "nextRecord")[0]
        )
        # nextRecord == 0 or past the match count means the server is done.
        if next_record > number_of_records_matched or next_record == 0:
            pass
        else:
            request_another = True

        # When we move to Tom K's filter we can use results in an R2 unified
        # harvester
        # print csw.results
        # for rec in csw.records:
        #     print '* '+csw.records[rec].title
        # Till then we need to collect and dump the response from the CSW
        # No use minimizing the XML to try to create a XML Lines file as the
        # data has carriage returns.
        # parser = etree.XMLParser(remove_blank_text=True)
        # elem = etree.XML(csw.response, parser=parser)
        # print etree.tostring(elem)
        # Output the harvested page
        print csw.response
# ## search NODC geoportal # In[2]: #endpoint = 'http://data.nodc.noaa.gov/geoportal/csw' endpoint = 'http://www.nodc.noaa.gov/geoportal/csw' val = 'wms' filter2 = fes.PropertyIsLike(propertyname='apiso:ServiceType',literal=('*%s*' % val), escapeChar='\\',wildCard='*',singleChar='?') csw = CatalogueServiceWeb(endpoint,timeout=60) filter_list = [filter2] csw.getrecords2(constraints=filter_list, maxrecords=1000) print(len(csw.records.keys())) choice=np.random.choice(list(csw.records.keys())) print(csw.records[choice].title) csw.records[choice].references # In[3]: csw.request # ## search geoport pycsw # In[5]:
class CSWUtils(object):
    '''
    CSW query utilities
    '''
    DEFAULT_CSW_URL = 'http://ecat.ga.gov.au/geonetwork/srv/eng/csw'  # GA's externally-facing eCat
    DEFAULT_TIMEOUT = 30  # Timeout in seconds
    DEFAULT_CRS = 'EPSG:4326'  # Unprojected WGS84
    DEFAULT_MAXQUERYRECORDS = 100  # Retrieve only this many datasets per CSW query
    DEFAULT_MAXTOTALRECORDS = 2000  # Maximum total number of records to retrieve

    def __init__(self, csw_url=None, timeout=None, debug=False):
        '''
        Constructor for CSWUtils class
        @param csw_url: URL for CSW service. Defaults to value of CSWUtils.DEFAULT_CSW_URL
        @param timeout: Timeout in seconds. Defaults to value of CSWUtils.DEFAULT_TIMEOUT
        '''
        csw_url = csw_url or CSWUtils.DEFAULT_CSW_URL
        timeout = timeout or CSWUtils.DEFAULT_TIMEOUT
        self.debug = debug
        self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)

    def list_from_comma_separated_string(self, comma_separated_string):
        '''
        Helper function to return list of strings from a comma-separated string
        Substitute single-character wildcard for whitespace characters
        @param comma_separated_string: comma-separated string
        @return: list of strings
        '''
        return [re.sub('(\s)', '_', keyword.strip())
                for keyword in comma_separated_string.split(',')]

    # A helper function for date range filtering
    def get_date_filter(self, start_datetime=None, stop_datetime=None, constraint='overlaps'):
        '''
        Helper function to return a list containing a pair of FES filters for a date range
        @param start_datetime: datetime object for start of time period to search
        @param stop_datetime: datetime object for end of time period to search
        @param constraint: string value of either 'overlaps' or 'within' to indicate type of temporal search
        @return: list containing a pair of FES filters for a date range
        '''
        if start_datetime:
            start_date_string = start_datetime.date().isoformat()
        else:
            start_date_string = '1900-01-01'  # Distant past

        # Fix: the original tested start_datetime here, so a stop date was
        # ignored (or crashed) whenever only one of the two was supplied.
        if stop_datetime:
            stop_date_string = stop_datetime.date().isoformat()
        else:
            stop_date_string = '2100-01-01'  # Distant future

        if constraint == 'overlaps':
            start_filter = fes.PropertyIsLessThanOrEqualTo(propertyname='ows100:TempExtent_begin', literal=stop_date_string)
            stop_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname='ows100:TempExtent_end', literal=start_date_string)
        elif constraint == 'within':
            start_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname='ows100:TempExtent_begin', literal=start_date_string)
            stop_filter = fes.PropertyIsLessThanOrEqualTo(propertyname='ows100:TempExtent_end', literal=stop_date_string)

        return [start_filter, stop_filter]

    def any_case_match_filters(self, propertyname, word_list):
        '''
        Helper function to return and/or filter from a word list for partial case insensitivity
        Resolves issue where "matchCase=False" is ignored
        @param propertyname: String denoting CSW property to search
        @param word_list: List of strings for (partially) case-insensitive "and" search
        @return: List of FES filters expressing "and" query
        '''
        filter_list = []
        for word in word_list:
            word_variant_set = set([word, word.upper(), word.lower(), word.title()])
            if len(word_variant_set) == 1:  # Use single filter if no "or" required
                filter_list.append(fes.PropertyIsLike(propertyname=propertyname,
                                                      literal=word_variant_set.pop(),
                                                      matchCase=False))
            else:
                filter_list.append(fes.Or([fes.PropertyIsLike(propertyname=propertyname,
                                                              literal=word_variant,
                                                              matchCase=False)
                                           for word_variant in set([word, word.upper(), word.lower(), word.title()])
                                           ]
                                          )
                                   )
        return filter_list

    def get_csw_records(self, fes_filters, max_query_records=None, max_total_records=None):
        '''
        Generator yielding nested dicts containing information about each CSW query result record
        including distributions
        @param fes_filters: List of fes filters to apply to CSW query
        @param max_query_records: Maximum number of records to return per CSW query.
            Defaults to value of CSWUtils.DEFAULT_MAXQUERYRECORDS
        @param max_total_records: Maximum total number of records to return.
            Defaults to value of CSWUtils.DEFAULT_MAXTOTALRECORDS
        '''
        max_query_records = max_query_records or CSWUtils.DEFAULT_MAXQUERYRECORDS
        max_total_records = max_total_records or CSWUtils.DEFAULT_MAXTOTALRECORDS

        startposition = 1  # N.B: This is 1-based, not 0-based
        record_count = 0

        # Keep querying until all results have been retrieved
        while record_count < max_total_records:
            # apply all the filters using the "and" syntax: [[filter1, filter2]]
            self.csw.getrecords2(constraints=[fes_filters],
                                 esn='full',
                                 outputschema='http://www.isotc211.org/2005/gmd',
                                 maxrecords=max_query_records,
                                 startposition=startposition)

            if self.debug:
                # print() call form is behaviour-identical under py2/py3.
                print('CSW request:\n%s' % self.csw.request)
                print('CSW response:\n %s' % self.csw.response)

            query_record_count = len(self.csw.records)

            for uuid in self.csw.records.keys():
                record = self.csw.records[uuid]
                record_count += 1
                yield uuid, record.xml
                # print '%d distribution(s) found for "%s"' % (len(info_list), title)

                if record_count >= max_total_records:  # Don't go around again for another query - maximum retrieved
                    # Fix: the format placeholder was '$d', which never
                    # interpolated the count into the message.
                    # NOTE(review): raising here aborts the generator once
                    # the cap is hit — confirm callers expect an exception
                    # rather than a clean stop.
                    raise Exception('Maximum number of records retrieved (%d)' % max_total_records)

            if query_record_count < max_query_records:  # Don't go around again for another query - should be the end
                break

            # Increment start position and repeat query
            startposition += max_query_records

        # print('%d records found.' % record_count)

    def query_csw(self, keyword_list=None, bounding_box=None, bounding_box_crs=None,
                  anytext_list=None, titleword_list=None, start_datetime=None,
                  stop_datetime=None, max_total_records=None):
        '''
        Function to query CSW using AND combination of provided search parameters and return
        generator object yielding nested dicts containing information about each record including distributions
        @param keyword_list: List of strings or comma-separated string containing keyword search terms
        @param bounding_box: Bounding box to search as a list of ordinates [bbox.minx, bbox.minx, bbox.maxx, bbox.maxy]
        @param bounding_box_crs: Coordinate reference system for bounding box. Defaults to value of CSWUtils.DEFAULT_CRS
        @param anytext_list: List of strings or comma-separated string containing any text search terms
        @param titleword: List of strings or comma-separated string containing title search terms
        @param start_datetime: Datetime object defining start of temporal search period
        @param stop_datetime: Datetime object defining end of temporal search period
        @param max_total_records: Maximum total number of records to return.
            Defaults to value of CSWUtils.DEFAULT_MAXTOTALRECORDS
        @return: generator object yielding nested dicts containing information about each record including distributions
        '''
        bounding_box_crs = bounding_box_crs or CSWUtils.DEFAULT_CRS

        # Convert strings to lists if required
        if type(keyword_list) == str:
            keyword_list = self.list_from_comma_separated_string(keyword_list)
        if type(anytext_list) == str:
            anytext_list = self.list_from_comma_separated_string(anytext_list)
        if type(titleword_list) == str:
            titleword_list = self.list_from_comma_separated_string(titleword_list)

        # Build filter list
        fes_filter_list = []
        # Check for unchanged, upper-case, lower-case and capitalised keywords
        if keyword_list:
            fes_filter_list += self.any_case_match_filters('Subject', keyword_list)
        if anytext_list:
            fes_filter_list += [fes.PropertyIsLike(propertyname='anyText', literal=phrase, matchCase=False)
                                for phrase in anytext_list]
        if start_datetime or stop_datetime:
            fes_filter_list += self.get_date_filter(start_datetime, stop_datetime)
        if titleword_list:
            fes_filter_list += [fes.PropertyIsLike(propertyname='title', literal=titleword, matchCase=False)
                                for titleword in titleword_list]
        if bounding_box:
            fes_filter_list += [fes.BBox(bounding_box, crs=bounding_box_crs)]

        assert fes_filter_list, 'No search criteria defined'

        # Use single filter if no "and" required
        if len(fes_filter_list) == 1:
            fes_filter_list = fes_filter_list[0]

        # Return generator object
        return self.get_csw_records(fes_filter_list, max_total_records=max_total_records)
def mdcache(self, cswurl, constraints=[], maxrecords=10, maxharvest=20): """Fills the cache from a csw""" # cache directory for this csw if cswurl in self.cswlist: cswsig = self.cswlist[cswurl]['cswsig'] cswpath = os.path.join(self.cachepath, cswsig) else: cswsig = hashlib.md5(cswurl).hexdigest() logging.info('%s : new signature %s' % (cswurl, cswsig)) cswpath = os.path.join(self.cachepath, cswsig) if not(os.path.isdir(cswpath)): os.makedirs(cswpath) logging.info('%s %s created' % (cswurl, cswpath)) manifest = { 'cswsig': cswsig, 'cswurl': cswurl, 'domains': { 'organisationName': self.mdPropertyValues(cswurl, 'organisationName').values()[0], 'Subject': self.mdPropertyValues(cswurl, 'Subject').values()[0] } } f = open(os.path.join(cswpath, self.MANIFEST), 'w') f.write(json.dumps(manifest)) f.close() logging.info('loading max %s md from %s' % (maxharvest, cswurl)) csw = CatalogueServiceWeb(cswurl, skip_caps=True) first = True nextrecord = 0 count = 0 while True: if not first: nextrecord = csw.results['nextrecord'] if count + maxrecords > maxharvest: maxrecords = maxharvest - count csw.getrecords2( esn='full', constraints=constraints, startposition=nextrecord, maxrecords=maxrecords, outputschema=self.OUTPUTSCHEMA) if csw.results['matches'] == 0: logging.info('0 md found from %s' % cswurl) break else: first = False # fetch records for rec_id, rec in csw.records.iteritems(): count += 1 logging.info(str(int(float(count)/min(maxharvest, csw.results['matches'])*100))+'%') filename = os.path.join(cswpath, rec_id) os.path.join(filename) f = open(filename, 'w') f.write(rec.xml) f.close() # break if no records, beyond maxrecords or matches if csw.results['nextrecord'] == 0 \ or csw.results['returned'] == 0 \ or csw.results['nextrecord'] > csw.results['matches'] \ or csw.results['nextrecord'] > maxharvest: logging.info('%s md loaded from %s' % (count, cswurl)) break return cswpath
# <markdowncell> # #### Query CSW Servers using filters # <codecell> titles = [] urls = [] service_types = [] servers = [] for url in bbox_endpoints: print "*", url try: csw = CatalogueServiceWeb(url, timeout=20) csw.getrecords2(constraints=[filters], maxrecords=200, esn='full') for record, item in csw.records.items(): try: # Get title if len(item.title) > 80: title = "{!s}...{!s}".format(item.title[0:40], item.title[-40:]) else: title = item.title service_url, scheme = next(((d['url'], d['scheme']) for d in item.references), None) if service_url: print " [x] {!s}".format(title) titles.append(item.title) urls.append(service_url) service_types.append(scheme) servers.append(url) except:
def _parse_csw(context, repos, record, identifier, pagesize=10): from owslib.csw import CatalogueServiceWeb recobjs = [] # records serviceobj = repos.dataset() # if init raises error, this might not be a CSW md = CatalogueServiceWeb(record) LOGGER.debug('Setting CSW service metadata') # generate record of service instance _set(context, serviceobj, 'pycsw:Identifier', identifier) _set(context, serviceobj, 'pycsw:Typename', 'csw:Record') _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2') _set(context, serviceobj, 'pycsw:MdSource', record) _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now()) _set(context, serviceobj, 'pycsw:XML', md.response) _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml)) _set(context, serviceobj, 'pycsw:Type', 'service') _set(context, serviceobj, 'pycsw:Title', md.identification.title) _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract) _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords)) _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name) _set(context, serviceobj, 'pycsw:Publisher', md.provider.name) _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name) _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name) _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints) _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees) _set(context, serviceobj, 'pycsw:Source', record) _set(context, serviceobj, 'pycsw:Format', md.identification.type) _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type) _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version) _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations])) _set(context, serviceobj, 'pycsw:CouplingType', 'tight') links = [ '%s,OGC-CSW Catalogue Service for the Web,OGC:CSW,%s' % (identifier, md.url) ] 
_set(context, serviceobj, 'pycsw:Links', '^'.join(links)) recobjs.append(serviceobj) # get all supported typenames of metadata # so we can harvest the entire CSW # try for ISO, settle for Dublin Core csw_typenames = 'csw:Record' csw_outputschema = 'http://www.opengis.net/cat/csw/2.0.2' grop = md.get_operation_by_name('GetRecords') if all(['gmd:MD_Metadata' in grop.parameters['typeNames']['values'], 'http://www.isotc211.org/2005/gmd' in grop.parameters['outputSchema']['values']]): LOGGER.info('CSW supports ISO') csw_typenames = 'gmd:MD_Metadata' csw_outputschema = 'http://www.isotc211.org/2005/gmd' # now get all records # get total number of records to loop against try: md.getrecords2(typenames=csw_typenames, resulttype='hits', outputschema=csw_outputschema) matches = md.results['matches'] except: # this is a CSW, but server rejects query raise RuntimeError(md.response) if pagesize > matches: pagesize = matches LOGGER.debug('Harvesting %d CSW records' % matches) # loop over all catalogue records incrementally for r in range(1, matches, pagesize): try: md.getrecords2(typenames=csw_typenames, startposition=r, maxrecords=pagesize, outputschema=csw_outputschema) except Exception, err: # this is a CSW, but server rejects query raise RuntimeError(md.response) for k, v in md.records.iteritems(): if csw_typenames == 'gmd:MD_Metadata': recobjs.append(_parse_iso(context, repos, etree.fromstring(v.xml))) else: recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))
def test_csw_geonetwork(): c = CatalogueServiceWeb(SERVICE_URL) c.getrecords2(typenames='csw:Record') assert c.results.get('returned') > 0 assert c.results.get('nextrecord') > 0 assert c.results.get('matches') > 0
from owslib.fes import PropertyIsLike from owslib.csw import CatalogueServiceWeb csw = CatalogueServiceWeb('http://geoland2.ipma.pt/csw') query_like = PropertyIsLike('dc:title', 'Global Land Surface Temperature %') csw.getrecords2(constraints=[query_like]) csw.results['matches']
class CSWConnexion(object):
    """Wraps a discovery CSW endpoint and an (optionally authenticated)
    publication CSW endpoint of the same catalogue (GeoNetwork-style).

    ``csw_discovery`` is used for searching, ``csw_publication`` for
    transactional updates; either may be None if the connection failed.
    """

    def __init__(self, csw_discovery_url, csw_publication_url, gn_username=None, gn_password=None):
        # TODO: make parameters for username and password
        # Pre-built query string for a CQL GetRecordById returning full ISO
        # (gmd) XML; the 'id' argument is appended in getrecordxml().
        self.getrecordbyidparam = '?request=GetRecordById&service=CSW&version=2.0.2&namespace=xmlns%28csw=http://www.opengis.net/cat/csw%29&resultType=results&outputSchema=http://www.isotc211.org/2005/gmd&outputFormat=application/xml&typeNames=csw:Record&elementSetName=full&constraintLanguage=CQL_TEXT&constraint_language_version=1.1.0'
        self.csw_discovery_url = csw_discovery_url
        self.csw_publication_url = csw_publication_url
        try:
            if gn_username is not None and gn_password is not None:
                self.csw_publication = CatalogueServiceWeb(
                    self.csw_publication_url,
                    username=gn_username,
                    password=gn_password)
                self.csw_discovery = CatalogueServiceWeb(
                    self.csw_discovery_url,
                    username=gn_username,
                    password=gn_password)
        except ServiceException:
            self.csw_publication = None
            self.csw_discovery = None
        # NOTE(review): this second attempt unconditionally overwrites the
        # authenticated csw_discovery created above with an anonymous
        # connection — confirm that is intended.  Also, when credentials are
        # not supplied, csw_publication is never assigned at all (potential
        # AttributeError in updaterecord) — verify against callers.
        try:
            self.csw_discovery = CatalogueServiceWeb(self.csw_discovery_url)
        except ServiceException:
            self.csw_discovery = None

    def get_record_list(
            self, keywords=[], typerecord=[]
    ):
        # By default the types and keywords to select are empty --> all
        # records are selected.
        # NOTE(review): mutable default arguments (read-only here, harmless);
        # the loop variable 'type' shadows the builtin.
        from owslib.fes import PropertyIsEqualTo
        constraints = []
        if len(typerecord) > 0:
            for type in typerecord:
                print(type)
                constraints.append(PropertyIsEqualTo('dc:type', type))
        if len(keywords) > 0:
            for keyword in keywords:
                # match the keyword both as Dublin Core subject and as the
                # 'keyword' queryable
                constraints.append(PropertyIsEqualTo('dc:subject', keyword))
                constraints.append(PropertyIsEqualTo('keyword', keyword))
        # NOTE(review): a flat constraint list is OR-combined by owslib —
        # confirm OR (not AND) is the intended semantics here.
        self.csw_discovery.getrecords2(constraints=constraints,
                                       maxrecords=100000)
        print(self.csw_discovery.results)
        # Side effect: stores the matching identifiers on the instance.
        self.identifiers = []
        for rec in self.csw_discovery.records:
            self.identifiers.append(self.csw_discovery.records[rec].identifier)

    def updaterecord(self, id, input_xml):
        # Push an updated ISO metadata record through a CSW transaction.
        # 'id' parameter name shadows the builtin (kept for caller
        # compatibility).
        if self.csw_publication is not None:
            # accept either an lxml element or serialized XML
            if isinstance(input_xml, etree._Element):
                input_xml = etree.tostring(input_xml)
            self.csw_publication.transaction(ttype='update',
                                             typename='gmd:MD_Metadata',
                                             record=input_xml,
                                             identifier=id)
            print("updated record " + id)
        else:
            print(
                "Could not perform operation as password-protected CSW is unreachable."
            )

    def getrecordxml(self, id):
        """ fetches the metadata where the fileIdentifier is equal to id """
        # Build the GetRecordById URL with furl and fetch it directly over
        # HTTP (bypassing owslib), then return the first child of the
        # response document (the gmd:MD_Metadata element).
        f = furl(self.csw_discovery_url + self.getrecordbyidparam)
        f.args['id'] = id
        r = http.request('GET', f.url)
        publishedrecord = r.data
        #publishedrecord=urlopen(f.url)
        tree = etree.fromstring(publishedrecord)
        #tree=etree.parse(str(publishedrecord))
        root = tree[0]
        return root
def test_GetRecords_dataset(self): csw = CatalogueServiceWeb(service) constraints = [PropertyIsEqualTo("dc:type", "dataset")] csw.getrecords2(constraints=constraints, outputschema=GMD, startposition=1, maxrecords=5) nrecords = len(csw.records)
def test_GetRecords_summary(self): csw = CatalogueServiceWeb(service) csw.getrecords2(outputschema=GMD, startposition=1, maxrecords=5, esn="summary") nrecords = len(csw.records)
# <codecell> endpoint = 'http://www.nodc.noaa.gov/geoportal/csw' # NODC/UAF Geoportal: granule level csw = CatalogueServiceWeb(endpoint, timeout=60) print csw.version # <codecell> for oper in csw.operations: if oper.name == 'GetRecords': print '\nISO Queryables:\n', oper.constraints[ 'SupportedISOQueryables']['values'] # <codecell> csw.getrecords2(constraints=filt, maxrecords=1000, esn='full') len(csw.records.keys()) # <codecell> dap_urls = service_urls( csw.records, service_string='urn:x-esri:specification:ServiceType:OPeNDAP') len(dap_urls) # <codecell> bad_urls = [] for url in dap_urls: try: nc = netCDF4.Dataset(url) #print url
def getDataSetURI(anyText, CSWURL, BBox): """ Searches a given CSW server and returns metadata content for the datasets found. Arguments --------- - anyText - A string that will be submitted to the CSW search. (Optional, default is empty which will return all records.) - CSWURL - A base URL for the CSW server to be searched. (Optional, defaults to the CDIA/GDP CSW server.) - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy] that will be used to limit results to datasets that atleast partially intersect. (Optional) """ csw = CatalogueServiceWeb(CSWURL, skip_caps=True) # FIXME: we should allow for "real" multiple keywords, # or change the API of anyText if that if that does not make sense in pygdp. # If the former we need `fes.And`, if the latter we need to not listfy `anyText`. if not anyText: constraints = [] else: constraints = [ fes.PropertyIsLike(propertyname='csw:AnyText', literal=literal) for literal in anyText ] csw.getrecords2(constraints=constraints, outputschema='http://www.isotc211.org/2005/gmd', esn='full', maxrecords=100) dataset_uris = [['title', 'abstract', ['urls']]] for rec in csw.records: title = csw.records[rec].identification.title abstract = csw.records[rec].identification.abstract urls = [] try: for onlineresource in range( len(csw.records[rec].distribution.online)): urls.append( csw.records[rec].distribution.online[onlineresource].url) except AttributeError: pass for ident in range(len(csw.records[rec].identificationinfo)): try: for operation in range( len(csw.records[rec].identificationinfo[ident]. operations)): urls.append(csw.records[rec].identificationinfo[ident]. operations[0]['connectpoint'][0].url) except AttributeError: pass entry = [title, abstract, urls] dataset_uris.append(entry) for i, dataset in enumerate(dataset_uris): dataset_uris[i][2] = [ uri.replace("https", "dods").replace("http", "dods") if "/dodsC/" in uri else uri for uri in dataset[2] ] return dataset_uris
class CSWHarvester(Harvester): """ OWSLib csw: https://geopython.github.io/OWSLib/#csw """ def __init__(self, community, url, schema, fromdate, clean, limit, outdir, verify): super().__init__(community, url, fromdate, clean, limit, outdir, verify) logging.captureWarnings(True) self.csw = CatalogueServiceWeb(self.url, auth=Authentication(verify=self.verify)) self._schema_type = schema self._schema = None self._constraints = None def identifier(self, record): return record.identifier def matches(self): self.csw.getrecords2(maxrecords=0, constraints=self.constraints, outputschema=self.schema) return self.csw.results['matches'] @property def schema(self): if not self._schema: if self._schema_type == SchemaType.ISO19139: ns_name = 'gmd' else: ns_name = 'csw' self._schema = Namespaces().get_namespace(ns_name) return self._schema @property def constraints(self): if not self._constraints: self._constraints = [] if self.fromdate: from_filter = fes.PropertyIsGreaterThan( 'dct:modified', self.fromdate) self._constraints.append(from_filter) return self._constraints def get_records(self): """get_records implementation with paging""" startposition = 0 ok = True while ok: self.csw.getrecords2(constraints=self.constraints, startposition=startposition, esn='full', outputschema=self.schema, maxrecords=20) startposition = self.csw.results['nextrecord'] if startposition == 0: ok = False for rec in self.csw.records: yield self.csw.records[rec] def _write_record(self, fp, record, pretty_print=True): xml = etree.tostring(etree.fromstring(record.xml), pretty_print=pretty_print).decode('utf8') fp.write(xml)
class CSW202Search(SearchBase):
    """Search backend for CSW 2.0.2 catalogues, built on OWSLib.

    Holds a live connection in ``self.conn`` and mirrors the raw
    request/response of the most recent exchange on the instance.
    """

    def __init__(self, url, timeout, username, password, auth):
        super().__init__(url, timeout, username, password, auth)

        self.type = CATALOG_TYPES[0]
        self.format = 'xml'
        self.service_info_template = 'csw_service_metadata.html'
        self.record_info_template = 'record_metadata_dc.html'
        self.constraints = []

        self.conn = CatalogueServiceWeb(
            self.url,  # spellok
            timeout=self.timeout,
            username=self.username,
            password=self.password,
            auth=self.auth)
        # expose the capabilities exchange for display/debugging
        self.request = self.conn.request
        self.response = self.conn.response

    def query_records(self, bbox=[], keywords=None, limit=10, offset=1):
        """Issue a GetRecords query filtered by bbox and/or keywords.

        Results are not returned directly: matches/returned counts and the
        raw request/response are stored on the instance; parsed records are
        available via records().
        NOTE(review): mutable default for ``bbox`` — read-only here.
        """
        self.constraints = []

        # only apply spatial filter if bbox is not global
        # even for a global bbox, if a spatial filter is applied, then
        # the CSW server will skip records without a bbox
        if bbox and bbox != ['-180', '-90', '180', '90']:
            minx, miny, maxx, maxy = bbox
            # axis order is flipped to y,x for urn EPSG:4326
            self.constraints.append(
                BBox([miny, minx, maxy, maxx],
                     crs='urn:ogc:def:crs:EPSG::4326'))

        # keywords
        if keywords:
            # TODO: handle multiple word searches
            self.constraints.append(PropertyIsLike('csw:AnyText', keywords))

        if len(self.constraints) > 1:  # exclusive search (a && b)
            # owslib treats a nested list as an AND of its members
            self.constraints = [self.constraints]

        self.conn.getrecords2(constraints=self.constraints,
                              maxrecords=limit,
                              startposition=offset,
                              esn='full')

        self.matches = self.conn.results['matches']
        self.returned = self.conn.results['returned']

        self.request = self.conn.request
        self.response = self.conn.response

    def records(self):
        """Return the last query's records as plain dicts
        (identifier/type/title/bbox/links)."""
        recs = []

        for record in self.conn.records:
            rec = {'identifier': None, 'type': None,
                   'title': None, 'bbox': None}
            if self.conn.records[record].identifier:
                rec['identifier'] = self.conn.records[record].identifier
            if self.conn.records[record].type:
                rec['type'] = self.conn.records[record].type
            if self.conn.records[record].title:
                rec['title'] = self.conn.records[record].title
            if self.conn.records[record].bbox:
                rec['bbox'] = bbox_list_to_dict(
                    self.conn.records[record].bbox)
            # both dc:URI entries and dct:references count as links
            rec['links'] = (self.conn.records[record].uris +
                            self.conn.records[record].references)
            recs.append(rec)
        return recs

    def get_record(self, identifier):
        """Fetch a single record by identifier via GetRecordById."""
        self.conn.getrecordbyid([identifier])
        return self.conn.records[identifier]
def mdcache(self, cswurl, constraints=[], maxrecords=10, maxharvest=20): """Fills the cache from a csw""" # cache directory for this csw if cswurl in self.cswlist: cswsig = self.cswlist[cswurl]["cswsig"] cswpath = os.path.join(self.cachepath, cswsig) else: cswsig = hashlib.md5(cswurl).hexdigest() logging.info("%s : new signature %s" % (cswurl, cswsig)) cswpath = os.path.join(self.cachepath, cswsig) if not (os.path.isdir(cswpath)): os.makedirs(cswpath) logging.info("%s %s created" % (cswurl, cswpath)) manifest = { "cswsig": cswsig, "cswurl": cswurl, "domains": { "organisationName": self.mdPropertyValues(cswurl, "organisationName").values()[0], "Subject": self.mdPropertyValues(cswurl, "Subject").values()[0], }, } f = open(os.path.join(cswpath, self.MANIFEST), "w") f.write(json.dumps(manifest)) f.close() logging.info("loading max %s md from %s" % (maxharvest, cswurl)) csw = CatalogueServiceWeb(cswurl, skip_caps=True) first = True nextrecord = 0 count = 0 while True: if not first: nextrecord = csw.results["nextrecord"] if count + maxrecords > maxharvest: maxrecords = maxharvest - count csw.getrecords2( esn="full", constraints=constraints, startposition=nextrecord, maxrecords=maxrecords, outputschema=self.OUTPUTSCHEMA, ) if csw.results["matches"] == 0: logging.info("0 md found from %s" % cswurl) break else: first = False # fetch records for rec_id, rec in csw.records.iteritems(): count += 1 logging.info(str(int(float(count) / min(maxharvest, csw.results["matches"]) * 100)) + "%") filename = os.path.join(cswpath, rec_id) os.path.join(filename) f = open(filename, "w") f.write(rec.xml) f.close() # break if no records, beyond maxrecords or matches if ( csw.results["nextrecord"] == 0 or csw.results["returned"] == 0 or csw.results["nextrecord"] > csw.results["matches"] or csw.results["nextrecord"] > maxharvest ): logging.info("%s md loaded from %s" % (count, cswurl)) break return cswpath
def main():
    """Harvest ISO metadata pages from the Open Canada GeoNetwork CSW.

    Python 2 script entry point: reads optional connection/proxy settings
    from config/harvester.ini, then issues raw HNAP GetRecords XML requests
    (filtered on Modified >= start_date) page by page and prints each raw
    response to stdout.

    NOTE(review): ``start_date`` is only bound when the ini file provides
    [processing] start_date or the -f argument is given; otherwise its first
    use below raises NameError.  ``arguments`` is presumably a module-level
    docopt-style dict — confirm.
    """
    # Connection variables (defaults; may be overridden by harvester.ini)
    csw_url = 'csw.open.canada.ca/geonetwork/srv/csw'
    csw_user = None
    csw_passwd = None
    proxy_protocol = None
    proxy_url = None
    proxy_user = None
    proxy_passwd = None
    records_per_request = 10

    # Or read from a .ini file
    harvester_file = 'config/harvester.ini'
    if os.path.isfile(harvester_file):
        from ConfigParser import ConfigParser  # Python 2 stdlib name
        ini_config = ConfigParser()
        ini_config.read(harvester_file)
        csw_url = ini_config.get('csw', 'url')
        # Get configuration options
        if ini_config.has_option('csw', 'username'):
            csw_user = ini_config.get('csw', 'username')
            csw_passwd = ini_config.get('csw', 'password')
        if ini_config.has_option('proxy', 'protocol'):
            proxy_protocol = ini_config.get('proxy', 'protocol')
        if ini_config.has_option('proxy', 'url'):
            proxy_url = ini_config.get('proxy', 'url')
        if ini_config.has_option('proxy', 'username'):
            proxy_user = ini_config.get('proxy', 'username')
            proxy_passwd = ini_config.get('proxy', 'password')
        if ini_config.has_option('processing', 'records_per_request'):
            records_per_request = int(
                ini_config.get('processing', 'records_per_request'))
        if ini_config.has_option('processing', 'start_date'):
            start_date = ini_config.get('processing', 'start_date')

    # If your supplying a proxy
    if proxy_url:
        # And your using authentication
        if proxy_user and proxy_passwd:
            password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(None, proxy_url, proxy_user,
                                      proxy_passwd)
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler(password_mgr)
        # or even if your not
        else:
            proxy_auth_handler = urllib2.ProxyHandler(
                {proxy_protocol: proxy_url})
        # install globally so owslib's urllib2 calls go through the proxy
        opener = urllib2.build_opener(proxy_auth_handler)
        urllib2.install_opener(opener)

    # Fetch the data
    # csw = CatalogueServiceWeb(
    #     'http://*****:*****@csw_url/geonetwork/srv/csw')
    if csw_user and csw_passwd:
        csw = CatalogueServiceWeb('http://' + csw_url,
                                  username=csw_user,
                                  password=csw_passwd,
                                  timeout=20)
    else:
        csw = CatalogueServiceWeb('http://' + csw_url, timeout=20)

    # Raw GetRecords request template; %d/%d/%s are filled with
    # (records_per_request, next_record, start_date) on each page.
    request_template = """<?xml version="1.0"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    service="CSW"
    version="2.0.2"
    resultType="results"
    outputSchema="csw:IsoRecord"
    maxRecords="%d"
    startPosition="%d"
>
    <csw:Query typeNames="gmd:MD_Metadata">
        <csw:ElementSetName>full</csw:ElementSetName>
        <csw:Constraint version="1.1.0">
            <Filter xmlns="http://www.opengis.net/ogc"
                    xmlns:gml="http://www.opengis.net/gml">
                <PropertyIsGreaterThanOrEqualTo>
                    <PropertyName>Modified</PropertyName>
                    <Literal>%s</Literal>
                </PropertyIsGreaterThanOrEqualTo>
            </Filter>
        </csw:Constraint>
    </csw:Query>
</csw:GetRecords>
"""

    # Is there a specified start date
    if arguments['-f']:
        start_date = arguments['-f']

    active_page = 0
    next_record = 1
    request_another = True

    while request_another:
        request_another = False

        # Filter records into latest updates
        #
        # Sorry Tom K., we'll be more modern ASAWC.
        # For now it's good ol' Kitchen Sink
        #
        # from owslib.fes import PropertyIsGreaterThanOrEqualTo
        # modified = PropertyIsGreaterThanOrEqualTo(
        #     'apiso:Modified',
        #     '2015-04-04'
        # )
        # csw.getrecords2(constraints=[modified])
        #
        # Kitchen Sink is the valid HNAP, we need HNAP for R1 to debug issues
        # This filter was supplied by EC, the CSW service technical lead
        current_request = request_template % (records_per_request,
                                              next_record,
                                              start_date)

        # (active_page*records_per_request)+1
        csw.getrecords2(format='xml', xml=current_request)
        active_page += 1

        # Identify if we need to continue this.
        records_root = ("/csw:GetRecordsResponse")
        # Read the file, should be a streamed input in the future
        root = etree.XML(csw.response)
        # Parse the root and itterate over each record
        records = fetchXMLArray(root, records_root)

        timestamp = fetchXMLAttribute(records[0],
                                      "csw:SearchStatus",
                                      "timestamp")[0]
        number_of_records_matched = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsMatched")[0])
        number_of_records_returned = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsReturned")[0])
        next_record = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "nextRecord")[0])
        # nextRecord == 0 means the server has no further pages
        if next_record > number_of_records_matched or next_record == 0:
            pass
        else:
            request_another = True

        # When we move to Tom K's filter we can use results in an R2 unified
        # harvester
        # print csw.results
        # for rec in csw.records:
        #     print '* '+csw.records[rec].title

        # Till then we need to collect and dump the response from the CSW
        # No use minimizing the XML to try to create a XML Lines file as the
        # data has carriage returns.
        # parser = etree.XMLParser(remove_blank_text=True)
        # elem = etree.XML(csw.response, parser=parser)
        # print etree.tostring(elem)

        # Output the harvested page
        print csw.response
# -*- coding: utf-8 -*- # <nbformat>3.0</nbformat> # <codecell> from owslib import fes from owslib.csw import CatalogueServiceWeb schemes = set() c = CatalogueServiceWeb("https://data.noaa.gov/csw", timeout=20) fil = fes.PropertyIsLike(propertyname='apiso:AnyText', literal="*sea_surface_height_above_sea_level*", wildCard='*') c.getrecords2(constraints=[fil], maxrecords=1000, esn='full') # <codecell> for record, item in c.records.items(): for d in item.references: schemes.add(d['scheme']) # <codecell> for scheme in schemes: print scheme # <codecell> type(schemes) # <codecell>
def create_services_from_endpoint(url, catalog, greedy_opt=True): """ Generate service/services from an endpoint. WMS, WMTS, TMS endpoints correspond to a single service. ESRI, CSW endpoints corrispond to many services. :return: imported, message """ # this variable will collect any exception message during the routine. # will be used in the last step to send a message if "detected" var is False. messages = [] num_created = 0 endpoint = get_sanitized_endpoint(url) try: urllib2.urlopen(endpoint, timeout=10) except Exception as e: message = traceback.format_exception(*sys.exc_info()) LOGGER.error('Cannot open this endpoint: %s' % endpoint) LOGGER.error('ERROR MESSAGE: %s' % message) LOGGER.error(e, exc_info=True) return False, message detected = False # handle specific service types for some domains (WorldMap, Wrapper...) parsed_uri = urlparse(endpoint) domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) if domain == 'http://worldmap.harvard.edu/': service_type = 'Hypermap:WorldMap' title = 'Harvard WorldMap' abstract = 'Harvard WorldMap' endpoint = domain detected = True if domain in [ 'http://maps.nypl.org/', 'http://mapwarper.net/', 'http://warp.worldmap.harvard.edu/', ]: service_type = 'Hypermap:WARPER' title = 'Warper at %s' % domain abstract = 'Warper at %s' % domain detected = True # test if it is CSW, WMS, TMS, WMTS or Esri # CSW try: csw = CatalogueServiceWeb(endpoint) service_type = 'OGC:CSW' service_links = {} detected = True typenames = 'csw:Record' outputschema = 'http://www.opengis.net/cat/csw/2.0.2' if 'csw_harvest_pagesize' in settings.REGISTRY_PYCSW['manager']: pagesize = int(settings.REGISTRY_PYCSW['manager']['csw_harvest_pagesize']) else: pagesize = 10 LOGGER.debug('Harvesting CSW %s' % endpoint) # now get all records # get total number of records to loop against try: csw.getrecords2(typenames=typenames, resulttype='hits', outputschema=outputschema) matches = csw.results['matches'] except: # this is a CSW, but server rejects query raise 
RuntimeError(csw.response) if pagesize > matches: pagesize = matches LOGGER.info('Harvesting %d CSW records' % matches) # loop over all catalogue records incrementally for r in range(1, matches+1, pagesize): LOGGER.info('Parsing %s from %s' % (r, matches)) try: csw.getrecords2(typenames=typenames, startposition=r, maxrecords=pagesize, outputschema=outputschema, esn='full') except Exception as err: # this is a CSW, but server rejects query raise RuntimeError(csw.response) for k, v in csw.records.items(): # try to parse metadata try: LOGGER.info('Looking for service links') LOGGER.debug('Looking for service links via dct:references') if v.references: for ref in v.references: scheme = None if ref['scheme'] in [st[0] for st in SERVICE_TYPES]: if ref['url'] not in service_links: scheme = ref['scheme'] service_links[ref['url']] = scheme else: # loose detection scheme = detect_metadata_url_scheme(ref['url']) if scheme is not None: if ref['url'] not in service_links: service_links[ref['url']] = scheme if scheme is None: continue try: service = create_service_from_endpoint(ref['url'], scheme, catalog=catalog) if service is not None: num_created = num_created + 1 LOGGER.info('Found %s services on endpoint' % num_created) except Exception, e: LOGGER.error('Could not create service for %s : %s' % (scheme, ref['url'])) LOGGER.error(e, exc_info=True) LOGGER.debug('Looking for service links via the GeoNetwork-ish dc:URI') if v.uris: for u in v.uris: # loose detection scheme = detect_metadata_url_scheme(u['url']) if scheme is not None: if u['url'] not in service_links: service_links[u['url']] = scheme else: continue try: service = create_service_from_endpoint(u['url'], scheme, catalog=catalog) if service is not None: num_created = num_created + 1 LOGGER.info('Found %s services on endpoint' % num_created) except Exception, e: LOGGER.error('Could not create service for %s : %s' % (scheme, u['url'])) LOGGER.error(e, exc_info=True) except Exception as err: # parsing failed for some 
reason LOGGER.warning('Metadata parsing failed %s', err) LOGGER.error(err, exc_info=True)
def get_datasets_from_csw(csw_endpoint): csw = CatalogueServiceWeb('http://cida.usgs.gov/gdp/csw/') csw.getrecords2(esn='full', maxrecords=1000) return csw.records.values()
from owslib.csw import CatalogueServiceWeb cenia = CatalogueServiceWeb('http://geoportal.gov.cz/php/micka/csw/index.php') print (cenia.service) cenia.getrecords2() print (cenia.results) for rec in cenia.records: print (cenia.records[rec].title) from owslib.fes import PropertyIsLike, BBox, And, PropertyIsEqualTo wms_query = PropertyIsEqualTo('csw:AnyText', 'WMS') praha_query = BBox([14.22,49.94,14.71,50.18]) praha_and_wms = And([praha_query, wms_query]) cenia.getrecords2([praha_and_wms], esn='full') print (cenia.results) for recid in cenia.records: record = cenia.records[recid] print (u'{}: {} {} {} {}'.format(record.title, record.bbox.minx, record.bbox.miny, record.bbox.maxx, record.bbox.maxy)) zm_query = PropertyIsEqualTo('csw:AnyText', '%ZM10%') cenia.getrecords2([zm_query], esn='full') zm10 = cenia.records['CZ-CUZK-WMS-ZM10-P'] print (zm10.type) print (u'{}\n{}'.format(zm10.title, zm10.abstract)) url = zm10.references[0]['url']
class TestCSW(LiveServerTestCase):
    """Live-server tests for the registry's pycsw endpoint.

    Exercises the CSW 2.0.2 Basic Service Profile (GetCapabilities metadata,
    GetRecords with text/bbox constraints, JSON output) and the CSW 3.0.0
    OpenSearch mode (q / bbox / time parameters).
    """

    def setUp(self):
        # Point both the Django test client and pycsw's advertised URL at the
        # live test server so self-referencing operation URLs match.
        self.script_name = '/registry/{}/csw'.format(catalog_test_slug)
        self.url = '{}{}'.format(self.live_server_url, self.script_name)
        self.username = '******'
        self.password = '******'
        self.client = Client()
        user = User.objects.create(username=self.username)
        user.set_password(self.password)
        user.save()
        self.client.login(username=self.username, password=self.password)
        settings.REGISTRY_PYCSW['server']['url'] = self.url
        Catalog.objects.get_or_create(
            name=catalog_test_slug
        )
        # Start from an empty registry so the record counts below are exact.
        Layer.objects.all().delete()
        Service.objects.all().delete()
        print ""
        print ">>> with env:"
        print "REGISTRY_SKIP_CELERY: %s" % settings.REGISTRY_SKIP_CELERY
        print "REGISTRY_LIMIT_LAYERS: %s" % settings.REGISTRY_LIMIT_LAYERS
        print "REGISTRY_SEARCH_URL: %s" % settings.REGISTRY_SEARCH_URL
        print "REGISTRY_HARVEST_SERVICES: %s" % settings.REGISTRY_HARVEST_SERVICES
        print ""
        # Post the 10 Layers contained in this file: data/cswt_insert.xml
        path = os.path.join(settings.PROJECT_DIR, "..", "data", "cswt_insert.xml")
        with open(path, 'rb') as ff:
            payload = ff.read()
        content_type = "application/xml"
        res = self.client.post(self.url, data=payload, content_type=content_type)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(Layer.objects.all().count(), 10)

    def test_csw(self):
        # test 2.0.2 Basic Service Profile
        self.csw = CatalogueServiceWeb(self.url, version='2.0.2', username=self.username, password=self.password)
        self.assertEqual(self.csw.version, '2.0.2')
        self.assertIn('2.0.2', self.csw.parameters['version'].values)
        self.assertIn('3.0.0', self.csw.parameters['version'].values)
        # every advertised operation URL must point back at this server
        for op in self.csw.operations:
            for method in op.methods:
                self.assertEqual(self.csw.url, method['url'])
        self.assertTrue('Transaction' in [o.name for o in self.csw.operations])
        self.assertTrue('Harvest' in [o.name for o in self.csw.operations])
        get_records_op = self.csw.get_operation_by_name('GetRecords')
        self.assertIn('application/json', get_records_op.parameters['outputFormat']['values'])
        # test basic search, no predicates
        self.csw.getrecords2()
        self.assertEqual(Layer.objects.all().count(), self.csw.results['matches'])
        # test csw:AnyText
        anytext = PropertyIsLike('csw:AnyText', 'Brasilia')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 1)
        anytext = PropertyIsLike('csw:AnyText', 'roads')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 4)
        # test ogc:BBOX
        bbox = BBox(['-13', '-80', '15', '-30'])
        self.csw.getrecords2(constraints=[bbox])
        self.assertEqual(self.csw.results['matches'], 2)
        # test csw:AnyText OR ogc:BBOX (flat constraint list = OR)
        self.csw.getrecords2(constraints=[anytext, bbox])
        self.assertEqual(self.csw.results['matches'], 5)
        # test csw:AnyText && ogc:BBOX (nested constraint list = AND)
        self.csw.getrecords2(constraints=[[anytext, bbox]])
        self.assertEqual(self.csw.results['matches'], 1)
        # test that ElementSetName=full stores full metadata record as inserted
        self.csw.getrecords2(esn='full')
        self.assertIn('xmlns:registry="http://gis.harvard.edu/HHypermap/registry/0.1"', self.csw.response)
        # test JSON output
        # TODO: fix owslib.csw.CatalogueServiceWeb.getrecords2 to handle non-XML request/response
        with self.assertRaises(XMLSyntaxError):
            self.csw.getrecords2(constraints=[anytext, bbox], format='application/json')
        records_json = json.loads(self.csw.response)
        self.assertEqual(records_json['csw:GetRecordsResponse']['csw:SearchResults']['@numberOfRecordsMatched'], '5')
        # test 3.0.0 OpenSearch
        bsp = {
            'mode': 'opensearch',
            'service': 'CSW',
            'version': '3.0.0',
            'request': 'GetRecords',
            'typenames': 'csw:Record',
            'elementsetname': 'full',
            'outputformat': 'application/json'
        }
        # test basic search, no predicates
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        # test q
        bsp['q'] = 'Brasilia'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        bsp.pop('q')
        # test bbox
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '2')
        bsp.pop('bbox')
        # test time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        bsp.pop('time')
        # test q and bbox
        bsp['q'] = 'roads'
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        # test q and bbox and time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

    @classmethod
    def tearDownClass(cls):
        # Workaround for https://code.djangoproject.com/ticket/22414
        # Persistent connections not closed by LiveServerTestCase, preventing dropping test databases
        # https://github.com/cjerdonek/django/commit/b07fbca02688a0f8eb159f0dde132e7498aa40cc
        def close_sessions(conn):
            # Terminate every other backend attached to the test database.
            close_sessions_query = """
                SELECT pg_terminate_backend(pg_stat_activity.pid)
                FROM pg_stat_activity
                WHERE
                    datname = current_database() AND
                    pid <> pg_backend_pid();
            """
            with conn.cursor() as cursor:
                try:
                    cursor.execute(close_sessions_query)
                except OperationalError:
                    # We get kicked out after closing.
                    pass
        for alias in connections:
            connections[alias].close()
            close_sessions(connections[alias])
        print "Forcefully closed database connections."
# #### Query each CSW catalog for the cf_name_filters constructed above # <codecell> from owslib.csw import CatalogueServiceWeb from utilities import normalize_service_urn var_results = [] for x in range(len(cf_name_filters)): var_name = variables_to_query[x] single_var_filter = cf_name_filters[x] for url in known_csw_servers: try: csw = CatalogueServiceWeb(url, timeout=20) csw.getrecords2(constraints=[single_var_filter], maxrecords=1000, esn='full') for record, item in csw.records.items(): for d in item.references: result = dict(variable=var_name, scheme=normalize_service_urn(d['scheme']), url=d['url'], server=url, title=record.title()) var_results.append(result) except BaseException, e: print "- FAILED: %s - %s" % (url, e) # <markdowncell> # <div class="error"><strong>Paginating CSW Records</strong> - Some servers have a maximum amount of records you can retrieve at once. See: https://github.com/ioos/system-test/issues/126</div>
print('GetDomain not supported') # <codecell> box=[-72.0, 41.0, -69.0, 43.0] # gulf of maine # <codecell> val = 'salinity' filter1 = fes.PropertyIsLike(propertyname='apiso:AnyText',literal=('*%s*' % val), escapeChar='\\',wildCard='*',singleChar='?') filter_list = [ filter1 ] # <codecell> csw.getrecords2(constraints=filter_list,maxrecords=1000,esn='full') len(csw.records.keys()) # <codecell> for rec,item in csw.records.iteritems(): print item.title # <codecell> choice=np.random.choice(list(csw.records.keys())) print csw.records[choice].title csw.records[choice].references # <codecell>
def create_services_from_endpoint(url, catalog, greedy_opt=True):
    """
    Generate service/services from an endpoint.
    WMS, WMTS, TMS endpoints correspond to a single service.
    ESRI, CSW endpoints corrispond to many services.

    :param url: endpoint URL to probe.
    :param catalog: catalog the created service(s) are attached to.
    :param greedy_opt: Esri only — when True harvest every folder; when False
        only the single service addressed by ``url``.
    :return: imported, message
    """
    # this variable will collect any exception message during the routine.
    # will be used in the last step to send a message if "detected" var is False.
    messages = []
    num_created = 0
    endpoint = get_sanitized_endpoint(url)
    # Reachability probe: bail out early if the endpoint cannot be opened.
    try:
        urllib.request.urlopen(endpoint, timeout=10)
    except Exception as e:
        message = traceback.format_exception(*sys.exc_info())
        LOGGER.error('Cannot open this endpoint: %s' % endpoint)
        LOGGER.error('ERROR MESSAGE: %s' % message)
        LOGGER.error(e, exc_info=True)
        return False, message
    detected = False
    # handle specific service types for some domains (WorldMap, Wrapper...)
    parsed_uri = urlparse(endpoint)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if domain == 'http://worldmap.harvard.edu/':
        service_type = 'Hypermap:WorldMap'
        title = 'Harvard WorldMap'
        abstract = 'Harvard WorldMap'
        endpoint = domain
        detected = True
    if domain in [
        'http://maps.nypl.org/',
        'http://mapwarper.net/',
        'http://warp.worldmap.harvard.edu/',
    ]:
        service_type = 'Hypermap:WARPER'
        title = 'Warper at %s' % domain
        abstract = 'Warper at %s' % domain
        detected = True
    # test if it is CSW, WMS, TMS, WMTS or Esri
    # CSW
    try:
        csw = CatalogueServiceWeb(endpoint)
        service_type = 'OGC:CSW'
        service_links = {}
        detected = True
        typenames = 'csw:Record'
        outputschema = 'http://www.opengis.net/cat/csw/2.0.2'
        if 'csw_harvest_pagesize' in settings.REGISTRY_PYCSW['manager']:
            pagesize = int(
                settings.REGISTRY_PYCSW['manager']['csw_harvest_pagesize'])
        else:
            pagesize = 10
        LOGGER.debug('Harvesting CSW %s' % endpoint)
        # now get all records
        # get total number of records to loop against
        try:
            csw.getrecords2(typenames=typenames, resulttype='hits',
                            outputschema=outputschema)
            matches = csw.results['matches']
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit; narrow to Exception while keeping the same re-raise.
        except Exception:
            # this is a CSW, but server rejects query
            raise RuntimeError(csw.response)
        if pagesize > matches:
            pagesize = matches
        LOGGER.info('Harvesting %d CSW records' % matches)
        # loop over all catalogue records incrementally
        for r in range(1, matches + 1, pagesize):
            LOGGER.info('Parsing %s from %s' % (r, matches))
            try:
                csw.getrecords2(typenames=typenames, startposition=r,
                                maxrecords=pagesize,
                                outputschema=outputschema, esn='full')
            except Exception as err:
                # this is a CSW, but server rejects query
                raise RuntimeError(csw.response)
            for k, v in list(csw.records.items()):
                # try to parse metadata
                try:
                    LOGGER.info('Looking for service links')
                    LOGGER.debug(
                        'Looking for service links via dct:references')
                    if v.references:
                        for ref in v.references:
                            scheme = None
                            if ref['scheme'] in [
                                st[0] for st in SERVICE_TYPES
                            ]:
                                if ref['url'] not in service_links:
                                    scheme = ref['scheme']
                                    service_links[ref['url']] = scheme
                            else:
                                # loose detection
                                scheme = detect_metadata_url_scheme(ref['url'])
                                if scheme is not None:
                                    if ref['url'] not in service_links:
                                        service_links[ref['url']] = scheme
                            # scheme stays None for duplicates too: skip them
                            if scheme is None:
                                continue
                            try:
                                service = create_service_from_endpoint(
                                    ref['url'], scheme, catalog=catalog)
                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info(
                                        'Found %s services on endpoint' % num_created)
                            except Exception as e:
                                LOGGER.error(
                                    'Could not create service for %s : %s' %
                                    (scheme, ref['url']))
                                LOGGER.error(e, exc_info=True)
                    LOGGER.debug(
                        'Looking for service links via the GeoNetwork-ish dc:URI'
                    )
                    if v.uris:
                        for u in v.uris:
                            # loose detection
                            scheme = detect_metadata_url_scheme(u['url'])
                            if scheme is not None:
                                if u['url'] not in service_links:
                                    service_links[u['url']] = scheme
                            else:
                                continue
                            try:
                                service = create_service_from_endpoint(
                                    u['url'], scheme, catalog=catalog)
                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info(
                                        'Found %s services on endpoint' % num_created)
                            except Exception as e:
                                LOGGER.error(
                                    'Could not create service for %s : %s' %
                                    (scheme, u['url']))
                                LOGGER.error(e, exc_info=True)
                except Exception as err:
                    # parsing failed for some reason
                    LOGGER.warning('Metadata parsing failed %s', err)
                    LOGGER.error(err, exc_info=True)
    except XMLSyntaxError as e:
        # This is not XML, so likely not a CSW. Moving on.
        pass
    except Exception as e:
        LOGGER.error(e, exc_info=True)
        messages.append(str(e))
    # WMS
    if not detected:
        try:
            service = get_wms_version_negotiate(endpoint, timeout=10)
            service_type = 'OGC:WMS'
            # BUG FIX: a stray trailing comma made `title` a 1-tuple.
            title = service.identification.title
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a WMS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))
    # TMS
    if not detected:
        try:
            service = TileMapService(endpoint, timeout=10)
            service_type = 'OSGeo:TMS'
            # BUG FIX: a stray trailing comma made `title` a 1-tuple.
            title = service.identification.title
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a TsMS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))
    # WMTS
    if not detected:
        try:
            # @tomkralidis timeout is not implemented for WebMapTileService?
            service = WebMapTileService(endpoint)
            service_type = 'OGC:WMTS'
            # BUG FIX: a stray trailing comma made `title` a 1-tuple.
            title = service.identification.title
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a WMTS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))
    # if detected, let's create the service
    if detected and service_type != 'OGC:CSW':
        try:
            service = create_service_from_endpoint(endpoint, service_type,
                                                   title, abstract=abstract,
                                                   catalog=catalog)
            if service is not None:
                num_created = num_created + 1
        except XMLSyntaxError as e:
            # This is not XML, so likely not a OGC:CSW. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))
    # Esri
    # a good sample is here: https://gis.ngdc.noaa.gov/arcgis/rest/services
    # we can safely assume the following condition (at least it is true for 1170 services)
    # we need to test this as arcrest.Folder can freeze with not esri url such as this one:
    # http://hh.worldmap.harvard.edu/admin/aggregator/service/?q=%2Frest%2Fservices
    if '/rest/services' in endpoint:
        if not detected:
            try:
                esri = arcrest.Folder(endpoint)
                service_type = 'ESRI'
                detected = True
                service_to_process, folder_to_process = esri.services, esri.folders
                if not greedy_opt:
                    # harvest only the single service addressed by `url`
                    folder_to_process = []
                    sections = service_url_parse(url)
                    service_to_process = get_single_service(esri, sections)
                processed_services = process_esri_services(
                    service_to_process, catalog)
                num_created = num_created + len(processed_services)
                for folder in folder_to_process:
                    folder_services = process_esri_services(
                        folder.services, catalog)
                    num_created = num_created + len(folder_services)
            except Exception as e:
                LOGGER.error(e, exc_info=True)
                messages.append(str(e))
    if detected:
        return True, '%s service/s created' % num_created
    else:
        m = '|'.join(messages)
        return False, 'ERROR! Could not detect service type for ' \
                      'endpoint %s or already existing. messages=(%s)' % (endpoint, m)
# #### Query each CSW catalog for revery model_name_filter constructed above # <codecell> from owslib.csw import CatalogueServiceWeb model_results = [] for x in range(len(model_name_filters)): model_name = known_model_strings[x] single_model_filter = model_name_filters[x] for url in known_csw_servers: try: csw = CatalogueServiceWeb(url, timeout=20) csw.getrecords2(constraints=[single_model_filter], maxrecords=1000, esn='full') for record, item in csw.records.items(): for d in item.references: result = dict(model=model_name, scheme=d['scheme'], url=d['url'], server=url) model_results.append(result) except BaseException as e: print "- FAILED: %s - %s" % (url, e.msg) # <markdowncell> # <div class="error"><strong>Paginating CSW Records</strong> - Some servers have a maximum amount of records you can retrieve at once. See: https://github.com/ioos/system-test/issues/126</div> # <markdowncell>
class CswScanner: def __init__(self): self.napids = [] self.start_pos = 0 # Get the CSW URL, Username and Password ini_config = ConfigParser() ini_config.read('harvester.ini') csw_url = ini_config.get('csw', 'csw.url') csw_user = ini_config.get('csw', 'csw.username') csw_passwd = ini_config.get('csw', 'csw.password') if csw_user and csw_passwd: self.csw = CatalogueServiceWeb(csw_url, username=csw_user, password=csw_passwd, timeout=20) else: self.csw = CatalogueServiceWeb(csw_url, timeout=20) #### The since date is currently being ignored. def get_all_ids(self, since=None): while True: if since is not None: scan_date = since.strftime('%Y-%m-%d') since_query = PropertyIsGreaterThanOrEqualTo( 'Modified', scan_date) self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata', constraints=[since_query]) else: #self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata') self.csw.getrecords2( xml= '<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" service="CSW" version="2.0.2" resultType="results" outputSchema="csw:IsoRecord"><csw:Query typeNames="gmd:MD_Metadata"><csw:Constraint version="1.1.0"><Filter xmlns="http://www.opengis.net/ogc" xmlns:gml="http://www.opengis.net/gml"/></csw:Constraint></csw:Query></csw:GetRecords>' ) if self.csw.results['returned'] == 0: break print '{0}Found {1}{2}{3} records'.format( Fore.GREEN, Fore.BLUE, self.csw.results['matches'], Fore.GREEN) print '{0}Next record: {1}{2}'.format( Fore.GREEN, Fore.BLUE, self.csw.results['nextrecord']) self.start_pos = self.csw.results['nextrecord'] for rec in self.csw.records: try: print u'{0}{1}{2}: {3}'.format( Fore.RED, rec, Fore.CYAN, self.csw.records[rec].title.decode('utf-8')) except UnicodeEncodeError: print u'{0}Unprintable title for {1}{2}'.format( Fore.GREEN, Fore.RED, rec) self.napids.append(rec) def load_naps(self): ns = Namespaces() gmd = ns.get_namespace('gmd') session = connect_to_database() for napid in 
self.napids: print '{0}Full NAP Record for {1}{2}'.format( Fore.GREEN, Fore.CYAN, napid) self.csw.getrecordbyid(id=[napid], outputschema=gmd) ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord) if ec_rec is None: ec_rec = ECRecord( uuid=self.csw.records[napid].identifier, title=self.csw.records[napid].identification.title, state='active', nap_record=self.csw.records[napid].xml, csw_scanned=datetime.now().isoformat()) else: ec_rec.title = self.csw.records[napid].identification.title, ec_rec.state = 'active', ec_rec.nap_record = self.csw.records[napid].xml, ec_rec.csw_scanned = datetime.now().isoformat() add_record(session, ec_rec) session.close_all()
# print '\n' + var_name.upper() num_recs = [] for location, bounding_box in locations.iteritems(): # print location bbox = fes.BBox(bounding_box) #use the search name to create search filter or_filt = fes.Or([fes.PropertyIsLike(propertyname='apiso:AnyText', literal='*%s*' % val, escapeChar='\\', wildCard='*', singleChar='?') for val in names_dict[var_name]["names"]]) filter_list = [fes.And([ bbox, or_filt])] # try request using multiple filters "and" syntax: [[filter1,filter2]] try: csw.getrecords2(constraints=filter_list, maxrecords=max_records, resulttype='hits') except Exception as e: print '\t' + 'ERROR - ' + str(e) num_recs.append(np.NaN) else: # print csw.results['matches'] num_recs.append(csw.results['matches']) results[var_name] = np.array(num_recs) # Save the results prod = list(itertools.product([endpoint], locations.keys())) mi = pd.MultiIndex.from_tuples(prod, names=['endpoint', 'location']) all_data.append(pd.DataFrame(results, index=mi)) # Update progress bar
class TestCSW(LiveServerTestCase):
    """Live-server tests for the registry's pycsw endpoint (CSW 2.0.2 + 3.0.0 OpenSearch).

    NOTE(review): a near-identical TestCSW appears earlier in this file; at
    module level the later definition shadows the earlier one — confirm only
    one is intended.
    """

    def setUp(self):
        # Point both the Django test client and pycsw's advertised URL at the
        # live test server so self-referencing operation URLs match.
        self.script_name = '/registry/{}/csw'.format(catalog_test_slug)
        self.url = '{}{}'.format(self.live_server_url, self.script_name)
        self.username = '******'
        self.password = '******'
        self.client = Client()
        user = User.objects.create(username=self.username)
        user.set_password(self.password)
        user.save()
        self.client.login(username=self.username, password=self.password)
        settings.REGISTRY_PYCSW['server']['url'] = self.url
        Catalog.objects.get_or_create(name=catalog_test_slug)
        # Start from an empty registry so the record counts below are exact.
        Layer.objects.all().delete()
        Service.objects.all().delete()
        print ""
        print ">>> with env:"
        print "REGISTRY_SKIP_CELERY: %s" % settings.REGISTRY_SKIP_CELERY
        print "REGISTRY_LIMIT_LAYERS: %s" % settings.REGISTRY_LIMIT_LAYERS
        print "REGISTRY_CHECK_PERIOD: %s" % settings.REGISTRY_CHECK_PERIOD
        print "REGISTRY_SEARCH_URL: %s" % settings.REGISTRY_SEARCH_URL
        print "REGISTRY_HARVEST_SERVICES: %s" % settings.REGISTRY_HARVEST_SERVICES
        print ""
        # Post the 10 Layers contained in this file: data/cswt_insert.xml
        path = os.path.join(settings.PROJECT_DIR, "..", "data", "cswt_insert.xml")
        with open(path, 'rb') as ff:
            payload = ff.read()
        content_type = "application/xml"
        res = self.client.post(self.url, data=payload, content_type=content_type)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(Layer.objects.all().count(), 10)

    def test_csw(self):
        # test 2.0.2 Basic Service Profile
        self.csw = CatalogueServiceWeb(self.url, version='2.0.2',
                                       username=self.username,
                                       password=self.password)
        self.assertEqual(self.csw.version, '2.0.2')
        self.assertIn('2.0.2', self.csw.parameters['version'].values)
        self.assertIn('3.0.0', self.csw.parameters['version'].values)
        # every advertised operation URL must point back at this server
        for op in self.csw.operations:
            for method in op.methods:
                self.assertEqual(self.csw.url, method['url'])
        self.assertTrue('Transaction' in [o.name for o in self.csw.operations])
        self.assertTrue('Harvest' in [o.name for o in self.csw.operations])
        get_records_op = self.csw.get_operation_by_name('GetRecords')
        self.assertIn('application/json',
                      get_records_op.parameters['outputFormat']['values'])
        # test basic search, no predicates
        self.csw.getrecords2()
        self.assertEqual(Layer.objects.all().count(), self.csw.results['matches'])
        # test csw:AnyText
        anytext = PropertyIsLike('csw:AnyText', 'Brasilia')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 1)
        anytext = PropertyIsLike('csw:AnyText', 'roads')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 4)
        # test ogc:BBOX
        bbox = BBox(['-13', '-80', '15', '-30'])
        self.csw.getrecords2(constraints=[bbox])
        self.assertEqual(self.csw.results['matches'], 2)
        # test csw:AnyText OR ogc:BBOX (flat constraint list = OR)
        self.csw.getrecords2(constraints=[anytext, bbox])
        self.assertEqual(self.csw.results['matches'], 5)
        # test csw:AnyText && ogc:BBOX (nested constraint list = AND)
        self.csw.getrecords2(constraints=[[anytext, bbox]])
        self.assertEqual(self.csw.results['matches'], 1)
        # test that ElementSetName=full stores full metadata record as inserted
        self.csw.getrecords2(esn='full')
        self.assertIn(
            'xmlns:registry="http://gis.harvard.edu/HHypermap/registry/0.1"',
            self.csw.response)
        # test JSON output
        # TODO: fix owslib.csw.CatalogueServiceWeb.getrecords2 to handle non-XML request/response
        with self.assertRaises(XMLSyntaxError):
            self.csw.getrecords2(constraints=[anytext, bbox], format='application/json')
        records_json = json.loads(self.csw.response)
        self.assertEqual(
            records_json['csw:GetRecordsResponse']['csw:SearchResults']
            ['@numberOfRecordsMatched'], '5')
        # test 3.0.0 OpenSearch
        bsp = {
            'mode': 'opensearch',
            'service': 'CSW',
            'version': '3.0.0',
            'request': 'GetRecords',
            'typenames': 'csw:Record',
            'elementsetname': 'full',
            'outputformat': 'application/json'
        }
        # test basic search, no predicates
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        # test q
        bsp['q'] = 'Brasilia'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        bsp.pop('q')
        # test bbox
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '2')
        bsp.pop('bbox')
        # test time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        bsp.pop('time')
        # test q and bbox
        bsp['q'] = 'roads'
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        # test q and bbox and time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

    @classmethod
    def tearDownClass(cls):
        # Workaround for https://code.djangoproject.com/ticket/22414
        # Persistent connections not closed by LiveServerTestCase, preventing dropping test databases
        # https://github.com/cjerdonek/django/commit/b07fbca02688a0f8eb159f0dde132e7498aa40cc
        def close_sessions(conn):
            # Terminate every other backend attached to the test database.
            close_sessions_query = """
                SELECT pg_terminate_backend(pg_stat_activity.pid)
                FROM pg_stat_activity
                WHERE
                    datname = current_database() AND
                    pid <> pg_backend_pid();
            """
            with conn.cursor() as cursor:
                try:
                    cursor.execute(close_sessions_query)
                except OperationalError:
                    # We get kicked out after closing.
                    pass
        for alias in connections:
            connections[alias].close()
            close_sessions(connections[alias])
        print "Forcefully closed database connections."
def test_csw_geoserver():
    """Smoke-test the GeoServer CSW endpoint: a csw:Record query must return,
    page and match at least one record."""
    catalogue = CatalogueServiceWeb(SERVICE_URL)
    catalogue.getrecords2(typenames='csw:Record')
    outcome = catalogue.results
    # Same three checks as before, in the same order.
    for counter in ('returned', 'nextrecord', 'matches'):
        assert outcome.get(counter) > 0
def create_services_from_endpoint(url):
    """
    Generate service/services from an endpoint.
    WMS, WMTS, TMS endpoints correspond to a single service.
    ESRI, CSW endpoints corrispond to many services.
    """
    num_created = 0
    endpoint = get_sanitized_endpoint(url)
    # Reachability probe: bail out early if the endpoint cannot be opened.
    try:
        urllib2.urlopen(endpoint, timeout=10)
    except Exception as e:
        print 'ERROR! Cannot open this endpoint: %s' % endpoint
        message = traceback.format_exception(*sys.exc_info())
        return False, message
    detected = False
    # handle specific service types for some domains (WorldMap, Wrapper...)
    parsed_uri = urlparse(endpoint)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if domain == 'http://worldmap.harvard.edu/':
        service_type = 'Hypermap:WorldMap'
        title = 'Harvard WorldMap'
        abstract = 'Harvard WorldMap'
        endpoint = domain
        detected = True
    if domain in [
        'http://maps.nypl.org/',
        'http://mapwarper.net/',
        'http://warp.worldmap.harvard.edu/',
    ]:
        service_type = 'Hypermap:WARPER'
        title = 'Warper at %s' % domain
        abstract = 'Warper at %s' % domain
        detected = True
    # test if it is CSW, WMS, TMS, WMTS or Esri
    # CSW
    try:
        csw = CatalogueServiceWeb(endpoint)
        service_type = 'OGC:CSW'
        service_links = {}
        detected = True
        typenames = 'csw:Record'
        outputschema = 'http://www.opengis.net/cat/csw/2.0.2'
        if 'csw_harvest_pagesize' in settings.PYCSW['manager']:
            pagesize = int(settings.PYCSW['manager']['csw_harvest_pagesize'])
        else:
            pagesize = 10
        print 'Harvesting CSW %s' % endpoint
        # now get all records
        # get total number of records to loop against
        try:
            csw.getrecords2(typenames=typenames, resulttype='hits', outputschema=outputschema)
            matches = csw.results['matches']
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit.
        except:
            # this is a CSW, but server rejects query
            raise RuntimeError(csw.response)
        if pagesize > matches:
            pagesize = matches
        print 'Harvesting %d CSW records' % matches
        # loop over all catalogue records incrementally
        for r in range(1, matches+1, pagesize):
            try:
                csw.getrecords2(typenames=typenames, startposition=r, maxrecords=pagesize, outputschema=outputschema, esn='full')
            except Exception as err:
                # this is a CSW, but server rejects query
                raise RuntimeError(csw.response)
            for k, v in csw.records.items():
                # try to parse metadata
                try:
                    LOGGER.info('Looking for service links')
                    if v.references:  # not empty
                        for ref in v.references:
                            # only schemes in the known SERVICE_TYPES list are kept
                            if ref['scheme'] in [st[0] for st in SERVICE_TYPES]:
                                if ref['url'] not in service_links:
                                    service_links[ref['url']] = ref['scheme']
                except Exception as err:
                    # parsing failed for some reason
                    LOGGER.warning('Metadata parsing failed %s', err)
        LOGGER.info('Service links found: %s', service_links)
        # create one service per collected url -> scheme link
        for k, v in service_links.items():
            try:
                service = create_service_from_endpoint(k, v)
                if service is not None:
                    num_created = num_created + 1
            except Exception as err:
                raise RuntimeError('HHypermap error: %s' % err)
    except Exception as e:
        print str(e)
    # WMS
    if not detected:
        try:
            service = WebMapService(endpoint, timeout=10)
            service_type = 'OGC:WMS'
            # NOTE(review): trailing comma makes `title` a 1-tuple — likely a bug.
            title = service.identification.title,
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)
    # TMS
    if not detected:
        try:
            service = TileMapService(endpoint, timeout=10)
            service_type = 'OSGeo:TMS'
            # NOTE(review): trailing comma makes `title` a 1-tuple — likely a bug.
            title = service.identification.title,
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)
    # WMTS
    if not detected:
        try:
            # @tomkralidis timeout is not implemented for WebMapTileService?
            service = WebMapTileService(endpoint)
            service_type = 'OGC:WMTS'
            # NOTE(review): trailing comma makes `title` a 1-tuple — likely a bug.
            title = service.identification.title,
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)
    # if detected, let's create the service
    if detected and service_type != 'OGC:CSW':
        try:
            service = create_service_from_endpoint(
                endpoint,
                service_type,
                title,
                abstract=abstract
            )
            if service is not None:
                num_created = num_created + 1
        except Exception as e:
            print str(e)
    # Esri
    # a good sample is here: https://gis.ngdc.noaa.gov/arcgis/rest/services
    # we can safely assume the following condition (at least it is true for 1170 services)
    # we need to test this as ArcFolder can freeze with not esri url such as this one:
    # http://hh.worldmap.harvard.edu/admin/aggregator/service/?q=%2Frest%2Fservices
    if '/rest/services' in endpoint:
        if not detected:
            try:
                esri = ArcFolder(endpoint)
                services = esri.services
                service_type = 'ESRI'
                detected = True
                # root
                root_services = process_esri_services(services)
                num_created = num_created + len(root_services)
                # folders
                for folder in esri.folders:
                    folder_services = process_esri_services(folder.services)
                    num_created = num_created + len(folder_services)
            except Exception as e:
                print str(e)
    if detected:
        return True, '%s service/s created' % num_created
    else:
        return False, 'ERROR! Could not detect service type for endpoint %s or already existing' % endpoint
class CSWRepository(HarvestRepository):
    """ CSW Repository

    Harvests a Catalogue Service for the Web (CSW) endpoint via OWSLib's
    CatalogueServiceWeb: pages through all records writing their headers to
    the local database, and converts individual CSW records into the
    OAI-style dicts the rest of the harvester pipeline expects.
    """

    def setRepoParams(self, repoParams):
        """Apply repository parameters and open the CSW connection.

        If the endpoint cannot be reached or parsed, ``self.cswrepo`` is set
        to None; ``_crawl``/``_update_record`` check for that and bail out,
        so a dead endpoint is skipped rather than fatal.
        """
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except Exception:
            # Deliberate best-effort: mark the repo unusable instead of
            # aborting the whole harvest run (was a bare ``except:``).
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        """Page through every record in the CSW feed and write its header.

        Uses the server-reported ``results['nextrecord']`` to advance;
        stops once the number of headers written equals the server's
        reported ``matches`` total.
        """
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)

        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return

        item_count = 0
        while True:
            try:
                # Continue from where the previous page left off.
                self.cswrepo.getrecords2(startposition=self.cswrepo.results['nextrecord'])
            except (AttributeError, KeyError):
                # First pass: no previous results to page from, so issue the
                # default request (was a bare ``except:`` that also hid real
                # transport errors).
                self.cswrepo.getrecords2()
            if not self.cswrepo.records:
                # Defensive: an empty page would otherwise spin forever when
                # the server's 'matches' total is never reached.
                self.logger.error("CSW returned an empty page before all matches were seen; stopping crawl")
                break
            for rec in self.cswrepo.records:
                self.db.write_header(self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    # +0.1 avoids division by zero on a very fast first batch
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count,
                                                                                              self.formatter.humanize(
                                                                                                  tdelta),
                                                                                              item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break
        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map an OWSLib CSW record onto the OAI-style dict used downstream.

        :param csw_record: record object as returned by OWSLib (needs
            ``title``, ``abstract`` and ``subjects`` attributes)
        :param local_identifier: identifier to store for this record
        :returns: dict with title/description/tags/identifier/creator/
            contact/series keys
        """
        record = {}
        record["title"] = csw_record.title
        record["description"] = csw_record.abstract
        record["tags"] = csw_record.subjects
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["contact"] = self.contact
        record["series"] = ""
        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate

        Defined inside the class body (no ``self``) so it can be used as a
        decorator at class-definition time.
        """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)
        # time.clock() was removed in Python 3.8; use perf_counter when
        # available and fall back to wall-clock time on very old interpreters.
        clock = getattr(time, "perf_counter", time.time)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                elapsed = clock() - last_time_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
                # NOTE(review): the lock is released before the wrapped call
                # and before last_time_called is updated, so concurrent
                # callers are only loosely rate-limited (original behavior,
                # kept as-is).
                lock.release()
                ret = func(*args, **kwargs)
                last_time_called[0] = clock()
                return ret
            return rate_limited_function
        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Fetch one record by id, convert it, and write it to the database.

        :param record: dict with at least a ``local_identifier`` key
        :returns: True when the record was written (or deleted), False when
            the CSW returned nothing for this id; None if the repo is
            unavailable.
        """
        if self.cswrepo is None:
            return
        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record, record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(id=[record['local_identifier']], outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[record['local_identifier']].datestamp
            # Strip any trailing time-of-day component, keeping just the date.
            oai_record["pub_date"] = re.sub(r"[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                                            oai_record["pub_date"])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(),
                                     self.domain_metadata)
                return True
            else:
                # This record was deleted
                self.db.delete_record(record)
                return True
        return False
print(csw.results['values']) except: print('GetDomain not supported') # In[5]: val = 'laterite' filter1 = fes.PropertyIsLike(propertyname='apiso:AnyText',literal=('*%s*' % val), escapeChar='\\',wildCard='*',singleChar='?') filter_list = [ filter1 ] # In[6]: csw.getrecords2(constraints=filter_list,maxrecords=100,esn='full') print len(csw.records.keys()) for rec in list(csw.records.keys()): print csw.records[rec].title # In[7]: choice=np.random.choice(list(csw.records.keys())) print(csw.records[choice].title) csw.records[choice].references # In[8]:
jd_stop = dt.datetime.strptime(stop_date, "%Y-%m-%d %H:%M") print start_date, "to", stop_date # <codecell> start, stop = dateRange(start_date, stop_date) filter1 = fes.PropertyIsLike( propertyname="apiso:AnyText", literal=("*%s*" % val), escapeChar="\\", wildCard="*", singleChar="?" ) bbox = fes.BBox(box, crs="urn:ogc:def:crs:OGC:1.3:CRS84") # <codecell> filter_list = [fes.And([bbox, filter1])] csw.getrecords2(constraints=filter_list) csw.results["matches"] # <codecell> filter_list = [fes.And([bbox, filter1, start, stop])] csw.getrecords2(constraints=filter_list) csw.results["matches"] # <codecell> startposition = 0 maxrecords = 20 while True: print "getting records %d to %d" % (startposition, startposition + pagesize)