def delete_records(context, database, table):
    """Deletes all records from repository"""
    LOGGER.info('Deleting all records')
    # an empty 'where' clause matches every record in the table
    wipe_all = {'where': '', 'values': []}
    repository.Repository(database, context, table=table).delete(constraint=wipe_all)
def delete_records(self, constraint=None):
    """Delete records from the repository matching *constraint*.

    :param constraint: pycsw constraint dict ({'where': ..., 'values': ...});
                       defaults to an empty 'where' clause, i.e. delete ALL
                       records.
    :returns: None
    """
    # A mutable dict default argument is shared across calls; use a None
    # sentinel and build the default fresh each time instead.
    if constraint is None:
        constraint = {'where': '', 'values': []}
    # `is None` instead of `== None` (identity check for the singleton)
    if self.context is None:
        print("[WARN] pycsw config not available. NOT DELETING RECORDS")
        return
    repo = repository.Repository(self.database, self.context, table=self.table)
    repo.delete(constraint=constraint)
def rebuild_db_indexes(context, database, table):
    """Rebuild database indexes"""
    LOGGER.info('Rebuilding database %s, table %s', database, table)
    repo = repository.Repository(database, context, table=table)
    # REINDEX must run outside a transaction, hence autocommit
    conn = repo.engine.connect()
    conn.autocommit = True
    conn.execute('REINDEX %s' % table)
    conn.close()
    LOGGER.info('Done')
def export_records(context, database, table, xml_dirpath):
    """Export metadata records from database to directory of files.

    :param context: pycsw StaticContext (provides md_core_model mappings)
    :param database: database connection URI
    :param table: records table name
    :param xml_dirpath: output directory (created if missing)
    :returns: tuple of file paths successfully written
    :raises RuntimeError: if the output directory cannot be created
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())
    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    # only files written without error are reported back to the caller
    exported_files = set()

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist. Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            LOGGER.exception('Could not create directory')
            raise RuntimeError('Could not create %s %s' % (dirpath, err)) from err

    for record in records.all():
        identifier = \
            getattr(record,
                    context.md_core_model['mappings']['pycsw:Identifier'])

        LOGGER.info('Processing %s', identifier)

        # sanitize identifier so it is safe to use as a filename
        identifier = util.secure_filename(identifier)
        # write to XML document
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            # record.xml may be bytes or str depending on the DB driver;
            # decode only when a decode() method is present
            if hasattr(record.xml, 'decode'):
                str_xml = record.xml.decode('utf-8')
            else:
                str_xml = record.xml
            with open(filename, 'w') as xml:
                xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml.write(str_xml)
        except Exception as err:
            # Something went wrong so skip over this file but log an error
            LOGGER.exception('Error writing %s to disk', filename)
            # If we wrote a partial file or created an empty file make sure it is removed
            if os.path.exists(filename):
                os.remove(filename)
            continue
        else:
            # success path: record the file as exported
            exported_files.add(filename)

    return tuple(exported_files)
def export_record_table_csv(context, database, table, mappings, xml_dirpath):
    """Export record table from database to csv file.

    :param context: pycsw StaticContext (provides md_core_model mappings)
    :param database: database connection URI
    :param table: records table name
    :param mappings: optional path to a custom mappings file; None uses
                     the core model
    :param xml_dirpath: output directory (created if missing); the CSV is
                        written there with a timestamped name
    :raises RuntimeError: if the output directory cannot be created
    """
    import csv
    from datetime import datetime

    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())
    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)
    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist. Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            # chain the original cause for debuggability
            raise RuntimeError('Could not create %s %s' % (dirpath, err)) from err

    # The mapping model is invariant across records: resolve it once,
    # not per record as before.
    if mappings is None:
        model = context.md_core_model
    else:
        model = import_model_from_file(mappings)
    map_dict = model['mappings']

    is_headers_written = False
    filename = os.path.join(dirpath, '%s.csv' % str(datetime.now()))
    # Python 3 csv requires a text-mode file opened with newline=''
    # (the previous 'wb' mode fails with "a bytes-like object is required").
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for record in records.all():
            headers = []
            row = []
            # iteritems() was Python 2 only; items() is the py3 equivalent
            for k, v in map_dict.items():
                field_value = getattr(record, map_dict[k], "")
                if is_headers_written is False:
                    headers.append(map_dict[k])
                row.append(field_value)
            # header row is emitted once, before the first data row
            if is_headers_written is False:
                csvwriter.writerow(headers)
            csvwriter.writerow(row)
            is_headers_written = True
def load_records(context, database, table, xml_dirpath,
                 recursive=False, force_update=False):
    """Load metadata records from directory of files to database.

    :param context: pycsw StaticContext (provides the XML parser)
    :param database: database connection URI
    :param table: records table name
    :param xml_dirpath: a single XML file, or a directory of ``*.xml`` files
    :param recursive: walk subdirectories of *xml_dirpath* when True
    :param force_update: update an existing record instead of skipping it
    """
    repo = repository.Repository(database, context, table=table)

    file_list = []

    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, _, files in os.walk(xml_dirpath):  # dirs unused
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)

    # enumerate replaces the manual counter
    for counter, recfile in enumerate(sorted(file_list), start=1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except Exception as err:
            # Logger.warn is a deprecated alias; use warning()
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        record = metadata.parse_record(context, exml, repo)

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                # repository raises RuntimeError on duplicate identifiers
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
def __init__(self, repository_database_uri, ows_url: str = '',
             public_s3_url: str = ''):
    """Wire up the pycsw static context, repository and collection list.

    :param repository_database_uri: database connection URI for the
        pycsw 'records' table
    :param ows_url: base OWS endpoint URL
    :param public_s3_url: public S3 base URL
    """
    self.collections = []
    self.ows_url = ows_url
    self.public_s3_url = public_s3_url

    logger.debug('Setting up static context')
    self.context = pycsw.core.config.StaticContext()

    logger.debug('Initializing pycsw repository')
    self.repo = repository.Repository(repository_database_uri,
                                      self.context, table='records')

    logger.debug('Loading collection level metadata identifiers')
    # collection ids are the metadata filenames minus their extension
    self.collections.extend(
        os.path.splitext(entry)[0]
        for entry in os.listdir(COLLECTION_LEVEL_METADATA))
def export_records(context, database, table, mappings, xml_dirpath):
    """Export metadata records from database to directory of files.

    :param context: pycsw StaticContext (provides md_core_model mappings)
    :param database: database connection URI
    :param table: records table name
    :param mappings: optional path to a custom mappings file; None uses
                     the core model
    :param xml_dirpath: output directory (created if missing)
    :raises RuntimeError: if the directory cannot be created or a record
                          cannot be written
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())
    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist. Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            # chain the original cause for debuggability
            raise RuntimeError('Could not create %s %s' % (dirpath, err)) from err

    # The mapping model is invariant across records: resolve it once,
    # not per record as before.
    if mappings is None:
        model = context.md_core_model
    else:
        model = import_model_from_file(mappings)
    id_attr = model['mappings']['pycsw:Identifier']
    xml_attr = model['mappings']['pycsw:XML']

    for record in records.all():
        identifier = getattr(record, id_attr)
        xml_field = getattr(record, xml_attr)
        LOGGER.info('Processing %s', identifier)
        if identifier.find(':') != -1:  # it's a URN
            # sanitize identifier: keep only the last URN component
            LOGGER.info(' Sanitizing identifier')
            identifier = identifier.split(':')[-1]

        # write to XML document
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            with open(filename, 'w') as xml:
                xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml.write(xml_field)
        except Exception as err:
            # chain the cause rather than passing it as a second
            # positional RuntimeError argument
            raise RuntimeError("Error writing to %s" % filename) from err
def optimize_db(context, database, table):
    """Optimize database"""
    from sqlalchemy.exc import ArgumentError, OperationalError

    LOGGER.info('Optimizing database %s', database)
    repo = repository.Repository(database, context, table=table)
    conn = repo.engine.connect()
    try:
        # PostgreSQL: VACUUM must run outside a transaction
        conn.execution_options(isolation_level="AUTOCOMMIT")
        conn.execute('VACUUM ANALYZE')
    except (ArgumentError, OperationalError):
        # SQLite: no VACUUM ANALYZE; run the two statements separately
        conn.autocommit = True
        conn.execute('VACUUM')
        conn.execute('ANALYZE')
    finally:
        conn.close()
    LOGGER.info('Done')
def gen_sitemap(context, database, table, url, output_file):
    """generate an XML sitemap from all records in repository"""
    # get configuration and init repo connection
    repo = repository.Repository(database, context, table=table)

    ns = context.namespaces

    # root <urlset> element carrying the sitemap schemaLocation
    urlset = etree.Element(util.nspath_eval('sitemap:urlset', ns), nsmap=ns)
    urlset.attrib[util.nspath_eval('xsi:schemaLocation', ns)] = \
        '%s http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd' % \
        ns['sitemap']

    # fetch all records
    matches, results = repo.query(constraint={}, maxrecords=99999999)
    LOGGER.info('Found %s records', matches)

    id_field = context.md_core_model['mappings']['pycsw:Identifier']
    for result in results:
        entry = etree.SubElement(urlset, util.nspath_eval('sitemap:url', ns))
        loc = '%s?service=CSW&version=2.0.2&request=GetRepositoryItem&id=%s' % \
            (url, getattr(result, id_field))
        etree.SubElement(entry, util.nspath_eval('sitemap:loc', ns)).text = loc

    # serialize the sitemap to disk
    LOGGER.info('Writing to %s', output_file)
    with open(output_file, 'wb') as ofile:
        ofile.write(etree.tostring(urlset, pretty_print=1,
                                   encoding='utf8', xml_declaration=1))
def refresh_harvested_records(context, database, table, url):
    """refresh / harvest all non-local records in repository"""
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repo = repository.Repository(database, context, table=table)

    # select every record whose mdsource is not 'local'
    count, records = repo.query(constraint={
        'where': "mdsource != 'local'",
        'values': []
    })

    if int(count) > 0:
        LOGGER.info('Refreshing %s harvested records', count)
        csw = CatalogueServiceWeb(url)
        mappings = context.md_core_model['mappings']

        for rec in records:
            source = getattr(rec, mappings['pycsw:Source'])
            schema = getattr(rec, mappings['pycsw:Schema'])
            identifier = getattr(rec, mappings['pycsw:Identifier'])

            LOGGER.info('Harvesting %s (identifier = %s) ...',
                        source, identifier)
            # TODO: find a smarter way of catching this
            if schema == 'http://www.isotc211.org/2005/gmd':
                schema = 'http://www.isotc211.org/schemas/2005/gmd/'
            try:
                csw.harvest(source, schema)
                LOGGER.info(csw.response)
            except Exception:
                LOGGER.exception('Could not harvest')
    else:
        LOGGER.info('No harvested records')
def __init__(self, config: ConfigParser):
    """
    constructor

    :param config: ConfigParser pycsw configuration dict

    :returns: `pycsw.ogc.api.API` instance
    """
    self.config = config

    log.setup_logger(self.config)

    # server.url may be written as '${VAR}'; resolve it from the
    # environment before use (mutates self.config in place below)
    if self.config['server']['url'].startswith('${'):
        LOGGER.debug(
            f"Server URL is an environment variable: {self.config['server']['url']}"
        )
        url_ = match_env_var(self.config['server']['url'])
    else:
        url_ = self.config['server']['url']

    LOGGER.debug(f'Server URL: {url_}')
    # normalize: no trailing slash
    self.config['server']['url'] = url_.rstrip('/')

    self.context = StaticContext()

    LOGGER.debug('Setting maxrecords')
    try:
        self.maxrecords = int(self.config['server']['maxrecords'])
    except KeyError:
        # default page size when server.maxrecords is not configured
        self.maxrecords = 10
    LOGGER.debug(f'maxrecords: {self.maxrecords}')

    # optional SQL filter restricting which rows the repository exposes
    repo_filter = None
    if self.config.has_option('repository', 'filter'):
        repo_filter = self.config.get('repository', 'filter')

    self.orm = 'sqlalchemy'
    from pycsw.core import repository
    try:
        LOGGER.info('Loading default repository')
        self.repository = repository.Repository(
            self.config.get('repository', 'database'),
            self.context,
            # self.environ.get('local.app_root', None),
            None,
            self.config.get('repository', 'table'),
            repo_filter
        )
        LOGGER.debug(f'Repository loaded {self.repository.dbtype}')
    except Exception as err:
        # repository is mandatory: log and re-raise to abort startup
        msg = f'Could not load repository {err}'
        LOGGER.exception(msg)
        raise

    # map OGC API query parameter names onto repository ORM columns
    self.query_mappings = {
        'type': self.repository.dataset.type,
        'recordUpdated': self.repository.dataset.insert_date,
        'title': self.repository.dataset.title,
        'description': self.repository.dataset.abstract,
        'keywords': self.repository.dataset.keywords,
        'anytext': self.repository.dataset.anytext,
        'bbox': self.repository.dataset.wkt_geometry
    }
def load(pycsw_config, ckan_url):
    """Synchronize CKAN datasets into the pycsw repository.

    NOTE(review): this is Python 2 code (`raise X, y` / `except X, err`
    syntax). In this visible span only deletions are applied; the `new`
    and `changed` sets are computed but not consumed — presumably the
    insert/update passes follow elsewhere; confirm against the py3
    variant of this function.
    """
    database = pycsw_config.get('repository', 'database')
    table_name = pycsw_config.get('repository', 'table', 'records')

    context = pycsw.core.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info('Started gathering CKAN datasets identifiers: {0}'.format(
        str(datetime.datetime.now())))

    # CKAN search API query; %s placeholder is the paging offset
    query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0
    gathered_records = {}

    # page through the CKAN API 1000 datasets at a time
    while True:
        url = ckan_url + query % start
        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError, 'Wrong API response: %s' % listing
        results = listing.get('results')
        if not results:
            break
        for result in results:
            gathered_records[result['id']] = {
                'metadata_modified': result['metadata_modified'],
                'harvest_object_id': result['extras']['harvest_object_id'],
                'source': result['extras'].get('metadata_source')
            }
        start = start + 1000
        log.debug('Gathered %s' % start)

    log.info('Gather finished ({0} datasets): {1}'.format(
        len(gathered_records.keys()), str(datetime.datetime.now())))

    # snapshot of what is already in the repository: ckan_id -> modified
    existing_records = {}

    query = repo.session.query(repo.dataset.ckan_id,
                               repo.dataset.ckan_modified)
    for row in query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    # diff CKAN against the repository
    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]['metadata_modified'] > existing_records[key]:
            changed.add(key)

    # remove records no longer present in CKAN
    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info('Deleted %s' % ckan_id)
            repo.session.commit()
        except Exception, err:
            repo.session.rollback()
            raise
def dispatch(self, writer=sys.stdout, write_headers=True):
    ''' Handle incoming HTTP request

    Parses the KVP/POST request, negotiates the CSW version, wires up
    the version interface and repository, validates the basic keywords
    (service/version/request), executes the requested operation and
    returns the serialized response.

    NOTE(review): legacy Python 2 code — `iteritems()` and the
    `self.async` attribute (now a reserved word) do not run on Python 3.
    '''
    if self.requesttype == 'GET':
        self.kvp = self.normalize_kvp(self.kvp)
        if (('version' in self.kvp and
             self.kvp['version'] == '2.0.2') or
            ('acceptversions' in self.kvp and
             '2.0.2' in self.kvp['acceptversions'])):
            self.request_version = '2.0.2'
    elif self.requesttype == 'POST':
        # sniff the raw POST body for the requested version
        if self.request.find('2.0.2') != -1:
            self.request_version = '2.0.2'

    # SRU requests are translated into CSW 2.0.2 requests up front
    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'sru'):
        self.mode = 'sru'
        self.request_version = '2.0.2'
        LOGGER.debug('SRU mode detected; processing request.')
        self.kvp = self.sru().request_sru2csw(self.kvp)

    # OAI-PMH requests likewise map onto CSW 2.0.2
    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'oaipmh'):
        self.mode = 'oaipmh'
        self.request_version = '2.0.2'
        LOGGER.debug('OAI-PMH mode detected; processing request.')
        # keep the original arguments for building the OAI-PMH response
        self.oaiargs = dict((k, v) for k, v in self.kvp.items() if k)
        self.kvp = self.oaipmh().request(self.kvp)

    if self.request_version == '2.0.2':
        self.iface = csw2.Csw2(server_csw=self)
        self.context.set_model('csw')

    # configure transaction support, if specified in config
    self._gen_manager()

    # generate domain model
    # NOTE: We should probably avoid this sort of mutable state for WSGI
    if 'GetDomain' not in self.context.model['operations']:
        self.context.model['operations']['GetDomain'] = \
            self.context.gen_domains()

    # generate distributed search model, if specified in config
    if self.config.has_option('server', 'federatedcatalogues'):
        LOGGER.debug('Configuring distributed search.')
        self.context.model['constraints']['FederatedCatalogues'] = \
            {'values': []}
        for fedcat in \
                self.config.get('server', 'federatedcatalogues').split(','):
            self.context.model\
                ['constraints']['FederatedCatalogues']['values'].append(fedcat)

    # advertise every configured output schema on the relevant operations
    for key, value in self.outputschemas.iteritems():
        self.context.model['operations']['GetRecords']['parameters'][
            'outputSchema']['values'].append(value.NAMESPACE)
        self.context.model['operations']['GetRecordById']['parameters'][
            'outputSchema']['values'].append(value.NAMESPACE)
        if 'Harvest' in self.context.model['operations']:
            self.context.model['operations']['Harvest']['parameters'][
                'ResourceType']['values'].append(value.NAMESPACE)

    LOGGER.debug('Setting MaxRecordDefault')
    if self.config.has_option('server', 'maxrecords'):
        self.context.model['constraints']['MaxRecordDefault']['values'] = \
            [self.config.get('server', 'maxrecords')]

    # load profiles
    if self.config.has_option('server', 'profiles'):
        self.profiles = pprofile.load_profiles(
            os.path.join('pycsw', 'plugins', 'profiles'),
            pprofile.Profile,
            self.config.get('server', 'profiles'))
        for prof in self.profiles['plugins'].keys():
            tmp = self.profiles['plugins'][prof](self.context.model,
                                                 self.context.namespaces,
                                                 self.context)
            key = tmp.outputschema  # to ref by outputschema
            self.profiles['loaded'][key] = tmp
            self.profiles['loaded'][key].extend_core(
                self.context.model, self.context.namespaces, self.config)
        LOGGER.debug('Profiles loaded: %s.' % self.profiles['loaded'].keys())

    # init repository
    # look for tablename, set 'records' as default
    if not self.config.has_option('repository', 'table'):
        self.config.set('repository', 'table', 'records')

    repo_filter = None
    if self.config.has_option('repository', 'filter'):
        repo_filter = self.config.get('repository', 'filter')

    if (self.config.has_option('repository', 'source') and
            self.config.get('repository', 'source') == 'geonode'):
        # load geonode repository
        from pycsw.plugins.repository.geonode import geonode_
        try:
            self.repository = \
                geonode_.GeoNodeRepository(self.context)
            LOGGER.debug('GeoNode repository loaded (geonode): %s.'
                         % self.repository.dbtype)
        except Exception as err:
            self.response = self.iface.exceptionreport(
                'NoApplicableCode', 'service',
                'Could not load repository (geonode): %s' % str(err))
    elif (self.config.has_option('repository', 'source') and
            self.config.get('repository', 'source') == 'odc'):
        # load odc repository
        from pycsw.plugins.repository.odc import odc
        try:
            self.repository = \
                odc.OpenDataCatalogRepository(self.context)
            LOGGER.debug('OpenDataCatalog repository loaded (geonode): %s.'
                         % self.repository.dbtype)
        except Exception as err:
            self.response = self.iface.exceptionreport(
                'NoApplicableCode', 'service',
                'Could not load repository (odc): %s' % str(err))
    else:
        # load default repository
        self.orm = 'sqlalchemy'
        from pycsw.core import repository
        try:
            self.repository = \
                repository.Repository(
                    self.config.get('repository', 'database'),
                    self.context,
                    self.environ.get('local.app_root', None),
                    self.config.get('repository', 'table'),
                    repo_filter)
            LOGGER.debug('Repository loaded (local): %s.'
                         % self.repository.dbtype)
        except Exception as err:
            self.response = self.iface.exceptionreport(
                'NoApplicableCode', 'service',
                'Could not load repository (local): %s' % str(err))

    if self.requesttype == 'POST':
        LOGGER.debug(self.iface.version)
        self.kvp = self.iface.parse_postdata(self.request)

    error = 0

    if isinstance(self.kvp, str):
        # parse_postdata returns a string on failure: it's an exception
        error = 1
        locator = 'service'
        text = self.kvp
        if (self.kvp.find('the document is not valid') != -1 or
                self.kvp.find('document not well-formed') != -1):
            code = 'NoApplicableCode'
        else:
            code = 'InvalidParameterValue'

    LOGGER.debug('HTTP Headers:\n%s.' % self.environ)
    LOGGER.debug('Parsed request parameters: %s' % self.kvp)

    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'opensearch'):
        self.mode = 'opensearch'
        LOGGER.debug('OpenSearch mode detected; processing request.')
        self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

    # bare base-URL request: default to a csw30 GetCapabilities
    if ((self.kvp == {'': ''} and self.request_version == '3.0.0') or
            (len(self.kvp) == 1 and 'config' in self.kvp)):
        LOGGER.debug('Turning on default csw30:Capabilities for base URL')
        self.kvp = {
            'service': 'CSW',
            'acceptversions': '3.0.0',
            'request': 'GetCapabilities'
        }
        if ('HTTP_ACCEPT' in self.environ and
                'application/opensearchdescription+xml' in
                self.environ['HTTP_ACCEPT']):
            self.mode = 'opensearch'
            self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

    if error == 0:
        # test for the basic keyword values (service, version, request)
        basic_options = ['service', 'request']
        if self.request_version == '2.0.2':
            basic_options.append('version')

        for k in basic_options:
            if k not in self.kvp:
                # version may be omitted on GetCapabilities
                if (k in ['version', 'acceptversions'] and
                        'request' in self.kvp and
                        self.kvp['request'] == 'GetCapabilities'):
                    pass
                else:
                    error = 1
                    locator = k
                    code = 'MissingParameterValue'
                    text = 'Missing keyword: %s' % k
                    break

        # test each of the basic keyword values
        if error == 0:
            # test service
            if self.kvp['service'] != 'CSW':
                error = 1
                locator = 'service'
                code = 'InvalidParameterValue'
                text = 'Invalid value for service: %s.\
 Value MUST be CSW' % self.kvp['service']

            # test version
            if ('version' in self.kvp and
                    util.get_version_integer(self.kvp['version']) !=
                    util.get_version_integer(self.request_version) and
                    self.kvp['request'] != 'GetCapabilities'):
                error = 1
                locator = 'version'
                code = 'InvalidParameterValue'
                text = 'Invalid value for version: %s.\
 Value MUST be 2.0.2 or 3.0.0' % self.kvp['version']

            # check for GetCapabilities acceptversions
            if 'acceptversions' in self.kvp:
                for vers in self.kvp['acceptversions'].split(','):
                    if (util.get_version_integer(vers) ==
                            util.get_version_integer(self.request_version)):
                        break
                    else:
                        error = 1
                        locator = 'acceptversions'
                        code = 'VersionNegotiationFailed'
                        text = 'Invalid parameter value in acceptversions:\
 %s. Value MUST be 2.0.2 or 3.0.0' % \
                            self.kvp['acceptversions']

            # test request
            if self.kvp['request'] not in \
                    self.context.model['operations'].keys():
                error = 1
                locator = 'request'
                if self.kvp['request'] in ['Transaction', 'Harvest']:
                    code = 'OperationNotSupported'
                    text = '%s operations are not supported' % \
                        self.kvp['request']
                else:
                    code = 'InvalidParameterValue'
                    text = 'Invalid value for request: %s' % \
                        self.kvp['request']

    if error == 1:
        # return an ExceptionReport
        self.response = self.iface.exceptionreport(code, locator, text)
    else:
        # process per the request value
        if 'responsehandler' in self.kvp:
            # set flag to process asynchronously
            import threading
            self.async = True
            if ('requestid' not in self.kvp or
                    self.kvp['requestid'] is None):
                import uuid
                self.kvp['requestid'] = str(uuid.uuid4())

        if self.kvp['request'] == 'GetCapabilities':
            self.response = self.iface.getcapabilities()
        elif self.kvp['request'] == 'DescribeRecord':
            self.response = self.iface.describerecord()
        elif self.kvp['request'] == 'GetDomain':
            self.response = self.iface.getdomain()
        elif self.kvp['request'] == 'GetRecords':
            if self.async:  # process asynchronously
                threading.Thread(target=self.iface.getrecords).start()
                self.response = self.iface._write_acknowledgement()
            else:
                self.response = self.iface.getrecords()
        elif self.kvp['request'] == 'GetRecordById':
            self.response = self.iface.getrecordbyid()
        elif self.kvp['request'] == 'GetRepositoryItem':
            self.response = self.iface.getrepositoryitem()
        elif self.kvp['request'] == 'Transaction':
            self.response = self.iface.transaction()
        elif self.kvp['request'] == 'Harvest':
            if self.async:  # process asynchronously
                threading.Thread(target=self.iface.harvest).start()
                self.response = self.iface._write_acknowledgement()
            else:
                self.response = self.iface.harvest()
        else:
            self.response = self.iface.exceptionreport(
                'InvalidParameterValue', 'request',
                'Invalid request parameter: %s' % self.kvp['request'])

    # post-process the CSW response back into the requested protocol
    if self.mode == 'sru':
        LOGGER.debug('SRU mode detected; processing response.')
        self.response = self.sru().response_csw2sru(self.response,
                                                    self.environ)
    elif self.mode == 'opensearch':
        LOGGER.debug('OpenSearch mode detected; processing response.')
        self.response = self.opensearch().response_csw2opensearch(
            self.response, self.config)
    elif self.mode == 'oaipmh':
        LOGGER.debug('OAI-PMH mode detected; processing response.')
        self.response = self.oaipmh().response(
            self.response, self.oaiargs, self.repository,
            self.config.get('server', 'url'))

    return self._write_response()
def load(pycsw_config, ckan_url):
    """Synchronize CKAN datasets into the pycsw repository.

    Gathers dataset identifiers from the CKAN search API, diffs them
    against the repository, then deletes removed records, inserts new
    ones and updates changed ones.

    :param pycsw_config: ConfigParser-like pycsw configuration
    :param ckan_url: base URL of the CKAN instance (with trailing slash)
    """
    database = pycsw_config.get("repository", "database")
    table_name = pycsw_config.get("repository", "table", "records")

    context = pycsw.core.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info("Started gathering CKAN datasets identifiers: {0}".format(
        str(datetime.datetime.now())))

    # CKAN search API query; %s placeholder is the paging offset
    query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,' \
            'extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0
    gathered_records = {}

    # page through the CKAN API 1000 datasets at a time
    while True:
        url = ckan_url + query % start
        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError("Wrong API response: %s" % listing)
        results = listing.get("results")
        if not results:
            break
        for result in results:
            gathered_records[result["id"]] = {
                "metadata_modified": result["metadata_modified"],
                "harvest_object_id": result["extras"]["harvest_object_id"],
                "source": result["extras"].get("metadata_source"),
            }
        start = start + 1000
        log.debug("Gathered %s" % start)

    log.info("Gather finished ({0} datasets): {1}".format(
        len(gathered_records.keys()), str(datetime.datetime.now())))

    # snapshot of what is already in the repository: ckan_id -> modified
    existing_records = {}

    query = repo.session.query(repo.dataset.ckan_id,
                               repo.dataset.ckan_modified)
    for row in query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    # diff CKAN against the repository
    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]["metadata_modified"] > existing_records[key]:
            changed.add(key)

    # remove records no longer present in CKAN
    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info("Deleted %s" % ckan_id)
            repo.session.commit()
        except Exception:
            repo.session.rollback()
            raise

    # insert records new to the repository; failures are logged, not fatal
    for ckan_id in new:
        ckan_info = gathered_records[ckan_id]
        record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
        if not record:
            log.info("Skipped record %s" % ckan_id)
            continue
        try:
            repo.insert(record, "local", util.get_today_and_now())
            log.info("Inserted %s" % ckan_id)
        except Exception as err:
            log.error("ERROR: not inserted %s Error:%s" % (ckan_id, err))

    # update records whose metadata_modified advanced in CKAN
    for ckan_id in changed:
        ckan_info = gathered_records[ckan_id]
        record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
        if not record:
            continue
        # map ORM columns to the freshly parsed record's values,
        # skipping SQLAlchemy's internal instance-state attribute
        update_dict = dict([(getattr(repo.dataset, key),
                             getattr(record, key))
                            for key in record.__dict__.keys()
                            if key != "_sa_instance_state"])
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset).filter_by(ckan_id=ckan_id).update(update_dict)
            repo.session.commit()
            log.info("Changed %s" % ckan_id)
        except Exception as err:
            repo.session.rollback()
            raise RuntimeError("ERROR: %s" % str(err))
def load_records(context, database, table, xml_dirpath,
                 recursive=False, force_update=False):
    """Load metadata records from directory of files to database.

    :param context: pycsw StaticContext (provides the XML parser)
    :param database: database connection URI
    :param table: records table name
    :param xml_dirpath: a single XML file, or a directory of ``*.xml`` files
    :param recursive: walk subdirectories of *xml_dirpath* when True
    :param force_update: on insert failure, update the record instead
    :returns: tuple of file paths successfully inserted or updated
    """
    from sqlalchemy.exc import DBAPIError

    repo = repository.Repository(database, context, table=table)

    file_list = []

    loaded_files = set()
    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except etree.XMLSyntaxError as err:
            LOGGER.error('XML document "%s" is not well-formed',
                         recfile, exc_info=True)
            continue
        except Exception as err:
            # non-syntax parse failure (e.g. I/O); same handling, skip file
            LOGGER.exception('XML document "%s" is not well-formed', recfile)
            continue

        try:
            record = metadata.parse_record(context, exml, repo)
        except Exception as err:
            LOGGER.exception('Could not parse "%s" as an XML record', recfile)
            continue

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                loaded_files.add(recfile)
                LOGGER.info('Inserted %s', recfile)
            except Exception as err:
                # NOTE(review): any insert error is treated as "record
                # exists" when force_update is set — confirm this is the
                # intended semantics
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated %s', recfile)
                    loaded_files.add(recfile)
                else:
                    if isinstance(err, DBAPIError) and err.args:
                        # Pull a decent database error message and not the full SQL that was run
                        # since INSERT SQL statements are rather large.
                        LOGGER.error('ERROR: %s not inserted: %s', recfile,
                                     err.args[0], exc_info=True)
                    else:
                        LOGGER.error('ERROR: %s not inserted: %s', recfile,
                                     err, exc_info=True)

    return tuple(loaded_files)
def optimize_db(context, database, table):
    """Optimize database by running VACUUM ANALYZE on it.

    :param context: pycsw StaticContext
    :param database: database connection URI
    :param table: records table name
    """
    LOGGER.info('Optimizing database %s', database)
    repos = repository.Repository(database, context, table=table)
    connection = repos.engine.connect()
    try:
        # VACUUM cannot run inside a transaction block; request
        # autocommit explicitly (mirrors the PostgreSQL path used by
        # the sibling optimize_db implementation)
        connection.execution_options(isolation_level="AUTOCOMMIT")
        connection.execute('VACUUM ANALYZE')
    finally:
        # The previous one-liner chained .connect().execute(...).close(),
        # which closed the ResultProxy and leaked the Connection; close
        # the connection itself, even on error.
        connection.close()
    LOGGER.info('Done')
def dispatch(self, writer=sys.stdout, write_headers=True):
    """Handle an incoming HTTP request and return the serialized response.

    Detects the negotiated CSW version and any alternate front-end mode
    (SRU / OAI-PMH / OpenSearch), validates the basic KVP options, loads the
    repository, dispatches to the matching operation handler, then
    post-processes the response for the detected mode.
    """
    if self.requesttype == 'GET':
        self.kvp = self.normalize_kvp(self.kvp)
        version_202 = ('version' in self.kvp and
                       self.kvp['version'] == '2.0.2')
        accept_version_202 = ('acceptversions' in self.kvp and
                              '2.0.2' in self.kvp['acceptversions'])
        if version_202 or accept_version_202:
            self.request_version = '2.0.2'
    elif self.requesttype == 'POST':
        # cheap byte-scan of the raw POST body for the version marker
        if self.request.find(b'2.0.2') != -1:
            self.request_version = '2.0.2'

    # SRU front-end: translate the SRU KVP request into CSW KVP
    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'sru'):
        self.mode = 'sru'
        self.request_version = '2.0.2'
        LOGGER.info('SRU mode detected; processing request')
        self.kvp = self.sru().request_sru2csw(self.kvp)

    # OAI-PMH front-end: keep the original args for response shaping below
    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'oaipmh'):
        self.mode = 'oaipmh'
        self.request_version = '2.0.2'
        LOGGER.info('OAI-PMH mode detected; processing request.')
        self.oaiargs = dict((k, v) for k, v in self.kvp.items() if k)
        self.kvp = self.oaipmh().request(self.kvp)

    if self.request_version == '2.0.2':
        self.iface = csw2.Csw2(server_csw=self)
        self.context.set_model('csw')

    # configure transaction support, if specified in config
    self._gen_manager()

    namespaces = self.context.namespaces
    ops = self.context.model['operations']
    constraints = self.context.model['constraints']

    # generate domain model
    # NOTE: We should probably avoid this sort of mutable state for WSGI
    if 'GetDomain' not in ops:
        ops['GetDomain'] = self.context.gen_domains()

    # generate distributed search model, if specified in config
    if self.config.has_option('server', 'federatedcatalogues'):
        LOGGER.info('Configuring distributed search')

        constraints['FederatedCatalogues'] = {'values': []}

        for fedcat in self.config.get('server',
                                      'federatedcatalogues').split(','):
            LOGGER.debug('federated catalogue: %s', fedcat)
            constraints['FederatedCatalogues']['values'].append(fedcat)

    # advertise each registered output schema on the relevant operations
    for key, value in self.outputschemas.items():
        get_records_params = ops['GetRecords']['parameters']
        get_records_params['outputSchema']['values'].append(
            value.NAMESPACE)
        get_records_by_id_params = ops['GetRecordById']['parameters']
        get_records_by_id_params['outputSchema']['values'].append(
            value.NAMESPACE)
        if 'Harvest' in ops:
            harvest_params = ops['Harvest']['parameters']
            harvest_params['ResourceType']['values'].append(
                value.NAMESPACE)

    LOGGER.info('Setting MaxRecordDefault')
    if self.config.has_option('server', 'maxrecords'):
        constraints['MaxRecordDefault']['values'] = [
            self.config.get('server', 'maxrecords')]

    # load profiles
    if self.config.has_option('server', 'profiles'):
        self.profiles = pprofile.load_profiles(
            os.path.join('pycsw', 'plugins', 'profiles'),
            pprofile.Profile,
            self.config.get('server', 'profiles')
        )

        for prof in self.profiles['plugins'].keys():
            tmp = self.profiles['plugins'][prof](self.context.model,
                                                 namespaces,
                                                 self.context)
            key = tmp.outputschema  # to ref by outputschema
            self.profiles['loaded'][key] = tmp
            self.profiles['loaded'][key].extend_core(self.context.model,
                                                     namespaces,
                                                     self.config)

        LOGGER.debug('Profiles loaded: %s' %
                     list(self.profiles['loaded'].keys()))

    # init repository
    # look for tablename, set 'records' as default
    if not self.config.has_option('repository', 'table'):
        self.config.set('repository', 'table', 'records')

    repo_filter = None
    if self.config.has_option('repository', 'filter'):
        repo_filter = self.config.get('repository', 'filter')

    if self.config.has_option('repository', 'source'):
        # load custom repository from a dotted "module.Class" path
        rs = self.config.get('repository', 'source')
        rs_modname, rs_clsname = rs.rsplit('.', 1)

        rs_mod = __import__(rs_modname, globals(), locals(), [rs_clsname])
        rs_cls = getattr(rs_mod, rs_clsname)

        try:
            self.repository = rs_cls(self.context, repo_filter)
            LOGGER.debug('Custom repository %s loaded (%s)', rs,
                         self.repository.dbtype)
        except Exception as err:
            msg = 'Could not load custom repository'
            LOGGER.exception(msg)
            self.response = self.iface.exceptionreport(
                'NoApplicableCode', 'service', msg)
    else:
        # load default repository
        self.orm = 'sqlalchemy'
        from pycsw.core import repository
        try:
            LOGGER.info('Loading default repository')
            self.repository = repository.Repository(
                self.config.get('repository', 'database'),
                self.context,
                self.environ.get('local.app_root', None),
                self.config.get('repository', 'table'),
                repo_filter
            )
            LOGGER.debug(
                'Repository loaded (local): %s.' % self.repository.dbtype)
        except Exception as err:
            msg = 'Could not load repository (local)'
            LOGGER.exception(msg)
            self.response = self.iface.exceptionreport(
                'NoApplicableCode', 'service', msg)

    if self.requesttype == 'POST':
        LOGGER.debug('HTTP POST request')
        LOGGER.debug('CSW version: %s', self.iface.version)
        self.kvp = self.iface.parse_postdata(self.request)

    error = 0

    # parse_postdata / normalize_kvp return a str to signal a parse error
    if isinstance(self.kvp, str):  # it's an exception
        error = 1
        locator = 'service'
        text = self.kvp
        if (self.kvp.find('the document is not valid') != -1 or
                self.kvp.find('document not well-formed') != -1):
            code = 'NoApplicableCode'
        else:
            code = 'InvalidParameterValue'

    LOGGER.debug('HTTP Headers:\n%s.', self.environ)
    LOGGER.debug('Parsed request parameters: %s', self.kvp)

    if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
            self.kvp['mode'] == 'opensearch'):
        self.mode = 'opensearch'
        LOGGER.info('OpenSearch mode detected; processing request.')
        self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

    # bare base-URL request: default to a csw30 GetCapabilities
    if ((len(self.kvp) == 0 and self.request_version == '3.0.0') or
            (len(self.kvp) == 1 and 'config' in self.kvp)):
        LOGGER.info('Turning on default csw30:Capabilities for base URL')
        self.kvp = {
            'service': 'CSW',
            'acceptversions': '3.0.0',
            'request': 'GetCapabilities'
        }
        http_accept = self.environ.get('HTTP_ACCEPT', '')
        if 'application/opensearchdescription+xml' in http_accept:
            self.mode = 'opensearch'
            self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

    if error == 0:
        # test for the basic keyword values (service, version, request)
        basic_options = ['service', 'request']
        request = self.kvp.get('request', '')
        own_version_integer = util.get_version_integer(
            self.request_version)
        if self.request_version == '2.0.2':
            basic_options.append('version')

        for k in basic_options:
            if k not in self.kvp:
                # version/acceptversions are optional for GetCapabilities
                if (k in ['version', 'acceptversions'] and
                        request == 'GetCapabilities'):
                    pass
                else:
                    error = 1
                    locator = k
                    code = 'MissingParameterValue'
                    text = 'Missing keyword: %s' % k
                    break

        # test each of the basic keyword values
        if error == 0:
            # test service
            if self.kvp['service'] != 'CSW':
                error = 1
                locator = 'service'
                code = 'InvalidParameterValue'
                text = 'Invalid value for service: %s.\
 Value MUST be CSW' % self.kvp['service']

            # test version
            kvp_version = self.kvp.get('version', '')
            try:
                kvp_version_integer = util.get_version_integer(kvp_version)
            except Exception as err:
                kvp_version_integer = 'invalid_value'
            if (request != 'GetCapabilities' and
                    kvp_version_integer != own_version_integer):
                error = 1
                locator = 'version'
                code = 'InvalidParameterValue'
                text = ('Invalid value for version: %s. Value MUST be '
                        '2.0.2 or 3.0.0' % kvp_version)

            # check for GetCapabilities acceptversions
            if 'acceptversions' in self.kvp:
                # for/else: the else runs only if no offered version matched
                for vers in self.kvp['acceptversions'].split(','):
                    vers_integer = util.get_version_integer(vers)
                    if vers_integer == own_version_integer:
                        break
                else:
                    error = 1
                    locator = 'acceptversions'
                    code = 'VersionNegotiationFailed'
                    text = ('Invalid parameter value in '
                            'acceptversions: %s. Value MUST be '
                            '2.0.2 or 3.0.0' %
                            self.kvp['acceptversions'])

            # test request
            if self.kvp['request'] not in \
                    self.context.model['operations']:
                error = 1
                locator = 'request'
                if request in ['Transaction', 'Harvest']:
                    code = 'OperationNotSupported'
                    text = '%s operations are not supported' % request
                else:
                    code = 'InvalidParameterValue'
                    text = 'Invalid value for request: %s' % request

    if error == 1:  # return an ExceptionReport
        LOGGER.error('basic service options error: %s, %s, %s',
                     code, locator, text)
        self.response = self.iface.exceptionreport(code, locator, text)

    else:  # process per the request value
        if 'responsehandler' in self.kvp:
            # set flag to process asynchronously
            import threading
            # NOTE(review): 'async' became a reserved keyword in Python 3.7,
            # so this attribute name is a SyntaxError on modern interpreters.
            # It is presumably also initialized in __init__ (read below even
            # when no responsehandler is given) — rename in both places.
            self.async = True
            request_id = self.kvp.get('requestid', None)
            if request_id is None:
                import uuid
                self.kvp['requestid'] = str(uuid.uuid4())

        if self.kvp['request'] == 'GetCapabilities':
            self.response = self.iface.getcapabilities()
        elif self.kvp['request'] == 'DescribeRecord':
            self.response = self.iface.describerecord()
        elif self.kvp['request'] == 'GetDomain':
            self.response = self.iface.getdomain()
        elif self.kvp['request'] == 'GetRecords':
            if self.async:  # process asynchronously
                threading.Thread(target=self.iface.getrecords).start()
                self.response = self.iface._write_acknowledgement()
            else:
                self.response = self.iface.getrecords()
        elif self.kvp['request'] == 'GetRecordById':
            self.response = self.iface.getrecordbyid()
        elif self.kvp['request'] == 'GetRepositoryItem':
            self.response = self.iface.getrepositoryitem()
        elif self.kvp['request'] == 'Transaction':
            self.response = self.iface.transaction()
        elif self.kvp['request'] == 'Harvest':
            if self.async:  # process asynchronously
                threading.Thread(target=self.iface.harvest).start()
                self.response = self.iface._write_acknowledgement()
            else:
                self.response = self.iface.harvest()
        else:
            self.response = self.iface.exceptionreport(
                'InvalidParameterValue', 'request',
                'Invalid request parameter: %s' % self.kvp['request']
            )
        LOGGER.info('Request processed')

    # post-process the CSW response for the detected front-end mode
    if self.mode == 'sru':
        LOGGER.info('SRU mode detected; processing response.')
        self.response = self.sru().response_csw2sru(self.response,
                                                    self.environ)
    elif self.mode == 'opensearch':
        LOGGER.info('OpenSearch mode detected; processing response.')
        self.response = self.opensearch().response_csw2opensearch(
            self.response, self.config)

    elif self.mode == 'oaipmh':
        LOGGER.info('OAI-PMH mode detected; processing response.')
        self.response = self.oaipmh().response(
            self.response, self.oaiargs, self.repository,
            self.config.get('server', 'url')
        )

    return self._write_response()
def __init__(self, config: ConfigParser):
    """
    constructor

    :param config: ConfigParser pycsw configuration dict

    :returns: `pycsw.ogc.api.API` instance

    :raises Exception: re-raised if the repository cannot be loaded
    """

    self.config = config

    log.setup_logger(self.config)

    # resolve a server URL given as an environment-variable placeholder
    if self.config['server']['url'].startswith('${'):
        LOGGER.debug(f"Server URL is an environment variable: {self.config['server']['url']}")
        url_ = match_env_var(self.config['server']['url'])
    else:
        url_ = self.config['server']['url']

    LOGGER.debug(f'Server URL: {url_}')
    self.config['server']['url'] = url_.rstrip('/')

    self.context = StaticContext()

    LOGGER.debug('Setting maxrecords')
    try:
        self.maxrecords = int(self.config['server']['maxrecords'])
    except KeyError:
        # not configured; fall back to default page size
        self.maxrecords = 10
    LOGGER.debug(f'maxrecords: {self.maxrecords}')

    repo_filter = None
    if self.config.has_option('repository', 'filter'):
        repo_filter = self.config.get('repository', 'filter')

    custom_mappings_path = self.config.get('repository', 'mappings', fallback=None)
    if custom_mappings_path is not None:
        md_core_model = load_custom_repo_mappings(custom_mappings_path)
        if md_core_model is not None:
            self.context.md_core_model = md_core_model
        else:
            # BUGFIX: was LOGGER.exception(), which is only valid inside an
            # exception handler (it would log a spurious "NoneType: None"
            # traceback here); no exception is active, so use error().
            LOGGER.error(
                'Could not load custom mappings: %s', custom_mappings_path)

    self.orm = 'sqlalchemy'
    from pycsw.core import repository
    try:
        LOGGER.info('Loading default repository')
        self.repository = repository.Repository(
            self.config.get('repository', 'database'),
            self.context,
            # self.environ.get('local.app_root', None),
            None,
            self.config.get('repository', 'table'),
            repo_filter
        )
        LOGGER.debug(f'Repository loaded {self.repository.dbtype}')
    except Exception as err:
        msg = f'Could not load repository {err}'
        LOGGER.exception(msg)
        raise

    # map OGC API - Records query parameters to repository columns
    self.query_mappings = {
        'type': self.repository.dataset.type,
        'parentidentifier': self.repository.dataset.parentidentifier,
        'collections': self.repository.dataset.parentidentifier,
        'recordUpdated': self.repository.dataset.insert_date,
        'title': self.repository.dataset.title,
        'description': self.repository.dataset.abstract,
        'keywords': self.repository.dataset.keywords,
        'anytext': self.repository.dataset.anytext,
        'bbox': self.repository.dataset.wkt_geometry,
        'date': self.repository.dataset.date,
        'time_begin': self.repository.dataset.time_begin,
        'time_end': self.repository.dataset.time_end,
        'platform': self.repository.dataset.platform,
        'instrument': self.repository.dataset.instrument,
        'sensortype': self.repository.dataset.sensortype
    }

    # native PostGIS stores the geometry in wkb_geometry instead
    if self.repository.dbtype == 'postgresql+postgis+native':
        self.query_mappings['bbox'] = self.repository.dataset.wkb_geometry