def delete_records(context, database, table):
    """Remove every metadata record from the repository.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    """
    LOGGER.info('Deleting all records')
    repos = repository.Repository(database, context, table=table)
    # an empty 'where' clause matches all records
    repos.delete(constraint={'where': '', 'values': []})
def refresh_harvested_records(context, database, table, url):
    """Refresh / re-harvest all non-local records in the repository.

    Queries the repository for records whose source is not 'local' and
    re-harvests each one through a CSW Harvest request against *url*.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    :param url: CSW endpoint to send Harvest requests to
    """
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # get all harvested (non-local) records
    count, records = repos.query(constraint={'where': 'source != "local"'})

    if int(count) > 0:
        LOGGER.info('Refreshing %s harvested records', count)
        csw = CatalogueServiceWeb(url)

        for rec in records:
            source = \
                getattr(rec, context.md_core_model['mappings']['pycsw:Source'])
            schema = \
                getattr(rec, context.md_core_model['mappings']['pycsw:Schema'])
            identifier = \
                getattr(rec,
                        context.md_core_model['mappings']['pycsw:Identifier'])
            LOGGER.info('Harvesting %s (identifier = %s) ...',
                        source, identifier)
            # TODO: find a smarter way of catching this
            if schema == 'http://www.isotc211.org/2005/gmd':
                schema = 'http://www.isotc211.org/schemas/2005/gmd/'
            # best-effort: log and continue on a failed harvest
            except_clause_note = None  # noqa: placeholder removed below
            try:
                csw.harvest(source, schema)
                LOGGER.info(csw.response)
            except Exception as err:  # was Python 2-only `except Exception, err`
                LOGGER.warn(err)
def load_records(context, database, table, xml_dirpath, recursive=False,
                 force_update=False):
    """Load metadata records from a directory of XML files into the database.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    :param xml_dirpath: directory containing ``*.xml`` metadata files
    :param recursive: walk subdirectories when True
    :param force_update: update existing records instead of skipping them
    """
    repo = repository.Repository(database, context, table=table)

    # collect candidate files, optionally descending into subdirectories
    if recursive:
        file_list = [
            os.path.join(root, name)
            for root, _, files in os.walk(xml_dirpath)
            for name in files
            if name.endswith('.xml')
        ]
    else:
        file_list = list(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    for counter, recfile in enumerate(sorted(file_list), start=1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document; skip files that are not well-formed XML
        try:
            exml = etree.parse(recfile)
        except Exception as err:
            LOGGER.warn('XML document is not well-formed: %s', str(err))
            continue

        for rec in metadata.parse_record(context, exml, repo):
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                # repository signals "already exists" via RuntimeError
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warn('ERROR: not inserted %s', err)
def export_records(context, database, table, xml_dirpath):
    """Export metadata records from database to directory of files.

    NOTE(review): this definition looks truncated — it prepares the output
    directory but never writes any records; a fuller version of the same
    function appears later in this file and shadows this one. Confirm
    whether this copy should be removed.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    :param xml_dirpath: destination directory for exported XML files
    :raises RuntimeError: if the destination directory cannot be created
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())
    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)
    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist. Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:  # was Python 2-only `except OSError, err`
            raise RuntimeError('Could not create %s %s' % (dirpath, err))
def gen_sitemap(context, database, table, url, output_file):
    """Generate an XML sitemap from all records in the repository.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    :param url: base CSW URL used to build each GetRepositoryItem link
    :param output_file: path of the sitemap XML file to write
    """
    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # write out sitemap document
    urlset = etree.Element(
        util.nspath_eval('sitemap:urlset', context.namespaces),
        nsmap=context.namespaces)

    schema_loc = util.nspath_eval('xsi:schemaLocation', context.namespaces)
    urlset.attrib[schema_loc] = \
        '%s http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd' % \
        context.namespaces['sitemap']

    # get all records
    count, records = repos.query(constraint={}, maxrecords=99999999)

    LOGGER.info('Found %s records', count)

    for rec in records:
        # BUGFIX: the original rebound the `url` parameter to this element,
        # so the GetRepositoryItem URI was built from the lxml element
        # object instead of the base CSW URL
        url_elem = etree.SubElement(
            urlset, util.nspath_eval('sitemap:url', context.namespaces))
        uri = '%s?service=CSW&version=2.0.2&request=GetRepositoryItem&id=%s' % \
            (url,
             getattr(rec,
                     context.md_core_model['mappings']['pycsw:Identifier']))
        etree.SubElement(
            url_elem,
            util.nspath_eval('sitemap:loc', context.namespaces)).text = uri

    # write to file
    LOGGER.info('Writing to %s', output_file)
    with open(output_file, 'w') as ofile:
        ofile.write(etree.tostring(urlset, pretty_print=1,
                                   encoding='utf8', xml_declaration=1))
def export_records(context, database, table, xml_dirpath):
    """Export every metadata record in the database to individual XML files.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    :param xml_dirpath: destination directory for exported XML files
    :raises RuntimeError: if the directory cannot be created or a file
        cannot be written
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())
    LOGGER.info('Exporting records\n')

    # make sure the destination directory exists before writing anything
    dirpath = os.path.abspath(xml_dirpath)
    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist. Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            raise RuntimeError('Could not create %s %s' % (dirpath, err))

    id_attr = context.md_core_model['mappings']['pycsw:Identifier']

    for record in records.all():
        identifier = getattr(record, id_attr)

        LOGGER.info('Processing %s', identifier)
        if ':' in identifier:  # it's a URN
            # sanitize identifier: keep only the last URN component
            LOGGER.info(' Sanitizing identifier')
            identifier = identifier.split(':')[-1]

        # write record to its own XML document
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            with open(filename, 'w') as xml:
                xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml.write(record.xml)
        except Exception as err:
            raise RuntimeError("Error writing to %s" % filename, err)
def optimize_db(context, database, table):
    """Optimize the database via ``VACUUM ANALYZE``.

    :param context: pycsw static context
    :param database: database connection string
    :param table: records table name
    """
    LOGGER.info('Optimizing database %s', database)
    repos = repository.Repository(database, context, table=table)
    # BUGFIX: the original called .close() on the execute() result proxy,
    # leaving the Connection itself open; close the connection explicitly
    connection = repos.engine.connect()
    try:
        connection.execute('VACUUM ANALYZE')
    finally:
        connection.close()
def load(pycsw_config, ckan_url):
    """Synchronise CKAN harvested datasets into the pycsw repository.

    Gathers dataset identifiers from the CKAN search API, compares them with
    what the repository already holds, and deletes repository records whose
    datasets no longer exist in CKAN.

    :param pycsw_config: parsed pycsw configuration (ConfigParser-like)
    :param ckan_url: base URL of the CKAN instance (with trailing slash)
    :raises RuntimeError: if the CKAN API returns an unexpected payload
    """
    database = pycsw_config.get('repository', 'database')
    table_name = pycsw_config.get('repository', 'table', 'records')

    context = pycsw.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info('Started gathering CKAN datasets identifiers: {0}'.format(
        str(datetime.datetime.now())))

    query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source,organization", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0
    gathered_records = {}

    # page through the CKAN search API, 1000 datasets at a time
    while True:
        url = ckan_url + query % start
        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            # was Python 2-only `raise RuntimeError, msg` statement form
            raise RuntimeError('Wrong API response: %s' % listing)
        results = listing.get('results')
        if not results:
            break
        for result in results:
            gathered_records[result['id']] = {
                'metadata_modified': result['metadata_modified'],
                'harvest_object_id': result['extras']['harvest_object_id'],
                'source': result['extras'].get('metadata_source'),
                'organization': result['organization'],
            }
        start = start + 1000
        log.debug('Gathered %s' % start)

    log.info('Gather finished ({0} datasets): {1}'.format(
        len(gathered_records.keys()), str(datetime.datetime.now())))

    # snapshot what the repository currently holds
    existing_records = {}
    query = repo.session.query(repo.dataset.ckan_id,
                               repo.dataset.ckan_modified)
    for row in query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]['metadata_modified'] > existing_records[key]:
            changed.add(key)

    # NOTE(review): `new` and `changed` are computed but never used in the
    # visible code — this function may be truncated; confirm upstream.
    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info('Deleted %s' % ckan_id)
            repo.session.commit()
        except Exception as err:  # was Python 2-only `except Exception, err`
            repo.session.rollback()
            raise
(ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception, err: log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) return return record # Now that we've defined the local functions, let's actually run the parent function database = pycsw_config.get('repository', 'database') table_name = pycsw_config.get('repository', 'table', 'records') context = pycsw.config.StaticContext() repo = repository.Repository(database, context, table=table_name) log.info('Started gathering CKAN datasets identifiers: {0}'.format( str(datetime.datetime.now()))) gathered_records = {} results = parse_datastore(ckan_url) package_ids = [] for result in results: package_id = parse_resource(result, ckan_url) if not package_id in package_ids: package_ids.append(package_id) for id in package_ids: api_query = 'api/3/action/package_show?id=%s' % id