示例#1
0
文件: admin.py 项目: eva-j/pycsw
def delete_records(context, database, table):
    """Remove every record from the repository.

    Opens a repository on ``database``/``table`` and issues a delete whose
    constraint carries an empty ``where`` clause and no bound values.
    """

    LOGGER.info('Deleting all records')

    # an empty WHERE clause with no values: nothing is filtered out
    match_all = {'where': '', 'values': []}
    repository.Repository(database, context, table=table).delete(
        constraint=match_all)
示例#2
0
def refresh_harvested_records(context, database, table, url):
    """Refresh / harvest all non-local records in repository.

    Queries the repository for records whose ``source`` is not ``"local"``
    and sends one CSW Harvest request per record to the service at ``url``.
    A failure while harvesting one record is logged and the loop moves on
    to the next record.

    :param context: object exposing ``md_core_model['mappings']``, which
        maps ``pycsw:*`` names to record attribute names
    :param database: passed through to ``repository.Repository``
    :param table: passed through to ``repository.Repository`` as ``table``
    :param url: endpoint handed to ``CatalogueServiceWeb``
    """
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # get all harvested records
    count, records = repos.query(constraint={'where': 'source != "local"'})

    if int(count) <= 0:
        # nothing was harvested, so there is nothing to refresh
        return

    LOGGER.info('Refreshing %s harvested records', count)
    csw = CatalogueServiceWeb(url)

    # the attribute names are the same for every record; resolve them once
    mappings = context.md_core_model['mappings']
    source_attr = mappings['pycsw:Source']
    schema_attr = mappings['pycsw:Schema']
    identifier_attr = mappings['pycsw:Identifier']

    for rec in records:
        source = getattr(rec, source_attr)
        schema = getattr(rec, schema_attr)
        identifier = getattr(rec, identifier_attr)

        LOGGER.info('Harvesting %s (identifier = %s) ...', source,
                    identifier)
        # TODO: find a smarter way of catching this
        if schema == 'http://www.isotc211.org/2005/gmd':
            schema = 'http://www.isotc211.org/schemas/2005/gmd/'
        try:
            csw.harvest(source, schema)
            LOGGER.info(csw.response)
        except Exception as err:
            # best effort: one failing source must not abort the others
            LOGGER.warning(err)
示例#3
0
文件: admin.py 项目: rouault/pycsw
def load_records(context,
                 database,
                 table,
                 xml_dirpath,
                 recursive=False,
                 force_update=False):
    """Load metadata records from directory of files to database.

    Collects ``*.xml`` files from ``xml_dirpath``, parses each one, and
    inserts every record produced by ``metadata.parse_record`` with source
    ``'local'``.  Files that cannot be parsed are logged and skipped.

    :param context: passed to ``repository.Repository`` and
        ``metadata.parse_record``
    :param database: passed through to ``repository.Repository``
    :param table: passed through to ``repository.Repository`` as ``table``
    :param xml_dirpath: directory to read XML files from
    :param recursive: when true, walk ``xml_dirpath`` and all of its
        subdirectories; otherwise only look at its direct children
    :param force_update: when true, a record whose insert raises
        ``RuntimeError`` is updated instead; otherwise the failure is only
        logged
    """
    repo = repository.Repository(database, context, table=table)

    if recursive:
        file_list = [os.path.join(root, mfile)
                     for root, _dirs, files in os.walk(xml_dirpath)
                     for mfile in files
                     if mfile.endswith('.xml')]
    else:
        file_list = list(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    # sorted so that files are processed in a reproducible order
    for counter, recfile in enumerate(sorted(file_list), 1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile)
        except Exception as err:
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        for rec in metadata.parse_record(context, exml, repo):
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                if not force_update:
                    LOGGER.warning('ERROR: not inserted %s', err)
                    continue
                # a failed insert is treated as "record already present"
                LOGGER.info('Record exists. Updating.')
                repo.update(rec)
                LOGGER.info('Updated')
示例#4
0
def export_records(context, database, table, xml_dirpath):
    """Export metadata records from database to directory of files.

    Queries the repository dataset, logs how many records were found, and
    makes sure the output directory exists, creating it when necessary.

    :param context: passed through to ``repository.Repository``
    :param database: passed through to ``repository.Repository``
    :param table: passed through to ``repository.Repository`` as ``table``
    :param xml_dirpath: output directory; resolved to an absolute path
    :raises RuntimeError: if the output directory cannot be created
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())

    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist.  Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            raise RuntimeError('Could not create %s %s' % (dirpath, err))
示例#5
0
def gen_sitemap(context, database, table, url, output_file):
    """Generate an XML sitemap from all records in repository.

    Builds a ``sitemap:urlset`` document with one ``sitemap:url`` /
    ``sitemap:loc`` entry per record, each pointing at a CSW
    ``GetRepositoryItem`` request on ``url``, and writes the serialized
    document to ``output_file``.

    :param context: object exposing ``namespaces`` and
        ``md_core_model['mappings']``
    :param database: passed through to ``repository.Repository``
    :param table: passed through to ``repository.Repository`` as ``table``
    :param url: base URL of the CSW service, used as the prefix of every
        ``<loc>`` value
    :param output_file: path of the sitemap file to write
    """

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # write out sitemap document
    urlset = etree.Element(util.nspath_eval('sitemap:urlset',
                                            context.namespaces),
                           nsmap=context.namespaces)

    schema_loc = util.nspath_eval('xsi:schemaLocation', context.namespaces)

    urlset.attrib[schema_loc] = \
        '%s http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd' % \
        context.namespaces['sitemap']

    # get all records
    count, records = repos.query(constraint={}, maxrecords=99999999)

    LOGGER.info('Found %s records', count)

    identifier_attr = context.md_core_model['mappings']['pycsw:Identifier']

    for rec in records:
        # keep the element under its own name: rebinding ``url`` here would
        # put the element's repr, not the service URL, into every <loc>
        url_element = etree.SubElement(
            urlset, util.nspath_eval('sitemap:url', context.namespaces))
        uri = '%s?service=CSW&version=2.0.2&request=GetRepositoryItem&id=%s' % \
            (url, getattr(rec, identifier_attr))
        etree.SubElement(url_element,
                         util.nspath_eval('sitemap:loc',
                                          context.namespaces)).text = uri

    # write to file
    LOGGER.info('Writing to %s', output_file)
    # tostring() with an explicit encoding yields encoded bytes, so the
    # file is opened in binary mode
    with open(output_file, 'wb') as ofile:
        ofile.write(
            etree.tostring(urlset,
                           pretty_print=True,
                           encoding='utf8',
                           xml_declaration=True))
示例#6
0
文件: admin.py 项目: eva-j/pycsw
def export_records(context, database, table, xml_dirpath):
    """Write every record in the repository to its own XML file.

    The output directory is created when missing.  Each record is written
    to ``<identifier>.xml`` inside it; for identifiers containing ``:``
    only the part after the last colon is used as the file name.

    Raises ``RuntimeError`` if the directory cannot be created or a file
    cannot be written.
    """
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())

    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    directory_missing = not os.path.exists(dirpath)
    if directory_missing:
        LOGGER.info('Directory %s does not exist.  Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            raise RuntimeError('Could not create %s %s' % (dirpath, err))

    for record in records.all():
        id_attr = context.md_core_model['mappings']['pycsw:Identifier']
        identifier = getattr(record, id_attr)

        LOGGER.info('Processing %s', identifier)
        looks_like_urn = identifier.find(':') >= 0
        if looks_like_urn:
            # keep only the last colon-separated segment
            LOGGER.info(' Sanitizing identifier')
            identifier = identifier.rsplit(':', 1)[-1]

        # one XML document per record
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            with open(filename, 'w') as outfile:
                outfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                outfile.write(record.xml)
        except Exception as err:
            raise RuntimeError("Error writing to %s" % filename, err)
示例#7
0
def optimize_db(context, database, table):
    """Run ``VACUUM ANALYZE`` against the repository database."""

    LOGGER.info('Optimizing database %s', database)
    repos = repository.Repository(database, context, table=table)

    connection = repos.engine.connect()
    result = connection.execute('VACUUM ANALYZE')
    # close() is called on the object returned by execute()
    result.close()
示例#8
0
def load(pycsw_config, ckan_url):
    """Compare CKAN datasets with the pycsw repository and delete stale rows.

    Pages through the CKAN search API at ``ckan_url`` collecting dataset
    ids, reads the ``ckan_id`` / ``ckan_modified`` pairs already stored in
    the repository, works out which ids are new, deleted or changed, and
    deletes the repository rows whose id is no longer present in CKAN.

    :param pycsw_config: configuration object whose ``get`` provides the
        ``repository`` section's ``database`` and ``table`` values
    :param ckan_url: base URL of the CKAN instance; the search path is
        appended to it directly
    :raises RuntimeError: if a search response is not a JSON object
    """

    database = pycsw_config.get('repository', 'database')
    # NOTE(review): this assumes the third positional argument of get() is
    # a fallback value; stdlib ConfigParser.get() does not treat it as one
    # -- confirm against the actual config type.
    table_name = pycsw_config.get('repository', 'table', 'records')

    context = pycsw.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info('Started gathering CKAN datasets identifiers: {0}'.format(
        str(datetime.datetime.now())))

    search_path = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source,organization", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0

    gathered_records = {}

    # page through the search results 1000 at a time until a page is empty
    while True:
        url = ckan_url + search_path % start

        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % listing)
        results = listing.get('results')
        if not results:
            break
        for result in results:
            gathered_records[result['id']] = {
                'metadata_modified': result['metadata_modified'],
                'harvest_object_id': result['extras']['harvest_object_id'],
                'source': result['extras'].get('metadata_source'),
                'organization': result['organization'],
            }

        start = start + 1000
        log.debug('Gathered %s' % start)

    log.info('Gather finished ({0} datasets): {1}'.format(
        len(gathered_records), str(datetime.datetime.now())))

    existing_records = {}

    id_query = repo.session.query(repo.dataset.ckan_id,
                                  repo.dataset.ckan_modified)
    for row in id_query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    # ids only in CKAN, ids only in the repository, and ids present in
    # both whose CKAN modification stamp is greater than the stored one
    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]['metadata_modified'] > existing_records[key]:
            changed.add(key)

    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info('Deleted %s' % ckan_id)
            repo.session.commit()
        except Exception:
            # undo the partial transaction, then let the error propagate
            repo.session.rollback()
            raise
示例#9
0
                      (ckan_id, err))
            return
        try:
            record = metadata.parse_record(context, xml, repo)[0]
        except Exception, err:
            log.error('Could not extract metadata from %s, Error: %s' %
                      (ckan_id, err))
            return
        return record

    # Now that we've defined the local functions, let's actually run the parent function
    database = pycsw_config.get('repository', 'database')
    table_name = pycsw_config.get('repository', 'table', 'records')

    context = pycsw.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info('Started gathering CKAN datasets identifiers: {0}'.format(
        str(datetime.datetime.now())))

    gathered_records = {}

    results = parse_datastore(ckan_url)
    package_ids = []
    for result in results:
        package_id = parse_resource(result, ckan_url)
        if not package_id in package_ids:
            package_ids.append(package_id)

    for id in package_ids:
        api_query = 'api/3/action/package_show?id=%s' % id