def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. This is useful to clean the history of
    long-running harvest sources and start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    sql = '''begin;
    update package set state = 'to_delete' where id in (
        select package_id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (
        select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (
        select id from package where state = 'to_delete');
    delete from user_object_role where id not in (
        select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (
        select id from resource_group where package_id in (
            select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from member_revision where table_id in (
        select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from package_revision where id in (
        select id from package where state = 'to_delete');
    delete from package_tag where package_id in (
        select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (
        select id from resource_group where package_id in (
            select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (
        select id from package where state = 'to_delete');
    delete from member where table_id in (
        select id from package where state = 'to_delete');
    delete from resource_group where package_id in (
        select id from package where state = 'to_delete');
    delete from package where id in (
        select id from package where state = 'to_delete');
    commit;'''.format(harvest_source_id=harvest_source_id)

    model = context['model']
    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(
        context, {'id': harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
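A hedged usage sketch (not part of the snippet above): how another extension might invoke this action through the CKAN plugins toolkit. The source id is a placeholder and the bare context relies on CKAN filling in model/session defaults.

# Hedged usage sketch; 'my-harvest-source-id' is a placeholder.
import ckan.plugins.toolkit as toolkit

def clear_harvest_source_example(source_id='my-harvest-source-id'):
    # ignore_auth skips the harvest_source_clear auth check for this sketch;
    # a real caller would normally supply a proper user in the context.
    context = {'ignore_auth': True}
    result = toolkit.get_action('harvest_source_clear')(context, {'id': source_id})
    return result['id']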
def harvest_sources_reindex(context, data_dict):
    '''
    Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')

    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
        .filter(model.Package.type == DATASET_TYPE_NAME) \
        .filter(model.Package.state == u'active') \
        .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'ignore_auth': True})
        package_dict = logic.get_action('harvest_source_show')(
            context, {'id': package.id})
        log.debug('Updating search index for harvest source {0}'.format(
            package.id))
        package_index.index_package(package_dict, defer_commit=True)
    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(
        len(packages)))
def harvest_sources_reindex(context, data_dict): """ Reindexes all harvest source datasets with the latest status """ log.info("Reindexing all harvest sources") check_access("harvest_sources_reindex", context, data_dict) model = context["model"] packages = ( model.Session.query(model.Package) .filter(model.Package.type == DATASET_TYPE_NAME) .filter(model.Package.state == u"active") .all() ) package_index = PackageSearchIndex() reindex_context = {"defer_commit": True} for package in packages: get_action("harvest_source_reindex")(reindex_context, {"id": package.id}) package_index.commit() return True
def harvest_sources_reindex(context, data_dict): """ Reindexes all harvest source datasets with the latest status """ log.info("Reindexing all harvest sources") check_access("harvest_sources_reindex", context, data_dict) model = context["model"] packages = ( model.Session.query(model.Package) .filter(model.Package.type == DATASET_TYPE_NAME) .filter(model.Package.state == u"active") .all() ) package_index = PackageSearchIndex() for package in packages: if "extras_as_string" in context: del context["extras_as_string"] context.update({"validate": False, "ignore_auth": True}) package_dict = logic.get_action("package_show")(context, {"id": package.id}) log.debug("Updating search index for harvest source {0}".format(package.id)) package_index.index_package(package_dict, defer_commit=True) package_index.commit() log.info("Updated search index for {0} harvest sources".format(len(packages)))
def harvest_sources_reindex(context, data_dict):
    '''
    Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')

    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
        .filter(model.Package.type == DATASET_TYPE_NAME) \
        .filter(model.Package.state == u'active') \
        .all()

    package_index = PackageSearchIndex()

    reindex_context = {'defer_commit': True}
    for package in packages:
        get_action('harvest_source_reindex')(
            reindex_context, {'id': package.id})

    package_index.commit()

    return True
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''
    harvest_source_id = logic.get_or_bust(data_dict, 'id')
    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}
    if package_dict.get('config'):
        config = json.loads(package_dict['config'])
        for key, value in package_dict.iteritems():
            if key not in config:
                new_dict[key] = value
    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''
    harvest_source_id = logic.get_or_bust(data_dict, 'id')
    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}

    try:
        config = json.loads(package_dict.get('config', ''))
    except ValueError:
        config = {}

    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
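A small, self-contained illustration (made-up values) of the config filtering above: any top-level key that also appears in the source's parsed 'config' JSON is dropped before the dict is sent to the search index.

# Illustration only; the package dict below is invented.
import json

package_dict = {
    'id': 'abc123',
    'name': 'example-harvest-source',
    'config': '{"default_tags": ["harvested"]}',
    'default_tags': ['harvested'],
}
config = json.loads(package_dict.get('config') or '{}')
indexed = {k: v for k, v in package_dict.items() if k not in config}
# 'default_tags' is dropped because it appears in config;
# 'id', 'name' and 'config' are kept and indexed.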
def harvest_source_clear(context, data_dict): """ Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself. This is useful to clean history of long running harvest sources to start again fresh. :param id: the id of the harvest source to clear :type id: string """ check_access("harvest_source_clear", context, data_dict) harvest_source_id = data_dict.get("id", None) source = HarvestSource.get(harvest_source_id) if not source: log.error("Harvest source %s does not exist", harvest_source_id) raise NotFound("Harvest source %s does not exist" % harvest_source_id) harvest_source_id = source.id # Clear all datasets from this source from the index harvest_source_index_clear(context, data_dict) sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object where harvest_source_id = '{harvest_source_id}'; delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}'); delete from harvest_job where source_id = '{harvest_source_id}'; delete from package_role where package_id in (select id from package where state = 'to_delete' ); delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package'; delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from resource_group_revision where package_id in (select id from package where state = 'to_delete'); delete from package_tag_revision where package_id in (select id from package where state = 'to_delete'); delete from member_revision where table_id in (select id from package where state = 'to_delete'); delete from package_extra_revision where package_id in (select id from package where state = 'to_delete'); delete from package_revision where id in (select id from package where state = 'to_delete'); delete from package_tag where package_id in (select id from package where state = 'to_delete'); delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from package_extra where package_id in (select id from package where state = 'to_delete'); delete from member where table_id in (select id from package where state = 'to_delete'); delete from resource_group where package_id in (select id from package where state = 'to_delete'); delete from package where id in (select id from package where state = 'to_delete'); commit;""".format( harvest_source_id=harvest_source_id ) model = context["model"] model.Session.execute(sql) # Refresh the index for this source to update the status object context.update({"validate": False, "ignore_auth": True}) package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id}) if package_dict: package_index = PackageSearchIndex() package_index.index_package(package_dict) return {"id": harvest_source_id}
def run_job_synchronously(self):
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)

    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source.type:
            break
    else:
        print "No harvester found to handle the job."
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for obj_id in harvest_object_ids:
            obj = HarvestObject.get(obj_id)
            obj.retry_times += 1
            obj.save()
            fetch_and_import_stages(harvester, obj)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show,
        # not the class object.
        package_index.index_package(
            get_action('package_show')(
                {'validate': False, 'ignore_auth': True},
                {'id': source.id}))
    finally:
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def _update_search_index(package_id, log):
    '''
    Tells CKAN to update its search index for a given package.
    '''
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex
    package_index = PackageSearchIndex()
    context_ = {'model': model,
                'ignore_auth': True,
                'session': model.Session,
                'use_cache': False,
                'validate': False}
    package = toolkit.get_action('package_show')(context_, {'id': package_id})
    package_index.index_package(package, defer_commit=False)
    log.info('Search indexed %s', package['name'])
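A minimal hedged usage sketch of the helper above, assuming a standard library logger; the dataset id is a placeholder.

# Usage sketch; 'some-dataset-id' is a placeholder.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

_update_search_index('some-dataset-id', log)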
def _update_search_index(package_id, log): """ Tells CKAN to update its search index for a given package. """ from ckan import model from ckan.lib.search.index import PackageSearchIndex package_index = PackageSearchIndex() context_ = {"model": model, "ignore_auth": True, "session": model.Session, "use_cache": False, "validate": False} package = toolkit.get_action("package_show")(context_, {"id": package_id}) package_index.index_package(package, defer_commit=False) log.info("Search indexed %s", package["name"])
def harvest_sources_reindex(context, data_dict):
    '''
    Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')

    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
        .filter(model.Package.type == DATASET_TYPE_NAME) \
        .filter(model.Package.state == u'active') \
        .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'validate': False, 'ignore_auth': True})
        package_dict = logic.get_action('package_show')(
            context, {'id': package.id})
        log.debug('Updating search index for harvest source {0}'.format(
            package.id))
        package_index.index_package(package_dict, defer_commit=True)
    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(
        len(packages)))
def harvest_source_reindex(context, data_dict): """Reindex a single harvest source""" harvest_source_id = logic.get_or_bust(data_dict, "id") defer_commit = context.get("defer_commit", False) if "extras_as_string" in context: del context["extras_as_string"] context.update({"ignore_auth": True}) package_dict = logic.get_action("harvest_source_show")(context, {"id": harvest_source_id}) log.debug("Updating search index for harvest source {0}".format(harvest_source_id)) # Remove configuration values new_dict = {} if package_dict.get("config"): config = json.loads(package_dict["config"]) for key, value in package_dict.iteritems(): if key not in config: new_dict[key] = value package_index = PackageSearchIndex() package_index.index_package(new_dict, defer_commit=defer_commit) return True
def import_stage(self, harvest_object):
    log = logging.getLogger(__name__ + '.import')
    log.debug('%s: Import stage for harvest object: %s',
              self.harvester_name(), harvest_object.id)

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if not harvest_object.content:
        log.error('Harvest object has no content')
        self._save_object_error('Empty content for object %s' % harvest_object.id,
                                harvest_object, 'Import')
        return False

    self._set_source_config(harvest_object.source.config)

    status = self._get_object_extra(harvest_object, 'status')

    # Get the last harvested object (if any)
    previous_object = Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True) \
        .first()

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    if status == 'delete':
        # Delete package
        p.toolkit.get_action('package_delete')(
            context, {'id': harvest_object.package_id})
        log.info('Deleted package {0} with guid {1}'.format(
            harvest_object.package_id, harvest_object.guid))
        return True

    # Flag previous object as not current anymore
    if previous_object:
        previous_object.current = False
        previous_object.add()

    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()

    # Generate GUID if not present (i.e. it's a manual import)
    if not harvest_object.guid:
        self._save_object_error('Missing GUID for object {0}'.format(harvest_object.id),
                                harvest_object, 'Import')
        return False

    # pre-check to skip resource logic in case no changes occurred remotely
    if status == 'change':
        # Check if the document has changed
        m = hashlib.md5()
        m.update(previous_object.content.encode())
        old_md5 = m.hexdigest()

        m = hashlib.md5()
        m.update(harvest_object.content.encode())
        new_md5 = m.hexdigest()

        if old_md5 == new_md5:
            # Assign the previous job id to the new object to
            # avoid losing history
            harvest_object.harvest_job_id = previous_object.job.id
            harvest_object.add()

            harvest_object.metadata_modified_date = previous_object.metadata_modified_date
            harvest_object.add()

            # Delete the previous object to avoid cluttering the object table
            previous_object.delete()

            # Reindex the corresponding package to update the reference to the
            # harvest object
            context.update({'validate': False, 'ignore_auth': True})
            try:
                package_dict = logic.get_action('package_show')(
                    context, {'id': harvest_object.package_id})
            except p.toolkit.ObjectNotFound:
                pass
            else:
                for extra in package_dict.get('extras', []):
                    if extra['key'] == 'harvest_object_id':
                        extra['value'] = harvest_object.id
                if package_dict:
                    package_index = PackageSearchIndex()
                    package_index.index_package(package_dict)

            log.info('%s document with GUID %s unchanged, skipping...',
                     self.harvester_name(), harvest_object.guid)
            model.Session.commit()
            return "unchanged"

    # Build the package dict
    package_dict, metadata = self.create_package_dict(
        harvest_object.guid, harvest_object.content)

    if not package_dict:
        log.error('No package dict returned, aborting import for object {0}'.format(
            harvest_object.id))
        return False

    package_dict['name'] = self._gen_new_name(package_dict['title'])

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        package_dict['owner_org'] = source_dataset.owner_org

    self.attach_resources(metadata, package_dict, harvest_object)

    # Create / update the package
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
        'extras_as_string': True,
        'api_version': '2',
        'return_id_only': True
    }
    if context['user'] == self._site_user['name']:
        context['ignore_auth'] = True

    # The default package schema does not like Upper case tags
    tag_schema = logic.schema.default_tags_schema()
    tag_schema['name'] = [not_empty]

    if status == 'new':
        package_schema = logic.schema.default_create_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        # We need to explicitly provide a package ID, otherwise
        # ckanext-spatial won't be able to link the extent to the package.
        package_dict['id'] = uuid.uuid4().hex
        package_schema['id'] = []

        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()

        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()

        try:
            package_id = p.toolkit.get_action('package_create')(context, package_dict)
            log.info('%s: Created new package %s with guid %s',
                     self.harvester_name(), package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error('Validation Error: %s' % str(e.error_summary),
                                    harvest_object, 'Import')
            return False

    elif status == 'change':
        # We know the internal document did change, because of the md5 hash
        # comparison done above
        package_schema = logic.schema.default_update_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        package_dict['id'] = harvest_object.package_id
        try:
            package_id = p.toolkit.get_action('package_update')(context, package_dict)
            log.info('%s updated package %s with guid %s',
                     self.harvester_name(), package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error('Validation Error: %s' % str(e.error_summary),
                                    harvest_object, 'Import')
            return False

    model.Session.commit()

    return True
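The unchanged-document shortcut above reduces to comparing md5 digests of the previous and current harvest object content; a self-contained sketch of that comparison (the documents are placeholders).

# Sketch of the md5-based change detection used in import_stage above.
import hashlib

def content_unchanged(old_content, new_content):
    old_md5 = hashlib.md5(old_content.encode()).hexdigest()
    new_md5 = hashlib.md5(new_content.encode()).hexdigest()
    return old_md5 == new_md5

# content_unchanged('<doc>a</doc>', '<doc>a</doc>')  -> True
# content_unchanged('<doc>a</doc>', '<doc>b</doc>')  -> False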
def import_stage(self, harvest_object):
    # The import stage actually creates the dataset.
    log.debug('In %s import_stage' % repr(self))

    # Get default values.
    harvester_config = self.load_config(harvest_object.source)

    # Get the metadata that we stored in the HarvestObject's content field.
    dataset = json.loads(harvest_object.content)

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    owner_org = None
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        owner_org = source_dataset.owner_org

    # Assemble basic information about the dataset.
    pkg = {
        "name": self.make_package_name(dataset["title"], harvest_object.guid, False),
        "state": "active",  # in case was previously deleted
        "owner_org": owner_org,
        "extras": [
            {"key": "source_url", "value": harvest_object.source.url},
            {"key": "source_title", "value": harvest_object.source.title},
            {"key": "source_identifier", "value": dataset["identifier"]},
            {"key": "source_hash",
             "value": self.make_upstream_content_hash(dataset, harvest_object.source)},
            {"key": "harvest_harvester_version", "value": self.HARVESTER_VERSION},
            {"key": "harvest_last_updated",
             "value": datetime.datetime.utcnow().isoformat()},
        ]
    }

    # Set default values from the harvester configuration. Do this before
    # applying values from the harvest source so that the values can be
    # overridden.
    self.set_extras(pkg, harvester_config["defaults"])

    # Set specific information about the dataset.
    self.set_dataset_info(pkg, dataset, harvester_config)

    # Set "overrides" values from the harvester configuration, overriding
    # anything found in the harvester source.
    self.set_extras(pkg, harvester_config["overrides"])

    # Try to update an existing package with the ID set in
    # harvest_object.guid. If that GUID corresponds with an existing
    # package, get its current metadata.
    try:
        existing_pkg = get_action('package_show')(self.context(), {"id": harvest_object.guid})
    except NotFound:
        existing_pkg = None

    if existing_pkg:
        # Update the existing metadata with the new information.
        # But before doing that, try to avoid replacing existing resources
        # with new resources by assigning resource IDs where they match up.
        for res in pkg.get("resources", []):
            for existing_res in existing_pkg.get("resources", []):
                if res["url"] == existing_res["url"]:
                    res["id"] = existing_res["id"]
        # preserve other fields that we're not setting, but clobber extras
        existing_pkg.update(pkg)
        pkg = existing_pkg
        log.warn('updating package %s (%s) from %s' % (
            pkg["name"], pkg["id"], harvest_object.source.url))
        pkg = get_action('package_update')(self.context(), pkg)
    else:
        # It doesn't exist yet. Create a new one.
        try:
            pkg = get_action('package_create')(self.context(), pkg)
            log.warn('created package %s (%s) from %s' % (
                pkg["name"], pkg["id"], harvest_object.source.url))
        except:
            log.error('failed to create package %s from %s' % (
                pkg["name"], harvest_object.source.url))
            raise

    # Flag the other HarvestObjects linking to this package as not current anymore
    for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
        ob.current = False
        ob.save()

    # Flag this HarvestObject as the current harvest object
    harvest_object.package_id = pkg['id']
    harvest_object.current = True
    harvest_object.save()

    # Now that the package and the harvest source are associated, re-index the
    # package so it knows it is part of the harvest source. The CKAN harvester
    # does this by creating the association before the package is saved by
    # overriding the GUID creation on a new package. That's too difficult.
    # So here we end up indexing twice.
    PackageSearchIndex().index_package(pkg)

    return True
class DatasetHarvesterBase(HarvesterBase):
    '''
    A Harvester for datasets.
    '''
    # SUBCLASSES MUST IMPLEMENT
    # HARVESTER_VERSION = "1.0"
    # def info(self):
    #     return {
    #         'name': 'harvester_base',
    #         'title': 'Base Harvester',
    #         'description': 'Abstract base class for harvesters that pull in datasets.',
    #     }

    def validate_config(self, config):
        if not config:
            return config
        config_obj = yaml.load(config)
        return config

    def context(self):
        # Reusing the dict across calls to action methods can be dangerous, so
        # create a new dict every time we need it.
        # Setting validate to False is critical for getting the harvester plugin
        # to set extra fields on the package during indexing (see ckanext/harvest/plugin.py
        # line 99, https://github.com/okfn/ckanext-harvest/blob/master/ckanext/harvest/plugin.py#L99).
        return {"user": "******", "ignore_auth": True, "validate": False}

    # SUBCLASSES MUST IMPLEMENT
    def load_remote_catalog(self, harvest_job):
        # Loads a remote data catalog. This function must return a JSON-able
        # list of dicts, each dict a dataset containing an 'identifier' field
        # with a locally unique identifier string and a 'title' field.
        raise Exception("Not implemented")

    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (like a /data.json file) for
        # a list of datasets to import.
        log.debug('In %s gather_stage (%s)' % (repr(self), harvest_job.source.url))

        # Start gathering.
        source = self.load_remote_catalog(harvest_job)
        if len(source) == 0:
            return []

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_identifier,
        # which corresponds to the remote catalog's 'identifier' field.
        # Make a mapping so we know how to update existing records.
        existing_datasets = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {"id": hobj.package_id})
            except:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "source_identifier")
            if sid:
                existing_datasets[sid] = pkg

        # Create HarvestObjects for any records in the remote catalog.
        object_ids = []
        seen_datasets = set()
        for dataset in source:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.
            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this dataset
                # in the package so we can avoid updating datasets that
                # don't look like they've changed.
                if pkg.get("state") == "active" \
                        and self.find_extra(pkg, "source_hash") == \
                        self.make_upstream_content_hash(dataset, harvest_job.source):
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the remote catalog file.
            # Use sort_keys to preserve field order so hashes of this string
            # are constant from run to run.
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                content=json.dumps(dataset, sort_keys=True))
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets:
                continue  # was just updated
            if pkg.get("state") == "deleted":
                continue  # already deleted
            pkg["state"] = "deleted"
            # try to prevent a name clash by giving it a "deleted-" name
            pkg["name"] = self.make_package_name(pkg["title"], pkg["id"], True)
            log.warn('deleting package %s (%s) because it is no longer in %s' % (
                pkg["name"], pkg["id"], harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)

        return object_ids

    def fetch_stage(self, harvest_object):
        # Nothing to do in this stage because we captured complete
        # dataset metadata from the first request to the remote catalog file.
        return True

    # SUBCLASSES MUST IMPLEMENT
    def set_dataset_info(self, pkg, dataset, dataset_defaults):
        # Sets package metadata on 'pkg' using the remote catalog's metadata
        # in 'dataset' and default values as configured in 'dataset_defaults'.
        raise Exception("Not implemented.")

    def import_stage(self, harvest_object):
        # The import stage actually creates the dataset.
        log.debug('In %s import_stage' % repr(self))

        # Get default values.
        dataset_defaults = None
        try:
            source_config = yaml.load(harvest_object.source.config)
            try:
                dataset_defaults = source_config["defaults"]
            except TypeError:
                pass
            except KeyError:
                pass
        except Exception, e:
            print e
        if not dataset_defaults:
            dataset_defaults = {}

        # Get the metadata that we stored in the HarvestObject's content field.
        h = HTMLParser.HTMLParser()
        dataset = json.loads(h.unescape(harvest_object.content))

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        owner_org = None
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            owner_org = source_dataset.owner_org

        # Assemble basic information about the dataset.
        pkg = {
            "name": self.make_package_name(dataset["title"], harvest_object.guid, False),
            "state": "active",  # in case was previously deleted
            "owner_org": owner_org,
            "extras": [
                {"key": "source_url", "value": harvest_object.source.url},
                {"key": "source_title", "value": harvest_object.source.title},
                {"key": "source_identifier", "value": dataset["identifier"]},
                {"key": "source_hash",
                 "value": self.make_upstream_content_hash(dataset, harvest_object.source)},
                {"key": "harvest_harvester_version", "value": self.HARVESTER_VERSION},
            ]
        }

        # Set specific information about the dataset.
        self.set_dataset_info(pkg, dataset, dataset_defaults)

        # Try to update an existing package with the ID set in
        # harvest_object.guid. If that GUID corresponds with an existing
        # package, get its current metadata.
        try:
            existing_pkg = get_action('package_show')(self.context(), {"id": harvest_object.guid})
        except NotFound:
            existing_pkg = None

        if existing_pkg:
            # Update the existing metadata with the new information.
            # But before doing that, try to avoid replacing existing resources
            # with new resources by assigning resource IDs where they match up.
            for res in pkg.get("resources", []):
                for existing_res in existing_pkg.get("resources", []):
                    if res["url"] == existing_res["url"]:
                        res["id"] = existing_res["id"]
            # preserve other fields that we're not setting, but clobber extras
            existing_pkg.update(pkg)
            pkg = existing_pkg
            log.warn('updating package %s (%s) from %s' % (
                pkg["name"], pkg["id"], harvest_object.source.url))
            pkg = get_action('package_update')(self.context(), pkg)
        else:
            # It doesn't exist yet. Create a new one.
            try:
                pkg = get_action('package_create')(self.context(), pkg)
                log.warn('created package %s (%s) from %s' % (
                    pkg["name"], pkg["id"], harvest_object.source.url))
            except:
                log.error('failed to create package %s from %s' % (
                    pkg["name"], harvest_object.source.url))
                raise

        # Flag the other HarvestObjects linking to this package as not current anymore
        for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
            ob.current = False
            ob.save()

        # Flag this HarvestObject as the current harvest object
        harvest_object.package_id = pkg['id']
        harvest_object.current = True
        harvest_object.save()

        # Now that the package and the harvest source are associated, re-index the
        # package so it knows it is part of the harvest source. The CKAN harvester
        # does this by creating the association before the package is saved by
        # overriding the GUID creation on a new package. That's too difficult.
        # So here we end up indexing twice.
        PackageSearchIndex().index_package(pkg)

        return True
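The class above leans on helpers such as make_upstream_content_hash and find_extra that are not shown here; the following is a hedged sketch of what they could look like, offered purely as an assumption to make the control flow easier to follow, not the project's actual implementation.

# Hypothetical helper sketches, not the real implementations.
import hashlib
import json

def make_upstream_content_hash_sketch(dataset, harvest_source):
    # sort_keys keeps the JSON serialisation stable between runs, so the
    # hash only changes when the remote record actually changes.
    payload = json.dumps(dataset, sort_keys=True) + '|' + harvest_source.url
    return hashlib.sha1(payload.encode('utf-8')).hexdigest()

def find_extra_sketch(pkg, key):
    # package_show returns extras as a list of {'key': ..., 'value': ...} dicts.
    for extra in pkg.get('extras', []):
        if extra['key'] == key:
            return extra['value']
    return None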
def get_package_search_index(self):
    if not self.package_index:
        self.package_index = PackageSearchIndex()
    return self.package_index
def harvest_jobs_run(context, data_dict): log.info("Harvest job run: %r", data_dict) check_access("harvest_jobs_run", context, data_dict) session = context["session"] source_id = data_dict.get("source_id", None) if not source_id: _make_scheduled_jobs(context, data_dict) context["return_objects"] = False # Flag finished jobs as such jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"}) if len(jobs): package_index = PackageSearchIndex() for job in jobs: if job["gather_finished"]: objects = ( session.query(HarvestObject.id) .filter(HarvestObject.harvest_job_id == job["id"]) .filter(and_((HarvestObject.state != u"COMPLETE"), (HarvestObject.state != u"ERROR"))) .order_by(HarvestObject.import_finished.desc()) ) if objects.count() == 0: job_obj = HarvestJob.get(job["id"]) job_obj.status = u"Finished" last_object = ( session.query(HarvestObject) .filter(HarvestObject.harvest_job_id == job["id"]) .filter(HarvestObject.import_finished != None) .order_by(HarvestObject.import_finished.desc()) .first() ) if last_object: job_obj.finished = last_object.import_finished job_obj.save() # Reindex the harvest source dataset so it has the latest # status if "extras_as_string" in context: del context["extras_as_string"] context.update({"validate": False, "ignore_auth": True}) package_dict = logic.get_action("package_show")(context, {"id": job_obj.source.id}) if package_dict: package_index.index_package(package_dict) # resubmit old redis tasks resubmit_jobs() # Check if there are pending harvest jobs jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"}) if len(jobs) == 0: log.info("No new harvest jobs.") raise Exception("There are no new harvesting jobs") # Send each job to the gather queue publisher = get_gather_publisher() sent_jobs = [] for job in jobs: context["detailed"] = False source = harvest_source_show(context, {"id": job["source_id"]}) if source["active"]: job_obj = HarvestJob.get(job["id"]) job_obj.status = job["status"] = u"Running" job_obj.save() publisher.send({"harvest_job_id": job["id"]}) log.info("Sent job %s to the gather queue" % job["id"]) sent_jobs.append(job) publisher.close() return sent_jobs
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def import_stage(self, harvest_object):
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
    }

    log = logging.getLogger(__name__ + '.import')
    log.debug('Import stage for harvest object: %s', harvest_object.id)

    if not harvest_object:
        log.error('No harvest object received')
        return False

    self._set_source_config(harvest_object.source.config)

    if self.force_import:
        status = 'change'
    else:
        status = self._get_object_extra(harvest_object, 'status')

    # Get the last harvested object (if any)
    previous_object = model.Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True).first()  # noqa

    if status == 'delete':
        # Delete package
        context.update({
            'ignore_auth': True,
        })
        if harvest_object.package_id:
            p.toolkit.get_action('package_delete')(
                context, {'id': harvest_object.package_id})
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))

        return True

    # Check if it is a non-ISO document
    original_document = self._get_object_extra(harvest_object, 'original_document')
    original_format = self._get_object_extra(harvest_object, 'original_format')
    if original_document and original_format:
        # DEPRECATED: use the ISpatialHarvester interface method
        self.__base_transform_to_iso_called = False
        content = self.transform_to_iso(original_document, original_format, harvest_object)
        if not self.__base_transform_to_iso_called:
            log.warn('Deprecation warning: calling transform_to_iso directly is deprecated. ' +
                     'Please use the ISpatialHarvester interface method instead.')

        for harvester in p.PluginImplementations(ISpatialHarvester):
            content = harvester.transform_to_iso(original_document, original_format, harvest_object)

        if content:
            harvest_object.content = content
        else:
            self._save_object_error('Transformation to ISO failed',
                                    harvest_object, 'Import')
            return False
    else:
        if harvest_object.content is None:
            self._save_object_error('Empty content for object {0}'.format(harvest_object.id),
                                    harvest_object, 'Import')
            return False

        # Validate ISO document
        is_valid, profile, errors = self._validate_document(
            harvest_object.content, harvest_object)
        if not is_valid:
            # If validation errors were found, import will stop unless
            # configuration per source or per instance says otherwise
            continue_import = p.toolkit.asbool(
                config.get('ckanext.spatial.harvest.continue_on_validation_errors', False)) or \
                self.source_config.get('continue_on_validation_errors')
            if not continue_import:
                return False

    # Parse ISO document
    try:
        iso_parser = ISODocument(harvest_object.content)
        iso_values = iso_parser.read_values()
    except Exception as e:
        self._save_object_error(
            'Error parsing ISO document for object {0}: {1}'.format(
                harvest_object.id, six.text_type(e)),
            harvest_object, 'Import')
        return False

    # Flag previous object as not current anymore
    if previous_object and not self.force_import:
        previous_object.current = False
        previous_object.add()

    # Update GUID with the one on the document
    iso_guid = iso_values['guid']
    if iso_guid and harvest_object.guid != iso_guid:
        # First make sure there already aren't current objects
        # with the same guid
        existing_object = model.Session.query(HarvestObject.id) \
            .filter(HarvestObject.guid == iso_guid) \
            .filter(HarvestObject.current == True).first()  # noqa
        if existing_object:
            self._save_object_error(
                'Object {0} already has this guid {1}'.format(existing_object.id, iso_guid),
                harvest_object, 'Import')
            return False

        harvest_object.guid = iso_guid
        harvest_object.add()

    # Generate GUID if not present (i.e. it's a manual import)
    if not harvest_object.guid:
        m = hashlib.md5()
        m.update(harvest_object.content.encode('utf8', 'ignore'))
        harvest_object.guid = m.hexdigest()
        harvest_object.add()

    # Get document modified date
    try:
        metadata_modified_date = dateutil.parser.parse(
            iso_values['metadata-date'], ignoretz=True)
    except ValueError:
        self._save_object_error(
            'Could not extract reference date for object {0} ({1})'.format(
                harvest_object.id, iso_values['metadata-date']),
            harvest_object, 'Import')
        return False

    harvest_object.metadata_modified_date = metadata_modified_date
    harvest_object.add()

    # Build the package dict
    package_dict = self.get_package_dict(iso_values, harvest_object)
    for harvester in p.PluginImplementations(ISpatialHarvester):
        package_dict = harvester.get_package_dict(context, {
            'package_dict': package_dict,
            'iso_values': iso_values,
            'xml_tree': iso_parser.xml_tree,
            'harvest_object': harvest_object,
        })

    if not package_dict:
        log.error('No package dict returned, aborting import for object {0}'.format(
            harvest_object.id))
        return False

    # Create / update the package
    context.update({
        'extras_as_string': True,
        'api_version': '2',
        'return_id_only': True
    })

    if self._site_user and context['user'] == self._site_user['name']:
        context['ignore_auth'] = True

    # The default package schema does not like Upper case tags
    tag_schema = logic.schema.default_tags_schema()
    tag_schema['name'] = [not_empty, six.text_type]

    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()

    if status == 'new':
        package_schema = logic.schema.default_create_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        # We need to explicitly provide a package ID, otherwise
        # ckanext-spatial won't be able to link the extent to the package.
        package_dict['id'] = six.text_type(uuid.uuid4())
        package_schema['id'] = [six.text_type]

        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()

        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()

        try:
            package_id = p.toolkit.get_action('package_create')(context, package_dict)
            log.info('Created new package %s with guid %s', package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error('Validation Error: %s' % six.text_type(e.error_summary),
                                    harvest_object, 'Import')
            return False

    elif status == 'change':
        # Check if the modified date is more recent
        if not self.force_import and previous_object and \
                harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:

            # Assign the previous job id to the new object to
            # avoid losing history
            harvest_object.harvest_job_id = previous_object.job.id
            harvest_object.add()

            # Delete the previous object to avoid cluttering the object table
            previous_object.delete()

            # Reindex the corresponding package to update the reference to the
            # harvest object
            if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False'
                    or self.source_config.get('reindex_unchanged') != 'False')
                    and harvest_object.package_id):
                context.update({'validate': False, 'ignore_auth': True})
                try:
                    package_dict = logic.get_action('package_show')(
                        context, {'id': harvest_object.package_id})
                except p.toolkit.ObjectNotFound:
                    pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

            log.info('Document with GUID %s unchanged, skipping...' % (harvest_object.guid))
        else:
            package_schema = logic.schema.default_update_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            package_dict['id'] = harvest_object.package_id
            try:
                package_id = p.toolkit.get_action('package_update')(context, package_dict)
                log.info('Updated package %s with guid %s', package_id, harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error('Validation Error: %s' % six.text_type(e.error_summary),
                                        harvest_object, 'Import')
                return False

    model.Session.commit()

    return True
def import_stage(self, harvest_object):
    # The import stage actually creates the dataset.
    log.debug('In %s import_stage' % repr(self))

    if harvest_object.content is None:
        return True

    dataset = json.loads(harvest_object.content)
    schema_version = '1.0'  # default to '1.0'
    is_collection = False
    parent_pkg_id = ''
    catalog_extras = {}
    for extra in harvest_object.extras:
        if extra.key == 'schema_version':
            schema_version = extra.value
        if extra.key == 'is_collection' and extra.value:
            is_collection = True
        if extra.key == 'collection_pkg_id' and extra.value:
            parent_pkg_id = extra.value
        if extra.key.startswith('catalog_'):
            catalog_extras[extra.key] = extra.value

    # If this dataset is part of a collection, we need to check whether the
    # parent dataset exists or not. We don't support any hierarchy in this,
    # so the check does not apply to those with is_collection set.
    if parent_pkg_id and not is_collection:
        parent_pkg = None
        try:
            parent_pkg = get_action('package_show')(self.context(), {"id": parent_pkg_id})
        except:
            pass
        if not parent_pkg:
            parent_check_message = "isPartOf identifier '%s' not found." % dataset.get('isPartOf')
            self._save_object_error(parent_check_message, harvest_object, 'Import')
            return None

    # Get default values.
    dataset_defaults = self.load_config(harvest_object.source)["defaults"]

    source_config = json.loads(harvest_object.source.config or '{}')
    validator_schema = source_config.get('validator_schema')
    if schema_version == '1.0' and validator_schema != 'non-federal':
        lowercase_conversion = True
    else:
        lowercase_conversion = False

    MAPPING = {
        "title": "title",
        "description": "notes",
        "keyword": "tags",
        "modified": "extras__modified",  # ! revision_timestamp
        "publisher": "extras__publisher",  # !owner_org
        "contactPoint": "maintainer",
        "mbox": "maintainer_email",
        "identifier": "extras__identifier",  # !id
        "accessLevel": "extras__accessLevel",
        "bureauCode": "extras__bureauCode",
        "programCode": "extras__programCode",
        "accessLevelComment": "extras__accessLevelComment",
        "license": "extras__license",  # !license_id
        "spatial": "extras__spatial",  # Geometry not valid GeoJSON, not indexing
        "temporal": "extras__temporal",
        "theme": "extras__theme",
        "dataDictionary": "extras__dataDictionary",  # !data_dict
        "dataQuality": "extras__dataQuality",
        "accrualPeriodicity": "extras__accrualPeriodicity",
        "landingPage": "extras__landingPage",
        "language": "extras__language",
        "primaryITInvestmentUII": "extras__primaryITInvestmentUII",  # !PrimaryITInvestmentUII
        "references": "extras__references",
        "issued": "extras__issued",
        "systemOfRecords": "extras__systemOfRecords",
        "accessURL": None,
        "webService": None,
        "format": None,
        "distribution": None,
    }

    MAPPING_V1_1 = {
        "title": "title",
        "description": "notes",
        "keyword": "tags",
        "modified": "extras__modified",  # ! revision_timestamp
        "publisher": "extras__publisher",  # !owner_org
        "contactPoint": {"fn": "maintainer", "hasEmail": "maintainer_email"},
        "identifier": "extras__identifier",  # !id
        "accessLevel": "extras__accessLevel",
        "bureauCode": "extras__bureauCode",
        "programCode": "extras__programCode",
        "rights": "extras__rights",
        "license": "extras__license",  # !license_id
        "spatial": "extras__spatial",  # Geometry not valid GeoJSON, not indexing
        "temporal": "extras__temporal",
        "theme": "extras__theme",
        "dataDictionary": "extras__dataDictionary",  # !data_dict
        "dataQuality": "extras__dataQuality",
        "accrualPeriodicity": "extras__accrualPeriodicity",
        "landingPage": "extras__landingPage",
        "language": "extras__language",
        "primaryITInvestmentUII": "extras__primaryITInvestmentUII",  # !PrimaryITInvestmentUII
        "references": "extras__references",
        "issued": "extras__issued",
        "systemOfRecords": "extras__systemOfRecords",
        "distribution": None,
    }

    SKIP = ["accessURL", "webService", "format", "distribution"]  # will go into pkg["resources"]
    # also skip the processed_how key, it was added to indicate how we processed the dataset.
    SKIP.append("processed_how")

    SKIP_V1_1 = ["@type", "isPartOf", "distribution"]
    SKIP_V1_1.append("processed_how")

    if lowercase_conversion:
        mapping_processed = {}
        for k, v in MAPPING.items():
            mapping_processed[k.lower()] = v

        skip_processed = [k.lower() for k in SKIP]

        dataset_processed = {'processed_how': ['lowercase']}
        for k, v in dataset.items():
            if k.lower() in mapping_processed.keys():
                dataset_processed[k.lower()] = v
            else:
                dataset_processed[k] = v

        if 'distribution' in dataset and dataset['distribution'] is not None:
            dataset_processed['distribution'] = []
            for d in dataset['distribution']:
                d_lower = {}
                for k, v in d.items():
                    if k.lower() in mapping_processed.keys():
                        d_lower[k.lower()] = v
                    else:
                        d_lower[k] = v
                dataset_processed['distribution'].append(d_lower)
    else:
        dataset_processed = dataset
        mapping_processed = MAPPING
        skip_processed = SKIP

    if schema_version == '1.1':
        mapping_processed = MAPPING_V1_1
        skip_processed = SKIP_V1_1

    validate_message = self._validate_dataset(validator_schema,
                                              schema_version, dataset_processed)
    if validate_message:
        self._save_object_error(validate_message, harvest_object, 'Import')
        return None

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    owner_org = None
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        owner_org = source_dataset.owner_org

    source_config = json.loads(harvest_object.source.config or '{}')
    group_name = source_config.get('default_groups', '')

    # Assemble basic information about the dataset.
    pkg = {
        "state": "active",  # in case was previously deleted
        "owner_org": owner_org,
        "groups": [{"name": group_name}],
        "resources": [],
        "extras": [
            {"key": "resource-type", "value": "Dataset"},
            {"key": "source_hash",
             "value": self.make_upstream_content_hash(dataset, harvest_object.source,
                                                      catalog_extras, schema_version)},
            {"key": "source_datajson_identifier", "value": True},
            {"key": "harvest_source_id", "value": harvest_object.harvest_source_id},
            {"key": "harvest_object_id", "value": harvest_object.id},
            {"key": "harvest_source_title", "value": harvest_object.source.title},
            {"key": "source_schema_version", "value": schema_version},
        ]
    }

    extras = pkg["extras"]
    unmapped = []

    for key, value in dataset_processed.iteritems():
        if key in skip_processed:
            continue
        new_key = mapping_processed.get(key)
        if not new_key:
            unmapped.append(key)
            continue

        # after schema 1.0+, we need to deal with multiple new_keys
        new_keys = []
        values = []
        if isinstance(new_key, dict):  # when schema is not 1.0
            _new_key_keys = new_key.keys()
            new_keys = new_key.values()
            values = []
            for _key in _new_key_keys:
                values.append(value.get(_key))
        else:
            new_keys.append(new_key)
            values.append(value)

        if not any(item for item in values):
            continue

        mini_dataset = dict(zip(new_keys, values))
        for mini_key, mini_value in mini_dataset.iteritems():
            if not mini_value:
                continue
            if mini_key.startswith('extras__'):
                extras.append({"key": mini_key[8:], "value": mini_value})
            else:
                pkg[mini_key] = mini_value

    # pick a fixed number of unmapped entries and put them into extras
    if unmapped:
        unmapped.sort()
        del unmapped[100:]
        for key in unmapped:
            value = dataset_processed.get(key, "")
            if value is not None:
                extras.append({"key": key, "value": value})

    # if theme is geospatial/Geospatial, we tag it in metadata_type.
    themes = self.find_extra(pkg, "theme")
    if themes and ('geospatial' in [x.lower() for x in themes]):
        extras.append({'key': 'metadata_type', 'value': 'geospatial'})

    if is_collection:
        extras.append({'key': 'collection_metadata', 'value': 'true'})
    elif parent_pkg_id:
        extras.append({'key': 'collection_package_id', 'value': parent_pkg_id})

    for k, v in catalog_extras.iteritems():
        extras.append({'key': k, 'value': v})

    # Set specific information about the dataset.
    self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)

    # Try to update an existing package with the ID set in
    # harvest_object.guid. If that GUID corresponds with an existing
    # package, get its current metadata.
    try:
        existing_pkg = get_action('package_show')(self.context(), {"id": harvest_object.guid})
    except NotFound:
        existing_pkg = None

    if existing_pkg:
        # Update the existing metadata with the new information.
        # But before doing that, try to avoid replacing existing resources
        # with new resources by assigning resource IDs where they match up.
        for res in pkg.get("resources", []):
            for existing_res in existing_pkg.get("resources", []):
                if res["url"] == existing_res["url"]:
                    res["id"] = existing_res["id"]
        pkg['groups'] = existing_pkg['groups']
        # preserve other fields that we're not setting, but clobber extras
        existing_pkg.update(pkg)
        pkg = existing_pkg
        log.warn('updating package %s (%s) from %s' % (
            pkg["name"], pkg["id"], harvest_object.source.url))
        pkg = get_action('package_update')(self.context(), pkg)
    else:
        # It doesn't exist yet. Create a new one.
        pkg['name'] = self.make_package_name(dataset_processed["title"], harvest_object.guid)
        try:
            pkg = get_action('package_create')(self.context(), pkg)
            log.warn('created package %s (%s) from %s' % (
                pkg["name"], pkg["id"], harvest_object.source.url))
        except:
            log.error('failed to create package %s from %s' % (
                pkg["name"], harvest_object.source.url))
            raise

    # Flag the other HarvestObjects linking to this package as not current anymore
    for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
        ob.current = False
        ob.save()

    # Flag this HarvestObject as the current harvest object
    harvest_object.package_id = pkg['id']
    harvest_object.current = True
    harvest_object.save()

    # Now that the package and the harvest source are associated, re-index the
    # package so it knows it is part of the harvest source. The CKAN harvester
    # does this by creating the association before the package is saved by
    # overriding the GUID creation on a new package. That's too difficult.
    # So here we end up indexing twice.
    PackageSearchIndex().index_package(pkg)

    return True
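A hedged, self-contained illustration of the mapping convention used in the import stage above: target keys prefixed with 'extras__' land in the package's extras list, plain keys become top-level package fields, and dict-valued mappings (schema 1.1) fan one source value out into several targets. The sample record is invented.

# Illustration of the 'extras__' mapping convention; values are invented.
dataset = {
    'title': 'Example dataset',
    'accessLevel': 'public',
    'contactPoint': {'fn': 'Jane Doe', 'hasEmail': 'mailto:jane@example.gov'},
}
mapping = {
    'title': 'title',
    'accessLevel': 'extras__accessLevel',
    'contactPoint': {'fn': 'maintainer', 'hasEmail': 'maintainer_email'},
}

pkg = {'extras': []}
for key, value in dataset.items():
    new_key = mapping[key]
    if isinstance(new_key, dict):
        # schema 1.1 style: one source dict maps onto several target keys
        pairs = [(target, value.get(src)) for src, target in new_key.items()]
    else:
        pairs = [(new_key, value)]
    for target, target_value in pairs:
        if target.startswith('extras__'):
            pkg['extras'].append({'key': target[len('extras__'):], 'value': target_value})
        else:
            pkg[target] = target_value

# pkg ends up with 'title', 'maintainer' and 'maintainer_email' as top-level
# fields and accessLevel stored under pkg['extras'].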