def update_extents():
    """Regenerate the stored extent of every package with a 'spatial' extra.

    The value of each 'spatial' extra is decoded as JSON and passed to
    ``save_package_extent`` together with the package id. Packages whose
    value cannot be decoded are recorded as errors and skipped. The session
    is committed once after the loop, then any errors and a summary line are
    printed.
    """
    from ckan.model import PackageExtra, Session

    packages = [
        extra.package
        for extra in
        Session.query(PackageExtra).filter(PackageExtra.key == 'spatial').all()
    ]

    errors = []
    count = 0
    for package in packages:
        try:
            value = package.extras['spatial']
            log.debug('Received: %r' % value)
            geometry = json.loads(value)
        except (ValueError, TypeError) as e:
            errors.append(u'Package %s - Error decoding JSON object: %s'
                          % (package.id, six.text_type(e)))
            # No geometry was decoded for this package. Falling through
            # would save the previous package's geometry under this id (or
            # fail on an unbound name for the first package), so skip it.
            continue

        count += 1
        save_package_extent(package.id, geometry)

    Session.commit()

    if errors:
        msg = 'Errors were found:\n%s' % '\n'.join(errors)
        print(msg)

    msg = "Done. Extents generated for %i out of %i packages" % (count, len(packages))
    print(msg)
def _execute_sql(cls, script):
    """Run ``script`` on the database identified by ``cls.sqlalchemy_url``.

    An engine is created for that URL and assigned to ``Session.bind``; the
    script is then executed on the session's connection and the session is
    committed.

    :param script: the statement(s) passed to ``connection.execute``
    """
    Session.bind = create_engine(cls.sqlalchemy_url)
    Session.connection().execute(script)
    Session.commit()
def _execute_script(script_path):
    """Execute the SQL file at ``script_path`` one statement at a time.

    The file content is split on ``;``. In each fragment, ``--`` comments
    (up to the end of their line), newlines and tabs are removed, and the
    remaining text is executed on the session connection. The session is
    committed once after all fragments have run.

    :param script_path: path of the SQL script to run
    """
    conn = Session.connection()
    # Read through a context manager so the file handle is always closed.
    with open(script_path, 'r') as script_file:
        script = script_file.read()
    for cmd in script.split(';'):
        cmd = re.sub(r'--(.*)|[\n\t]', '', cmd)
        # Fragments that are empty or hold only spaces (typically whatever
        # follows the last ';') are not statements; do not send them.
        if cmd.strip():
            conn.execute(cmd)
    Session.commit()
def _execute_script(script_path):
    """Run every statement of the SQL script stored at ``script_path``.

    The script is split on ``;``; each piece has its ``--`` comments,
    newlines and tabs stripped out before being executed on the session
    connection. A single commit is issued at the end.

    :param script_path: path of the SQL script to run
    """
    conn = Session.connection()
    # The ``with`` block guarantees the file is closed after reading.
    with open(script_path, "r") as script_file:
        script = script_file.read()
    for cmd in script.split(";"):
        cmd = re.sub(r"--(.*)|[\n\t]", "", cmd)
        # Ignore pieces with nothing but spaces left (e.g. after the final
        # ';'), which would otherwise be executed as an empty statement.
        if cmd.strip():
            conn.execute(cmd)
    Session.commit()
def setup_postgis_tables():
    """Execute ``scripts/postgis.sql`` (located next to this module).

    The script is split on ``;``; ``--`` comments, newlines and tabs are
    removed from each fragment, which is then executed on the session
    connection. The session is committed once at the end.
    """
    conn = Session.connection()
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'scripts', 'postgis.sql')
    # Use a context manager so the script file is closed after reading.
    with open(script_path, 'r') as script_file:
        script = script_file.read()
    for cmd in script.split(';'):
        cmd = re.sub(r'--(.*)|[\n\t]', '', cmd)
        # Skip fragments that contain only spaces (e.g. the tail after the
        # last ';') instead of executing them as empty statements.
        if cmd.strip():
            conn.execute(cmd)
    Session.commit()
def setup(srid=None):
    """Create the ``package_extent`` table and add its ``the_geom`` column.

    :param srid: SRID passed to ``AddGeometryColumn``; any falsy value
        selects ``DEFAULT_SRID``. It is converted to a string before use.
    """
    srid_text = str(srid or DEFAULT_SRID)

    db = Session.connection()
    db.execute("CREATE TABLE package_extent(package_id text PRIMARY KEY)")
    db.execute(
        "SELECT AddGeometryColumn('package_extent','the_geom', %s, 'GEOMETRY', 2)",
        srid_text)
    Session.commit()
def delete_vocabulary(id, cascade=True):
    """
    Delete a vocabulary, by id

    :param id: vocabulary id
    :param cascade: if True, delete all tags in this vocabulary first
    """
    conn = Session.connection()
    with conn.begin():
        if cascade:
            query = delete(tag_table).where(tag_table.c.vocabulary_id == id)
            query.execute()
        query = delete(vocabulary_table).where(vocabulary_table.c.id == id)
        # Building the statement does nothing by itself: it has to be
        # executed for the vocabulary row to actually be removed.
        query.execute()
def get_local_datasets_for_portal(self, context, original_portal):
    """Return the names of active packages belonging to ``original_portal``.

    A package matches when it is in state 'active' and has a package extra
    with key 'metadata_original_portal' whose value equals
    ``original_portal``.

    :param context: not used by this method
    :param original_portal: value to match (concatenated to a log message,
        so it has to be a string)
    :returns: list of package names
    """
    log.info(">>>> Got portal: "+original_portal)
    # The returned connection is not used below; the call is kept as is.
    conn = Session.connection()

    packages = self.table('package')
    extras = self.table('package_extra')

    # Roughly: select name from package where id in
    #   (select package_id from package_extra
    #    where key = 'metadata_original_portal' and value = <portal>
    #      and package_id in (select id from package where state = 'active'))
    active_ids = select([packages.c.id]).where(packages.c.state=='active')
    portal_and_active = and_(
        extras.c.value==original_portal,
        extras.c.package_id.in_(active_ids))
    matching_ids = select([extras.c.package_id]).where(
        and_(extras.c.key=='metadata_original_portal', portal_and_active))
    names_query = select([packages.c.name]).where(
        packages.c.id.in_(matching_ids))

    rows = model.Session.execute(names_query).fetchall()
    names = [row['name'] for row in rows]
    log.info('Found %d Datasets for Portal' %len(names))
    return names
def _execute_script(script_path):
    '''Execute the SQL script at ``script_path`` statement by statement.

    The file is split on ``;``. ``--`` comments (to the end of their line),
    newlines and tabs are removed from each fragment before it is executed
    on the session connection. The session is committed once at the end.

    :param script_path: path of the SQL script to run
    '''
    conn = Session.connection()
    # A context manager makes sure the file handle is released.
    with open(script_path, u'r') as script_file:
        script = script_file.read()
    for cmd in script.split(u';'):
        cmd = re.sub(r'--(.*)|[\n\t]', u'', cmd)
        # Do not execute fragments that are empty or only spaces (such as
        # the remainder after the final ';').
        if cmd.strip():
            conn.execute(cmd)
    Session.commit()
def update_extents(self):
    """Decode the 'spatial' extra of every package that has one.

    Each value is logged and parsed as JSON. Successful decodes are counted;
    failures are collected as error messages.

    NOTE(review): nothing in this method uses ``geometry``, ``count`` or
    ``errors`` after the loop -- confirm whether a save/report step is
    missing.
    """
    from ckan.model import PackageExtra, Session

    packages = [
        extra.package
        for extra in
        Session.query(PackageExtra).filter(PackageExtra.key == 'spatial').all()
    ]

    errors = []
    count = 0
    for package in packages:
        try:
            value = package.extras['spatial']
            log.debug('Received: %r' % value)
            geometry = json.loads(value)
            count += 1
        # ``except X as e`` is valid on Python 2.6+ and on Python 3; the
        # old ``except X, e`` form is a syntax error on Python 3.
        except (ValueError, TypeError) as e:
            errors.append(u'Package %s - Error decoding JSON object: %s' % (package.id,str(e)))
def update_extents(self):
    """Parse the 'spatial' extra of every package carrying one.

    Every value is logged and decoded as JSON; decodable values increment a
    counter, undecodable ones add a message to an error list.

    NOTE(review): ``geometry``, ``count`` and ``errors`` are not consumed
    anywhere after the loop in this method -- confirm whether the saving and
    reporting steps are missing.
    """
    from ckan.model import PackageExtra, Session

    spatial_extras = Session.query(PackageExtra).filter(
        PackageExtra.key == 'spatial').all()
    packages = [extra.package for extra in spatial_extras]

    errors = []
    count = 0
    for package in packages:
        try:
            value = package.extras['spatial']
            log.debug('Received: %r' % value)
            geometry = json.loads(value)
            count += 1
        # The ``as`` form works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``except X, e`` spelling.
        except (ValueError, TypeError) as e:
            errors.append(u'Package %s - Error decoding JSON object: %s' % (package.id, str(e)))
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

            http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: True if the create or update occurred ok, 'unchanged' if the
              existing package did not need updating. When the package
              fails validation the error is saved against the harvest
              object and None is returned.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts).
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            # Munge every tag once, drop those that munge to '' and
            # de-duplicate the rest.
            munged_tags = (munge_tag(t) for t in tags)
            tags = [t for t in munged_tags if t != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(
                package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name', existing_package_dict['name'])

                new_package = p.toolkit.get_action(
                    'package_update' if package_dict_form == 'package_show'
                    else 'package_update_rest')(
                    context, package_dict)

            else:
                log.info(
                    'No changes to package with GUID %s, skipping...'
                    % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(
                    package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    # ``except X as e`` is valid on Python 2.6+ and Python 3 (the former
    # ``except X, e`` spelling is Python 2 only).
    except p.toolkit.ValidationError as e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: True if the create or update occurred ok, 'unchanged' if the
              existing package did not need updating. When the package
              fails validation the error is saved against the harvest
              object and None is returned.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts).
    '''
    try:
        # Change default schema
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            # Munge each tag a single time, discard the ones that end up
            # empty, then remove duplicates.
            munged_tags = (munge_tag(t) for t in tags)
            tags = [t for t in munged_tags if t != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                package_dict.setdefault('name', existing_package_dict['name'])
                new_package = get_action('package_update_rest')(context, package_dict)

            else:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context, package_dict)

        Session.commit()

        return True

    # The ``as`` form is accepted by Python 2.6+ and Python 3; the comma
    # form only by Python 2.
    except ValidationError as e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: True if the package was created or updated. None if the
              existing package did not need updating, or if the package
              failed validation (the error is then saved against the
              harvest object).

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts).
    '''
    try:
        # Change default schema
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            # Call munge_tag once per tag, keep the non-empty results and
            # de-duplicate them.
            munged_tags = (munge_tag(t) for t in tags)
            tags = [t for t in munged_tags if t != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                package_dict.setdefault('name', existing_package_dict['name'])
                new_package = get_action('package_update_rest')(context, package_dict)

            else:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                return

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context, package_dict)

        Session.commit()

        return True

    # ``except X as e`` parses on Python 2.6+ and Python 3, whereas
    # ``except X, e`` is Python 2 only.
    except ValidationError as e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: True if the package was created or updated. None if the
              existing package did not need updating, or if the package
              failed validation (the error is then saved against the
              harvest object).
    '''
    try:
        # Change default schema
        schema = default_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            api_version = self.config.get('api_version','2')
            #TODO: use site user when available
            user_name = self.config.get('user',u'harvest')
        else:
            api_version = '2'
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
        }

        # Munge the tags and remove duplicates
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags]
        tags = list(set(tags))
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)
            # Check modified date
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                new_package = get_action('package_update_rest')(context, package_dict)

            else:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                return

        except NotFound:
            # Package needs to be created

            # Check if name has not already been used
            package_dict['name'] = self._check_name(package_dict['name'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            new_package = get_action('package_create_rest')(context, package_dict)
            harvest_object.package_id = new_package['id']

        # Flag the other objects linking to this package as not current anymore
        from ckanext.harvest.model import harvest_object_table
        conn = Session.connection()
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
            .values(current=False)
        conn.execute(u, b_package_id=new_package['id'])
        Session.commit()

        # Flag this as the current harvest object
        harvest_object.package_id = new_package['id']
        harvest_object.current = True
        harvest_object.save()

        return True

    # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
    # spelling is rejected by Python 3.
    except ValidationError as e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: True if the package was created or updated. None if the
              existing package did not need updating, or if the package
              failed validation (the error is then saved against the
              harvest object).
    '''
    try:
        # Change default schema
        schema = default_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            api_version = self.config.get('api_version', '2')
            #TODO: use site user when available
            user_name = self.config.get('user', u'harvest')
        else:
            api_version = '2'
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
        }

        # Munge the tags and remove duplicates
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags]
        tags = list(set(tags))
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context,
                                                               data_dict)
            # Check modified date
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                new_package = get_action('package_update_rest')(
                    context, package_dict)

            else:
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                return

        except NotFound:
            # Package needs to be created

            # Check if name has not already been used
            package_dict['name'] = self._check_name(package_dict['name'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)
            new_package = get_action('package_create_rest')(context,
                                                            package_dict)
            harvest_object.package_id = new_package['id']

        # Flag the other objects linking to this package as not current anymore
        from ckanext.harvest.model import harvest_object_table
        conn = Session.connection()
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
            .values(current=False)
        conn.execute(u, b_package_id=new_package['id'])
        Session.commit()

        # Flag this as the current harvest object
        harvest_object.package_id = new_package['id']
        harvest_object.current = True
        harvest_object.save()

        return True

    # The ``as`` spelling is accepted by Python 2.6+ and Python 3; the
    # comma spelling only by Python 2.
    except ValidationError as e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

            http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    When an existing package is updated, its resources are synchronised
    one by one (patched, created, or deleted if they were harvested and
    are no longer in the remote list) and the package itself is patched
    without its 'resources' key.

    :returns: True if the create or update occurred ok, 'unchanged' if the
              existing package did not need updating, None if an error
              occurred (the error is saved against the harvest object).

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts).
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, six.text_type]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        def _resource_context(resource_id):
            # Fresh action context for a single resource-level call.
            return {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'id': resource_id,
                'ignore_auth': True,
            }

        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            package_dict['tags'] = self._clean_tags(tags)

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(
                package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date. The last two conditions are hard-coded:
            # the dataset with this name or this id is always updated.
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified') or \
               package_dict['name'] == "status-of-covid-19-cases-in-ontario-by-public-health-unit-phu" or \
               package_dict['id'] == 'ecb75ea0-8b72-4f46-a14a-9bd54841d6ab':
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])

                # What we want to do here is:
                # - not overwrite maintainer name, maintainer email or
                #   maintainer branch with blank information
                # - not include resources in the package patch, because
                #   that would overwrite the existing resources
                # - match owner_org
                # - not overwrite all keywords (just add)
                package_dict['keywords'] = {
                    "en": list(
                        set(existing_package_dict['keywords']['en'] +
                            package_dict['keywords']['en'])),
                    "fr": list(
                        set(existing_package_dict['keywords']['fr'] +
                            package_dict['keywords']['fr']))
                }
                package_dict['owner_org'] = package_dict['organization'][
                    'name']
                package_dict['harvester'] = "ontario-data-catalogue"

                if package_dict.get("maintainer_email", "") == "":
                    # The key may be absent altogether (``get`` falls back
                    # to ""), so ``del`` would raise KeyError here.
                    package_dict.pop('maintainer_email', None)

                if "maintainer_translated" in package_dict:
                    maintainer = package_dict['maintainer_translated']
                    maintainer_en = maintainer.get("en", "")
                    maintainer_fr = maintainer.get("fr", "")
                    if maintainer_en == "" and maintainer_fr == "":
                        del package_dict['maintainer_translated']
                    elif maintainer_en != "" and maintainer_fr == "":
                        # Only one language given: copy it to the other.
                        maintainer['fr'] = maintainer['en']
                    elif maintainer_en == "" and maintainer_fr != "":
                        maintainer['en'] = maintainer['fr']

                if "maintainer_branch" in package_dict:
                    branch = package_dict['maintainer_branch']
                    if branch.get("en", "") == "" and \
                       branch.get("fr", "") == "":
                        del package_dict['maintainer_branch']

                if 'resources' in package_dict:
                    # Build the lookup once rather than once per resource.
                    existing_resource_ids = set(
                        r["id"] for r in existing_package_dict["resources"])
                    for resource in package_dict['resources']:
                        resource.update({"harvested_resource": True})
                        action_name = (
                            "resource_patch"
                            if resource['id'] in existing_resource_ids
                            else "resource_create")
                        p.toolkit.get_action(action_name)(
                            _resource_context(resource['id']), resource)

                    remote_resource_ids = set(
                        r["id"] for r in package_dict["resources"])
                    for resource in existing_package_dict["resources"]:
                        # ``.get``: a local resource without the flag is
                        # simply not a harvested one.
                        if not resource.get("harvested_resource") == True:
                            continue
                        # if there's a harvested resource locally that isn't
                        # in the latest harvested list of resources, delete it
                        if resource['id'] not in remote_resource_ids:
                            p.toolkit.get_action("resource_delete")(
                                _resource_context(resource['id']),
                                {'id': resource['id']})

                    del package_dict['resources']

                new_package = p.toolkit.get_action("package_patch")(
                    context, package_dict)

            else:
                log.info(
                    'No changes to package with GUID %s, skipping...'
                    % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table)\
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(
                    package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            package_dict['owner_org'] = package_dict['organization'][
                'name']
            package_dict['harvester'] = "ontario-data-catalogue"
            # A package without a 'resources' key has nothing to flag.
            for resource in package_dict.get('resources', []):
                resource.update({"harvested_resource": True})

            if package_dict.get("maintainer_email", "") == "":
                package_dict['maintainer_email'] = "*****@*****.**"

            # NOTE(review): the final ``else`` is read as belonging to the
            # outer ``if`` (key absent) -- confirm against the original
            # layout.
            if "maintainer_translated" in package_dict:
                maintainer = package_dict['maintainer_translated']
                maintainer_en = maintainer.get("en", "")
                maintainer_fr = maintainer.get("fr", "")
                if maintainer_en == "" and maintainer_fr == "":
                    package_dict['maintainer_translated'] = {
                        "en": "Open Data",
                        "fr": "Données ouvertes"
                    }
                elif maintainer_en != "" and maintainer_fr == "":
                    maintainer['fr'] = maintainer['en']
                elif maintainer_en == "" and maintainer_fr != "":
                    maintainer['en'] = maintainer['fr']
            else:
                package_dict['maintainer_translated'] = {
                    "en": "Open Data",
                    "fr": "Données ouvertes"
                }

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError as e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict), harvest_object, 'Import')
    except Exception as e:
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')

    return None
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :param package_dict: REST-style package dictionary. Its 'tags' entry
        is replaced in place by the de-duplicated, munged tags, and
        'name' is regenerated from 'title' when the package is created.
    :param harvest_object: harvest object being imported; it is flagged
        as current and linked to the package on create/update.
    :returns: True if the package was created or updated and the session
        committed; None if the existing package did not need updating or
        if a validation error was saved against the harvest object.
    :raises ValueError: if the configured 'api_version' is not an integer.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    log.debug('_create_or_update_package')
    try:
        # Change default schema
        schema = default_create_package_schema()
        # NOTE(review): 'unicode' is the Python 2 text type; on Python 3
        # this name has to be provided elsewhere in the module -- confirm.
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
            # TODO: use site user when available
            user_name = self.config.get('user', u'harvest')
        else:
            api_version = 2
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Munge every tag and drop duplicates
        munged_tags = set(munge_tag(t) for t in package_dict.get('tags', []))
        package_dict['tags'] = list(munged_tags)

        # Check if package exists
        data_dict = {'id': package_dict['id']}
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)

            # Check modified date: update when the remote date is unknown
            # or newer than the one stored locally
            needs_update = (
                'metadata_modified' not in package_dict or
                package_dict['metadata_modified'] >
                existing_package_dict.get('metadata_modified'))

            if not needs_update:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                return None

            log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
            # Update package
            context.update({'id': package_dict['id']})
            new_package = get_action('package_update_rest')(context, package_dict)

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Check if name has not already been used
            package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except ValidationError as e:
        # Record the validation failure against the harvest object instead
        # of propagating it to the caller.
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')

    return None
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :param package_dict: package dictionary in the form named by
        package_dict_form. It is modified in place ('name', and 'tags'
        when the 'clean_tags' config option is set).
    :param harvest_object: harvest object being imported; it is flagged
        as current and linked to the package on create/update.
    :param package_dict_form: 'rest' (default) or 'package_show'; selects
        between the *_rest and the plain package_create/package_update
        actions.
    :returns: True if the create or update occurred ok, 'unchanged' if it
        didn't need updating, or None if a validation error was saved
        against the harvest object.
    :raises ValueError: if the configured 'api_version' is not an integer.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    assert package_dict_form in ('rest', 'package_show')
    use_action_api = package_dict_form == 'package_show'
    try:
        # Change default schema
        schema = default_create_package_schema()
        # NOTE(review): 'unicode' is the Python 2 text type; on Python 3
        # this name has to be provided elsewhere in the module -- confirm.
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        if self.config and self.config.get('clean_tags', False):
            # Munge each tag once, drop the ones that munge to an empty
            # string and remove duplicates
            munged_tags = (munge_tag(t) for t in package_dict.get('tags', []))
            package_dict['tags'] = list(set(t for t in munged_tags if t != ''))

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when the remote date is unknown
            # or newer than the one stored locally
            needs_update = (
                'metadata_modified' not in package_dict or
                package_dict['metadata_modified'] >
                existing_package_dict.get('metadata_modified'))

            if not needs_update:
                log.info('No changes to package with GUID %s, skipping...' % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
            # Update package ('name' was already set from the existing
            # package above)
            context.update({'id': package_dict['id']})

            update_action = 'package_update' if use_action_api else 'package_update_rest'
            new_package = p.toolkit.get_action(update_action)(context, package_dict)

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            create_action = 'package_create' if use_action_api else 'package_create_rest'
            new_package = p.toolkit.get_action(create_action)(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError as e:
        # Record the validation failure against the harvest object instead
        # of propagating it to the caller.
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')

    return None