def command(cls, config_ini, org_names):
    common.load_config(config_ini)
    common.register_translator()
    from ckan.plugins import toolkit
    from ckan import model
    orgs = [toolkit.get_action('organization_show')(
            data_dict={'id': org_name})
            for org_name in org_names]
    source_org, dest_org = orgs
    assert source_org
    assert dest_org
    search_results = toolkit.get_action('package_search')(
        data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
    print 'Datasets: %s' % search_results['count']
    stats = Stats()
    if len(search_results['results']) != search_results['count']:
        assert 0, 'need to implement paging'
    #context = {
    #    'user': get_script_user(__name__)['name'],
    #    'ignore_auth': True,
    #    'model': model}
    rev = model.repo.new_revision()
    rev.author = 'script-%s.py' % __file__
    for dataset in search_results['results']:
        model.Package.get(dataset['id']).owner_org = dest_org['id']
        #dataset_ = toolkit.get_action('package_patch')(
        #    context=context,
        #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
        print stats.add('Changed owner_org', dataset['name'])
    print stats.report()
    print 'Writing'
    model.Session.commit()
def run(cls, config_ini_or_ckan_url, dataset_names):
    ckan = common.get_ckanapi(config_ini_or_ckan_url)
    stats = Stats()
    for dataset_name in dataset_names:
        dataset_name = common.name_stripped_of_url(dataset_name)
        try:
            ckan.call_action('dataset_delete', {'id': dataset_name})
            print stats.add('Deleted (or was already deleted)', dataset_name)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if 'CKANAPIError' in str(e):
                print e
                print 'Not calling API correctly - aborting'
                sys.exit(1)
            print stats.add('Error %s' % type(e).__name__,
                            '%s %s' % (dataset_name, e))
def dgu_update(apikey):
    from ckanext.dgu.forms import validators
    import ckanapi
    dgu = ckanapi.RemoteCKAN('http://data.gov.uk', user_agent=__file__,
                             apikey=apikey)
    dgu_categories = dict(validators.categories)
    dgu_categories_by_title = dict(
        (title, id) for id, title in validators.categories)
    stats_category = Stats()
    stats_state = Stats()
    org_names_request = requests.get(
        'http://data.gov.uk/api/action/organization_list')
    # NB Not using all_fields as it doesn't include extras, like category
    org_names = json.loads(org_names_request.content)['result']
    opennames = nomenklatura.Dataset('public-bodies-uk')
    for org_name in org_names:
        org_request = requests.get(
            'http://data.gov.uk/api/action/organization_show?id=%s' % org_name)
        org = json.loads(org_request.content)['result']
        # convert the extras into a dict
        org['extras'] = dict(
            (extra['key'], extra['value']) for extra in org['extras'])
        try:
            entity = opennames.entity_by_name(org['title'])
        except NoMatch:
            # BTW it hasn't been added for review
            msg = 'Org not found in nomenklatura'
            print stats_category.add(msg, org_name)
            stats_state.add(msg, org_name)
            continue
        entity = entity.dereference()
        changed_org = dgu_update_category(org_name, org, entity,
                                          stats_category, dgu_categories,
                                          dgu_categories_by_title)
        if changed_org:
            # convert the extras back into a list of dicts
            org['extras'] = [{'key': key, 'value': value}
                             for key, value in org['extras'].items()]
            try:
                org = dgu.action.organization_update(**org)
            except ckanapi.errors.CKANAPIError, e:
                if '504 Gateway Time-out' in str(e):
                    print stats_category.add('Time-out writing', org_name)
                else:
                    raise
def command(cls, config_ini, dataset_names, options):
    common.load_config(config_ini)
    common.register_translator()
    from pylons import config
    apikey = config['dgu.merge_datasets.apikey']
    ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
    #ckan = ckanapi.LocalCKAN()
    if options.publisher:
        org_name = common.name_stripped_of_url(options.publisher)
        if options.search:
            results = ckan.action.package_search(
                q=options.search,
                fq='publisher:%s' % org_name,
                rows=100)
            dataset_names.extend(
                [dataset['name'] for dataset in results['results']])
        else:
            org = ckan.action.organization_show(id=org_name,
                                                include_datasets=True)
            dataset_names.extend([d['name'] for d in org['packages']])

    datasets = []
    datasets_by_name = {}

    def get_extra(dataset, key):
        for extra in dataset['extras']:
            if extra['key'] == key:
                return extra['value']

    for dataset_name in dataset_names:
        print 'Dataset: %s' % dataset_name
    for dataset_name in dataset_names:
        # strip off the url part of the dataset name, if there is one
        dataset_name = common.name_stripped_of_url(dataset_name)
        dataset = ckan.action.package_show(id=dataset_name)
        harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
        if harvest_source_ref:
            print '** Discarding dataset %s due to harvest source: %s **' \
                % (dataset_name, harvest_source_ref)
            continue
        datasets.append(dataset)
        datasets_by_name[dataset['name']] = dataset
    datasets.sort(key=lambda x: x['metadata_modified'])

    # aggregate resources
    def resource_identity(res_dict, dataset_name):
        return (res_dict.get('date'), res_dict['url'],
                res_dict.get('title') or res_dict['description'],
                res_dict.get('format'), dataset_name)

    combined_resources = {}  # identity
    res_stats = Stats()
    for dataset in datasets:
        for resource in dataset['resources']:
            identity = resource_identity(resource, dataset['name'])
            resource['dataset_name'] = dataset['name']
            if identity in combined_resources:
                print res_stats.add(
                    'Discarding duplicate',
                    '\n%s duplicate of \n%s'
                    % (resource, combined_resources[identity]))
            else:
                combined_resources[identity] = resource
    resources = combined_resources.values()

    # find dates for resources
    # NB This has been pulled out into timeseries_convert.py -
    # TODO call that instead of having the code here too.
    if options.frequency:
        url_munge_re = re.compile('(%20|-|_|\.)')

        def fields_to_hunt_for_date(res):
            date = res.get('date')
            if date:
                yield 'date', date
            title = res.get('title')
            if title:
                yield 'title', title
            yield 'description', res['description']
            yield 'url', url_munge_re.sub(' ', res['url'])
            if not options.update:
                dataset = datasets_by_name[res['dataset_name']]
                yield 'dataset-title', dataset['title']
                yield 'dataset-notes', dataset['notes']

        ensure_regexes_are_initialized()
        global regexes
        for resource in resources:
            for field_name, field_value in fields_to_hunt_for_date(resource):
                if options.frequency in ('monthly', 'quarterly',
                                         'twice annually'):
                    month, year = hunt_for_month_and_year(field_value)
                    if year and month:
                        resource['date'] = '%02d/%s' % (month, year)
                        res_stats.add('Found date in %s' % field_name,
                                      '%s %r' % (resource['date'], resource))
                        if resource.get('resource_type') == 'documentation':
                            resource['resource_type'] = 'file'
                            res_stats.add('Converted additional resource',
                                          resource)
                        break
                elif options.frequency == 'annually':
                    year = regexes['year'].search(field_value)
                    if year:
                        resource['date'] = year.groups()[0]
                        res_stats.add('Found date in %s' % field_name,
                                      '%s %r' % (resource['date'], resource))
                        if resource.get('resource_type') == 'documentation':
                            resource['resource_type'] = 'file'
                            res_stats.add('Converted additional resource',
                                          resource)
                        break
            else:
                if resource.get('resource_type') == 'documentation':
                    print res_stats.add(
                        'Could not find date but it\'s Additional Resource',
                        resource)
                    continue
                print res_stats.add('Could not find date', resource)
                continue

        print 'Resources: \n', res_stats

        resources_without_date = [
            res for res in resources
            if not res.get('date') and
            res.get('resource_type') != 'documentation']
        for i, res in enumerate(resources_without_date):
            print 'Resources without dates %s/%s' % (
                i + 1, len(resources_without_date))
            for field_name, field_value in fields_to_hunt_for_date(res):
                print ' %s: %s' % (
                    field_name, field_value.encode('latin-1', 'ignore'))
            print 'https://data.gov.uk/dataset/%s/resource/%s' % (
                res['dataset_name'], res['id'])
            date_format = {'annually': 'YYYY',
                           'monthly': 'MM/YYYY',
                           'twice annually': 'MM/YYYY',
                           'quarterly': 'MM/YYYY'}
            input_ = raw_input(
                'Date (%s) or DOCS to make it an Additional Resource: '
                % date_format[options.frequency])
            if input_.strip().lower() == 'docs':
                res['date'] = ''
                res['resource_type'] = 'documentation'
            else:
                res['date'] = input_

    resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

    # Ensure there is not a mixture of resources with and without a date
    have_dates = None
    for res in resources:
        if res.get('resource_type') == 'documentation':
            continue
        if have_dates is None:
            have_dates = bool(res.get('date'))
        else:
            has_date = bool(res.get('date'))
            if has_date != have_dates:
                print [res.get('date') for res in resources]
                print 'Cannot mix resources with dates and others without!'
                import pdb
                pdb.set_trace()

    # Remove 'dataset_name' and others fields from resources
    ignore_res_fields = set(
        ('dataset_name', 'created', 'position', 'revision_id', 'id',
         'tracking_summary', 'qa', 'archiver'))
    for res in resources:
        for field in ignore_res_fields & set(res.keys()):
            del res[field]

    # Merge dataset fields
    def get_all_fields_and_values(datasets):
        ignore_fields = set((
            'id', 'resources', 'last_major_modification', 'data_dict',
            'revision_timestamp', 'num_tags', 'metadata_created',
            'metadata_modified', 'odi_certificate',
            'extras',  # they are at top level already
            'timeseries_resources', 'individual_resources',
            'additional_resources', 'revision_id', 'organization',
            'tracking_summary', 'num_resources', 'license_title',
            'author', 'author_email', 'maintainer', 'maintainer_email',
            'temporal_granularity', 'geographic_granularity', 'state',
            'isopen', 'url', 'date_update_future', 'date_updated',
            'date_released', 'precision', 'taxonomy_url',
            'temporal_coverage-from', 'temporal_coverage-to',
            'published_via', 'creator_user_id', 'qa', 'archiver',
            ))
        first_fields = ['title', 'name', 'notes', 'theme-primary',
                        'theme-secondary']
        all_field_values = defaultdict(list)
        for dataset in datasets:
            for field in dataset:
                if field not in ignore_fields and dataset[field]:
                    all_field_values[field].append(dataset[field])
        for field in first_fields:
            yield field, all_field_values.get(field, [])
        for field in all_field_values:
            if field not in first_fields:
                yield field, all_field_values[field]

    spend_data_defaults = {
        'geographic_coverage': None,
        'theme-primary': 'Government Spending',
        'theme-secondary': None,
        'update_frequency': 'monthly',
        }
    combined_dataset = {'resources': resources}
    all_fields_and_values = get_all_fields_and_values(datasets)
    for field, values in all_fields_and_values:
        if field == 'notes':
            values = [value.strip() for value in values]
        if field == 'tags':
            # just merge them up-front and
            # dont offer user any choice
            tags_by_name = {}
            for dataset_tags in values:
                for tag in dataset_tags:
                    if tag['name'] not in tags_by_name:
                        tags_by_name[tag['name']] = tag
            values = [tags_by_name.values()]
        if field in ('codelist', 'schema'):
            # just merge them up-front
            # And convert the dict into just an id string
            ids = set()
            for dataset_values in values:
                for value_dict in dataset_values:
                    ids.add(value_dict['id'])
            values = [list(ids)]
        print '\n%s:' % field
        pprint(list(enumerate(values)))
        if options.spend and field in spend_data_defaults:
            value = spend_data_defaults[field]
            print 'Spend data defaults to: %s' % value
            values = [value] if value is not None else None
        # dont be case-sensitive for boolean fields
        if field == 'core-dataset':
            values = [v.lower() for v in values]
        try:
            values_identicle = len(set(values)) == 1
        except TypeError:
            if values and len(values):
                val1 = values[0]
                for val in values[1:]:
                    if val != val1:
                        values_identicle = False
                        break
                else:
                    values_identicle = True
        if (not values) or (not len(values)):
            pass
        elif values_identicle:
            value = values[0]
        elif field == 'name':
            while True:
                from ckan.lib.munge import munge_title_to_name
                munged_title = munge_title_to_name(combined_dataset['title'])
                print munge_title_to_name(
                    datasets[0]['organization']['title'])
                value = raw_input('Type new value (%s): ' % (munged_title))
                if not value:
                    value = munged_title
                if len(value) < 3:
                    print 'Too short'
                    continue
                if value in values:
                    print 'That name is taken'
                    continue
                existing = ckan.action.package_autocomplete(q=value)
                if value in existing:
                    print 'That name is taken on CKAN'
                    continue
                break
        else:
            while True:
                response = raw_input(
                    '%s: value (number) or type new one: ' % field)
                try:
                    value_index = int(response)
                    value = values[value_index]
                    print value
                except ValueError:
                    # fix pound signs if the user pasted from the repr'd version
                    response = re.sub(r'\\xa3', u'\xa3', response)
                    value = response
                if not value and field in ('title', 'owner_org', 'notes',
                                           'license_id'):
                    print 'You must have a value for this field!'
                    continue
                break
        if value:
            combined_dataset[field] = value

    # Store
    print '\nMerged dataset:\n'
    pprint(combined_dataset)
    response = raw_input('Press enter to write or pdb to edit in pdb first: ')
    if response == 'pdb':
        import pdb
        pdb.set_trace()
    try:
        if options.update:
            ckan.action.dataset_update(**combined_dataset)
        else:
            ckan.action.dataset_create(**combined_dataset)
    except Exception, e:
        print e
        import pdb
        pdb.set_trace()
def dgu_account(args):
    ckan = get_ckan(args.ckan)
    uploads = get_uploads()
    stats = Stats()
    publishers = {}  # by email.lower()
    for upload in uploads:
        version = datetime.datetime.strptime(upload['version'], '%d/%m/%Y')
        if version < datetime.datetime(2015, 1, 1):
            #stats.add('Ignore - before 2015',
            #          '%s %s' % (upload['version'], upload['submitter_email']))
            continue
        if '@' not in upload['submitter_email']:
            stats.add('Ignore - bad email address', upload['submitter_email'])
            continue
        if upload['submitter_email'].lower() not in publishers:
            publishers[upload['submitter_email'].lower()] = []
            stats.add('Added', upload['submitter_email'])
        else:
            stats.add('Appended', upload['submitter_email'])
        publishers[upload['submitter_email'].lower()].append(
            dict(email=upload['submitter_email'],
                 org_name=upload['org_name'],
                 version=version))
    print 'Email addresses:'
    print stats

    cache_filename = '.users.%s.cache' % (args.ckan.replace(':', '-'))
    if os.path.exists(cache_filename):
        print 'Getting users from %s' % cache_filename
        with open(cache_filename, 'rb') as f:
            users_str = f.read()
        users = json.loads(users_str)
    else:
        print 'Getting users from %s' % args.ckan
        # NB this doesn't work remotely because varnish times out,
        # so run from prod3 itself against 8080 from ~/organograms
        users = ckan.action.user_list()
        print 'Saving users to %s' % cache_filename
        users_str = json.dumps(users)
        with open(cache_filename, 'wb') as f:
            f.write(users_str)
    print '%s users' % len(users)
    users_by_email = dict([(user['email'], user) for user in users])

    def get_user(email_variants):
        for email_variant in email_variants:
            if email_variant in users_by_email:
                return users_by_email[email_variant]

    stats = Stats()
    user_table = []
    for email_lower in publishers:
        user_row = dict(email=email_lower)
        versions = (upload['version'] for upload in publishers[email_lower])
        latest_version = sorted(versions)[-1]
        user_row['source of contact'] = '%s organogram published' \
            % datetime.datetime.strftime(latest_version, '%Y-%m')
        # find the organization
        org_names_raw = set(
            (upload['org_name'] for upload in publishers[email_lower]))
        orgs = []
        for org_name_raw in org_names_raw:
            title = canonize(org_name_raw)
            match = DguOrgs.by_canonized_title().get(title) or \
                Aliases.get_from_canonized(title)
            assert match, 'No match: %s' % org_name_raw
            if isinstance(match, basestring):
                match = DguOrgs.by_title()[match]
            if match not in orgs:
                orgs.append(match)
        user_row['organization'] = ' / '.join([org['title'] for org in orgs])
        # see if they are a user on data.gov.uk
        email_variants = set(
            (upload['email'] for upload in publishers[email_lower]))
        user = get_user(email_variants)
        user_table.append(user_row)
        emails_str = '/'.join(email_variants)
        if not user:
            user_row['has dgu login'] = '******'
            print stats.add('Not registered', emails_str)
            continue
        # assume has confirmed email
        user_row['has dgu login'] = '******'
        user_row['name'] = user['fullname']
        user_row['email'] = user['email']
        # see if this user is an editor/admin for the organization
        user_permissions = []
        for org in orgs:
            editors_and_admins = (user['name'] for user in org['users'])
            if user['name'] in editors_and_admins:
                user_permissions.append('yes')
                print stats.add('Already an editor/admin',
                                '%s %s' % (emails_str, org['title']))
            else:
                user_permissions.append('no')
                admins = (user['name'] for user in org['users']
                          if user['capacity'] == 'admin')
                if admins:
                    print stats.add(
                        'Need to get permission. Admin exists',
                        '%s %s %s' % (emails_str, org['title'],
                                      ', '.join('"%s"' % a for a in admins)))
                else:
                    print stats.add('Need to get permission. No admin',
                                    '%s %s' % (emails_str, org['title']))
        user_row['editor or admin'] = ' / '.join(user_permissions)

    def extract_email(stat):
        emails = stat.split(' ')[0]  # the first word
        email = emails.split('/')[0]  # ignore variants
        return email

    print '\nFor emailing:'
    print '-------------'
    print '\nNot registered:'
    print ', '.join(stats['Not registered'])
    print '\nAlready an editor/admin:'
    print ', '.join([
        extract_email(email_and_org)
        for email_and_org in stats['Already an editor/admin']])
    print '\nNeed to get permission. Admin exists:'
    print ', '.join([
        extract_email(email_and_org)
        for email_and_org in stats['Need to get permission. Admin exists']])

    print '\nTable:'
    print '-------------'
    headers = ('name', 'email', 'organization', 'has dgu login',
               'editor or admin', 'source of contact')
    print '\t'.join(headers)
    for row in user_table:
        print '\t'.join(row.get(header, '') for header in headers)

    print '\nPermissions'
    print '-------------'
    print stats
def import_(cls, csv_filepath):
    log = global_log
    from ckan import model
    stats_category = Stats()
    pub_categories = csv.reader(open(csv_filepath, 'rb'))
    header = pub_categories.next()
    assert_equal('"%s"\n' % '","'.join(header), cls.header)
    for id, title, parent, category, spending_published_by in pub_categories:
        pub = model.Session.query(model.Group).get(id)
        if not pub:
            print stats_category.add('Publisher ID not known',
                                     '%s %s' % (id, title))
            continue
        category = category.strip()

        # set category
        existing_category = pub.extras.get('category')
        if not category and not existing_category:
            print stats_category.add('No category info - ignored', title)
            continue
        if not category and existing_category:
            print stats_category.add('Category deleted',
                                     '%s %s' % (existing_category, title))
            rev = model.repo.new_revision()
            rev.author = 'script_' + __file__
            pub.extras['category'] = None
            model.Session.commit()
            continue
        if category not in categories_dict.keys():
            print stats_category.add('Category %s not known - ignored'
                                     % category, title)
            continue
        if existing_category != category:
            print stats_category.add(
                'Changing category',
                '%s->%s %s' % (existing_category or '(none)', category, title))
            rev = model.repo.new_revision()
            rev.author = 'script_' + __file__
            pub.extras['category'] = category
            model.Session.commit()
        else:
            print stats_category.add(
                'No change',
                '%s %s' % (existing_category or '(none)', title))
            log.info('Leaving category for %r as %s', title, category)

        # set spending_published_by
        existing_spb = pub.extras.get('spending_published_by')
        if not spending_published_by:
            log.info('No spending_published_by for %r', title)
            continue
        spb_publisher = model.Group.get(spending_published_by)
        if not spb_publisher:
            spb_publisher = model.Group.search_by_name_or_title(
                spending_published_by)
        if not spb_publisher:
            warn('Spending_published_by not known %s - skipping %s %s',
                 spending_published_by, id, title)
            import pdb; pdb.set_trace()
            continue
        spending_published_by = spb_publisher.name
        if existing_spb != spending_published_by:
            log.info('Changing SPB %r %s -> %s', title,
                     existing_spb or '(none)', spending_published_by)
            model.repo.new_revision()
            pub.extras['spending_published_by'] = spending_published_by
            model.Session.commit()
        else:
            log.info('Leaving SPB for %r as %s', title, spending_published_by)
    model.Session.remove()
    print stats_category
    log.info('Warnings: %r', warnings)
def main(source, source_type, destination, save_relevant_datasets_json,
         write, dataset_filter=None, res_url_filter=None):
    if source_type == 'json':
        all_datasets = get_datasets_from_json(source)
    elif source_type == 'jsonl':
        all_datasets = get_datasets_from_jsonl(source)
    else:
        all_datasets = get_datasets_from_ckan(source)
    datasets = []  # legacy ones
    revamped_datasets = []  # ones created on 3rd October 2016 launch
    revamped_datasets_by_org = {}
    revamped_resources = {}
    csv_out_rows = []
    csv_corrected_rows = []
    try:
        # find all the legacy organogram datasets
        all_datasets = list(all_datasets)  # since we need to iterate it twice
        for dataset in all_datasets:
            if dataset_filter and dataset['name'] != dataset_filter:
                continue
            if res_url_filter and \
                    res_url_filter not in [r['url']
                                           for r in dataset['resources']]:
                continue
            # check it is an organogram dataset
            dataset_str = repr(dataset).lower()
            if 'rganog' not in dataset_str \
                    and 'roles and salaries' not in dataset_str \
                    and 'pay and post' not in dataset_str \
                    and 'posts and pay' not in dataset_str \
                    and 'organisation chart' not in dataset_str \
                    and 'organization chart' not in dataset_str \
                    and 'org chart' not in dataset_str:
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if dataset['name'] in (
                    'eastbourne-borough-council-public-toilets',
                    'staff-organograms-and-pay-government-offices',
                    ) \
                    or dataset['id'] in (
                    '47f69ebb-9939-419f-880d-1b976676cb0e',
                    ):
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if asbool(dataset.get('unpublished')):
                stats_datasets.add('Ignored - unpublished', dataset['name'])
                continue
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') == 'organograms_v2':
                continue
            if extras.get('import_source') == 'harvest':
                stats_datasets.add('Ignored - harvested so can\'t edit it',
                                   dataset['name'])
                continue
            # legacy dataset
            datasets.append(dataset)

        # find the revamped organogram datasets
        for dataset in all_datasets:
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') != 'organograms_v2':
                continue
            org_id = dataset['owner_org']
            revamped_datasets.append(dataset)
            assert org_id not in revamped_datasets_by_org, org_id
            revamped_datasets_by_org[org_id] = dataset
            for res in dataset['resources']:
                date = date_to_year_month(res['date'])
                revamped_resources[(org_id, date)] = res
            continue

        if save_relevant_datasets_json:
            filename = 'datasets_organograms.json'
            if not (dataset_filter or res_url_filter):
                output = json.dumps(
                    datasets + revamped_datasets,
                    indent=4, separators=(',', ': '))  # pretty print
                with open(filename, 'wb') as f:
                    f.write(output)
                print 'Written %s' % filename
            else:
                print 'Not written %s because you filtered by a ' \
                    'dataset/resource' % filename

        all_resource_ids_to_delete = defaultdict(list)  # dataset_name: res_id_list
        dataset_names_to_delete = set()
        for dataset in datasets:
            org_id = dataset['owner_org']
            # save csv as it has been
            save_csv_rows(csv_out_rows, dataset, None, None)
            original_dataset = copy.deepcopy(dataset)
            delete_dataset = False
            dataset_to_merge_to = \
                get_dataset_to_merge_to(dataset, revamped_datasets_by_org)
            # detect dates
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                stats = timeseries_convert.add_date_to_resource(
                    res, dataset=dataset)
            # resource corrections
            resources_to_delete = []
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                resource_corrections(res, dataset, extras,
                                     revamped_resources,
                                     revamped_datasets_by_org,
                                     dataset_to_merge_to, org_id,
                                     resources_to_delete, stats_res)
            for res in resources_to_delete:
                dataset['resources'].remove(res)
            if not dataset['resources']:
                delete_dataset = True
            elif resources_to_delete and not dataset_to_merge_to:
                all_resource_ids_to_delete[dataset['name']].extend(
                    res['id'] for res in resources_to_delete)
            org_id = dataset['owner_org']  # it might have changed
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                if res.get('resource_type') != 'documentation' and \
                        not res.get('date'):
                    stats_dates.add('Missing date', dataset['name'])
                    break
            else:
                stats_dates.add('Ok dates', dataset['name'])
            # record changes
            if delete_dataset:
                stats_datasets.add('Delete dataset - no resources',
                                   dataset['name'])
                dataset_names_to_delete.add(dataset['name'])
                continue
            elif original_dataset != dataset:
                stats_datasets.add('Updated dataset', dataset['name'])
                has_changed = True
            else:
                stats_datasets.add('Unchanged dataset', dataset['name'])
                has_changed = False
            if dataset_to_merge_to:
                stats_merge.add('Merge', dataset_to_merge_to)
            else:
                stats_merge.add('No merge', dataset['name'])
            # save csv with corrections
            save_csv_rows(csv_corrected_rows, dataset, has_changed,
                          dataset_to_merge_to)
    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()

    stats_merge.report_value_limit = 500
    stats_res.report_value_limit = 500
    print '\nDatasets\n', stats_datasets
    print '\nDataset merges\n', stats_merge
    print '\nDates\n', stats_dates
    print '\nResources\n', stats_res

    # save csvs
    if dataset_filter or res_url_filter:
        for row in csv_corrected_rows:
            if res_url_filter and row['res_url'] != res_url_filter:
                continue
            pprint(row)
        print 'Not written csv because you specified a particular dataset'
    else:
        headers = [
            'name', 'org_title', 'org_id', 'notes',
            'res_id', 'res_name', 'res_url', 'res_format', 'res_date',
            'res_type', 'has_changed', 'merge_to_dataset',
            ]
        for csv_rows, out_filename in (
                (csv_out_rows, 'organogram_legacy_datasets.csv'),
                (csv_corrected_rows,
                 'organogram_legacy_datasets_corrected.csv'),
                ):
            with open(out_filename, 'wb') as csv_write_file:
                csv_writer = unicodecsv.DictWriter(csv_write_file,
                                                   fieldnames=headers,
                                                   encoding='utf-8')
                csv_writer.writeheader()
                for row in sorted(csv_rows, key=lambda r: r['res_url']):
                    csv_writer.writerow(row)
            print 'Written', out_filename

    # group merges by the revamped_dataset
    resources_to_merge = defaultdict(list)  # revamped_dataset_name: resource_list
    resources_to_update = defaultdict(list)  # dataset_name: resource_list
    for row in csv_corrected_rows:
        if row['has_changed'] is False:
            continue
        res = dict(
            id=row['res_id'],
            description=row['res_name'],  # description is required
            url=row['res_url'],
            format=row['res_format'],
            date=row['res_date'],
            resource_type=row['res_type'])
        if row['merge_to_dataset']:
            res['id'] = None  # ignore the id
            resources_to_merge[row['merge_to_dataset']].append(res)
            # also delete the merged dataset
            if row['name'] not in dataset_names_to_delete:
                dataset_names_to_delete.add(row['name'])
        else:
            resources_to_update[row['name']].append(res)

    # write changes - merges etc
    try:
        if destination:
            if write:
                write_caveat = ''
            else:
                write_caveat = ' (NOP without --write)'
            print 'Writing changes to datasets' + write_caveat
            stats_write_res = Stats()
            stats_write_dataset = Stats()
            ckan = common.get_ckanapi(destination)
            import ckanapi

            print 'Updating datasets'
            for dataset_name, res_list in resources_to_update.iteritems():
                dataset = ckan.action.package_show(id=dataset_name)
                resources_by_id = dict((r['id'], r)
                                       for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (dataset_name, res_list.index(res))
                    res_to_update = resources_by_id.get(res['id'])
                    if res_to_update:
                        res_changed = False
                        for key in res.keys():
                            if res[key] != res_to_update.get(key):
                                res_to_update[key] = res[key]
                                dataset_changed = True
                                res_changed = True
                        if res_changed:
                            stats_write_res.add('update - ok' + write_caveat,
                                                res_ref)
                        else:
                            stats_write_res.add('update - not needed',
                                                res_ref)
                    else:
                        stats_write_res.add(
                            'update - could not find resource id',
                            dataset_name)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Update done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Update not needed', dataset_name)

            print 'Merging datasets'
            for revamped_dataset_name, res_list in \
                    resources_to_merge.iteritems():
                try:
                    dataset = ckan.action.package_show(
                        id=revamped_dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Merge - dataset not found',
                                            revamped_dataset_name)
                    continue
                existing_res_urls = set(r['url']
                                        for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (revamped_dataset_name,
                                         res_list.index(res))
                    if res['url'] in existing_res_urls:
                        stats_write_res.add(
                            'merge - no change - resource URL already there',
                            res_ref)
                    else:
                        dataset_changed = True
                        res['description'] += ' (from legacy dataset)'
                        dataset['resources'].append(res)
                        stats_write_res.add('merge - add' + write_caveat,
                                            res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Merge done' + write_caveat,
                                            revamped_dataset_name)
                else:
                    stats_write_dataset.add('Merge not needed',
                                            revamped_dataset_name)

            print 'Deleting resources'
            for dataset_name, res_id_list in \
                    all_resource_ids_to_delete.iteritems():
                if dataset_name in dataset_names_to_delete:
                    stats_write_dataset.add(
                        'Delete resources not needed as deleting dataset later',
                        dataset_name)
                    continue
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete res - dataset not found',
                                            dataset_name)
                    continue
                existing_resources = \
                    dict((r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res_id in res_id_list:
                    res_ref = '%s-%s' % (dataset_name,
                                         res_id_list.index(res_id))
                    existing_resource = existing_resources.get(res_id)
                    if existing_resource:
                        dataset_changed = True
                        dataset['resources'].remove(existing_resource)
                        stats_write_res.add(
                            'delete res - done' + write_caveat, res_ref)
                    else:
                        stats_write_res.add(
                            'delete res - could not find res id', res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Delete res done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Delete res not needed',
                                            dataset_name)

            print 'Deleting datasets'
            for dataset_name in dataset_names_to_delete:
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete dataset - not found',
                                            dataset_name)
                else:
                    if write:
                        ckan.action.package_delete(id=dataset_name)
                    stats_write_dataset.add(
                        'Delete dataset - done' + write_caveat, dataset_name)

            print '\nResources\n', stats_write_res
            print '\nDatasets\n', stats_write_dataset
        else:
            print 'Not written changes to datasets'
    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()
                pkg['tags'] = newtags
                if options.write:
                    try:
                        self.ckan.action.package_update(**pkg)
                        ds_stats.add('Dataset updated', pkg['name'])
                    except ValidationError, ve:
                        print ds_stats.add('Validation error on update',
                                           pkg['name'])
                        print ve
                    except IntegrityError:
                        print ds_stats.add('Integrity error on update',
                                           pkg['name'])
                else:
                    ds_stats.add('Dataset would be updated', pkg['name'])
            else:
                ds_stats.add('No change', pkg['name'])
        print '\nResources:\n', res_stats.report(show_time_taken=True)
        print '\nDatasets:\n', ds_stats.report(show_time_taken=True)


usage = __doc__ + '''
Usage:
    python set_missing_resource_formats.py <CKAN config.ini or URL> [-d DATASET_NAME] [-o ORGANISATION_NAME] -w'''

if __name__ == '__main__':
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-o', '--organization', dest='organization')
    parser.add_option("-w", "--write", action="store_true", dest="write",
                dataset['tags'] = newtags
                if options.write:
                    try:
                        self.ckan.action.package_update(**dataset)
                        print stats.add('Dataset updated', dataset['name'])
                    except ValidationError, ve:
                        print stats.add('Validation error on update',
                                        dataset['name'])
                        print ve
                    except IntegrityError:
                        print stats.add('Integrity error on update',
                                        dataset['name'])
                else:
                    stats.add('Dataset would be updated', dataset['name'])
            else:
                stats.add('No change', dataset['name'])
        print '\nDatasets:\n', stats.report(show_time_taken=True)

    def get_tidied_dataset(self, dataset):
        is_dataset_updated = False
        license_id = dataset['license_id'] or ''
        extras = dict((extra['key'], extra['value'])
                      for extra in dataset['extras'])
        licence = extras.get('licence') or ''
        if licence:
            # INSPIRE datasets are a python list repr'd
            # ast.literal_eval() is safer than eval()
            try:
                licence_bits = ast.literal_eval(licence) or []
                is_dataset_updated = True
def command(cls, config_ini, options, submissions_csv_filepath):
    # Incentive CSV. Columns:
    # applicationnumber, applicationdate, jobrole, laname, officerauthorised,
    # theme, responsedate, acceptancestatus, odicertificateurl, dguurl,
    # inventoryurl, localcodes, dataseturl, schemaurl, guidanceurl,
    # frequencyofpublishing, foinumberest, submissioncomplete, lastlaupdate,
    # techreviewstatus, lasttechupdate, adminreviewstatus, paymentamount,
    # closed, lastadminupdate, applicantnotes, administrationnotes,
    # technicalnotes, lastupdated
    with open(submissions_csv_filepath, 'rb') as f:
        csv = UnicodeCsvReader(f, encoding='iso-8859-1')
        header = csv.next()
        header = [col_name.strip().lower().replace(' ', '_')
                  for col_name in header]
        Submission = namedtuple('Submission', header)
        submissions = [Submission(*row) for row in csv]

    if config_ini:
        # this is only for when running from the command-line
        #print 'Loading CKAN config...'
        common.load_config(config_ini)
        common.register_translator()
        #print '...done'

    from ckan import model
    from ckan.plugins import toolkit
    from ckanext.dgu.lib import helpers as dgu_helpers
    from ckanext.dgu.model.schema_codelist import Schema

    log = __import__('logging').getLogger(__name__)

    # Match the organizations in the submissions
    lga_orgs_by_dgu_org_name = {}
    accepted_submission_dgu_orgs = set()
    for submission in submissions:
        la_title = la_map.get(submission.laname, submission.laname)
        org = model.Session.query(model.Group) \
            .filter_by(title=la_title) \
            .first()
        assert org, 'Submission org title not found: %r' % la_title
        lga_orgs_by_dgu_org_name[org.name] = submission.laname
        if submission.acceptancestatus == 'Accepted':
            accepted_submission_dgu_orgs.add(org.name)

    stats = Stats()
    stats_incentive = Stats()
    results = []

    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__

    # Iterate over organizations
    if options.dataset:
        dataset = toolkit.get_action('package_show')(
            data_dict={'id': options.dataset})
        org_names = [dataset['organization']['name']]
    elif options.organization:
        org_names = [options.organization]
    elif options.incentive_only:
        org_names = sorted(accepted_submission_dgu_orgs)
    else:
        org_names = dgu_helpers.all_la_org_names()
    #print '%s organizations' % len(org_names)
    for org_name in org_names:
        org_title = model.Group.by_name(org_name).title
        lga_org = lga_orgs_by_dgu_org_name.get(org_name)

        # Iterate over the schemas
        if options.schema:
            schema = all_schemas_by_dgu_name[options.schema]
            if options.incentive_only and not schema.lga_name:
                # not an incentive schema, so no results
                schemas = []
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org
                           and submission.theme == schema.lga_name
                           and submission.acceptancestatus == 'Accepted']
            else:
                schemas = [all_schemas_by_lga_name.get(options.schema,
                                                       schema)]
        elif options.incentive_only:
            schemas = [all_schemas_by_lga_name[submission.theme]
                       for submission in submissions
                       if submission.laname == lga_org
                       and submission.acceptancestatus == 'Accepted']
        else:
            schemas = all_schemas
        #print '%s schemas' % len(schemas)
        for schema in schemas:

            # Find the relevant incentive submission
            if lga_org:
                for submission in submissions:
                    if submission.laname == lga_org and \
                            submission.theme == schema.lga_name:
                        break
                else:
                    submission = None
            else:
                submission = None

            result = dict(
                org_name=org_name,
                org_title=org_title,
                org_name_lga=submission.laname if submission else '',
                schema_dgu_title=schema.dgu_schema_name,
                schema_lga=schema.lga_name,
                lga_application_number=submission.applicationnumber
                    if submission else '',
                lga_application_acceptance_status=submission.acceptancestatus
                    if submission else '',
                dataset_names=[],
                dataset_titles=[],
                dataset_schema_applied=[],
                )
            stat_id = '%s %s' % (org_name, schema.lga_name)
            if submission:
                stat_id += ' %s' % submission.applicationnumber

            def add_datasets_to_results(datasets, result):
                for dataset in datasets:
                    if dataset['name'] not in result['dataset_names']:
                        result['dataset_names'].append(dataset['name'])
                        result['dataset_titles'].append(dataset['title'])
                        schema_applied = True if schema.dgu_schema_name in \
                            [s['title'] for s in dataset.get('schema', [])] \
                            else False
                        result['dataset_schema_applied'].append(schema_applied)
                        if not schema_applied and options.write:
                            pkg = model.Package.get(dataset['name'])
                            schema_obj = Schema.by_title(
                                schema.dgu_schema_name)
                            assert schema_obj, schema.dgu_schema_name
                            try:
                                schema_ids = json.loads(
                                    pkg.extras.get('schema') or '[]')
                            except ValueError:
                                log.error(
                                    'Not valid JSON in schema field: %s %r',
                                    dataset['name'], pkg.extras.get('schema'))
                                schema_ids = []
                            schema_ids.append(schema_obj.id)
                            pkg.extras['schema'] = json.dumps(schema_ids)

            # Already a schema?
            data_dict = {'fq': 'publisher:%s ' % org_name +
                         'schema_multi:"%s"' % schema.dgu_schema_name}
            datasets = toolkit.get_action('package_search')(
                data_dict=data_dict)
            if datasets['count'] > 0:
                add_datasets_to_results(datasets['results'], result)
                stats.add('OK - Dataset with schema',
                          stat_id + ' %s' % ';'.join(result['dataset_names']))
                found_schema = True
            else:
                found_schema = False

            # Submission specifies DGU dataset
            if submission and submission.dguurl:
                match = re.match('http://data.gov.uk/dataset/(.*)',
                                 submission.dguurl)
                if match:
                    dataset_name = dataset_name_original = match.groups()[0]
                    # some have trailing /
                    dataset_name = dataset_name.strip('/')
                    # hampshire have a hash appended
                    if '#' in dataset_name:
                        dataset_name = dataset_name.split('#')[0]
                    # poole have a resource name appended
                    if '/resource' in dataset_name:
                        dataset_name = dataset_name.split('/resource')[0]
                    # manual corrections
                    if dataset_name in dataset_name_corrections:
                        dataset_name = dataset_name_corrections[dataset_name]
                    dataset = model.Package.by_name(dataset_name)
                    # salford ones added a '1'
                    if not dataset:
                        dataset = model.Package.by_name(dataset_name + '1')
                        if dataset:
                            dataset_name += '1'
                    if dataset and dataset.state == 'active':
                        dataset_dict = toolkit.get_action('package_show')(
                            data_dict={'id': dataset.id})
                        add_datasets_to_results([dataset_dict], result)
                        if dataset_name != dataset_name_original:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and with corrections it checks out',
                                stat_id + ' %s' % dataset_name)
                        else:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and it checks out',
                                stat_id + ' %s' % dataset_name)
                    elif dataset:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is deleted!',
                            '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is not found',
                            '%s %s' % (stat_id, submission.dguurl))
                else:
                    stats_incentive.add(
                        'ERROR - DGU Dataset listed BUT the URL is not the correct format',
                        '%s %s' % (stat_id, submission.dguurl))

            # Submission mentions dataset on LA site - maybe it is in DGU already?
            elif submission and submission.dataseturl:
                datasets = model.Session.query(model.Package) \
                    .join(model.ResourceGroup) \
                    .join(model.Resource) \
                    .filter(model.Resource.url==submission.dataseturl) \
                    .filter(model.Package.state=='active') \
                    .filter(model.Resource.state=='active') \
                    .all()
                dataset_dicts = [
                    toolkit.get_action('package_show')(
                        data_dict={'id': dataset.id})
                    for dataset in datasets]
                add_datasets_to_results(dataset_dicts, result)
                if len(datasets) > 1:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                        '%s %s' % (stat_id, datasets[0].name))
                elif len(datasets) == 0:
                    stats_incentive.add(
                        'No DGU Dataset and Dataset URL not found on DGU',
                        stat_id)
                else:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches DGU dataset',
                        '%s %s' % (stat_id, datasets[0].name))

            # Search for datasets in the catalogue
            datasets = cls.find_dataset_for_schema(schema=schema,
                                                   org_name=org_name)
            if datasets is None:
                if not found_schema:
                    stats.add('Search revealed none', stat_id)
            elif len(datasets) > 1:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found datasets (multiple) in search',
                              '%s %r' % (stat_id,
                                         [d['name'] for d in datasets]))
            elif datasets:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found dataset in search',
                              '%s %s' % (stat_id, datasets[0]['name']))
            else:
                if not found_schema:
                    stats.add('No dataset for submission', stat_id)

            results.append(result)

    rows_with_datasets_count = \
        len([result for result in results
             if any(result['dataset_schema_applied'])])
    rows_with_datasets_or_candidate_datasets_count = \
        len([result for result in results
             if result['dataset_schema_applied']])

    if options.print_:
        print '\n Incentive stats\n' + stats_incentive.report()
        print '\n Overall stats\n' + stats.report()

    if options.write:
        print 'Writing'
        model.Session.commit()

    return {'table': results,
            'rows_with_datasets_count': rows_with_datasets_count,
            'rows_with_datasets_or_candidate_datasets_count':
            rows_with_datasets_or_candidate_datasets_count}