def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()
    rev = model.repo.new_revision()
    rev.author = 'fix_contact_details.py'

    for package in model.Session.query(model.Package) \
            .filter_by(state='active'):
        group = package.get_organization()
        if not group:
            stats.add('was not in a group', package.name)
            continue

        if package.extras.get('contact-name') == group.extras.get('contact-name'):
            if package_is_effected(package, group):
                if write:
                    package.extras['contact-name'] = ''
                    package.extras['contact-email'] = ''
                    package.extras['contact-phone'] = ''
                    package.extras['foi-name'] = ''
                    package.extras['foi-email'] = ''
                    package.extras['foi-web'] = ''
                    package.extras['foi-phone'] = ''
                stats.add('resetting', 'Resetting package %s' % package.name)

    print stats.report()

    if write:
        model.Session.commit()

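# The helper `package_is_effected` is defined elsewhere in this script.
# A minimal sketch of what such a check might look like, assuming it
# simply tests whether every contact/FOI extra on the package duplicates
# its organization's value (an assumption, not the confirmed helper):
CONTACT_EXTRAS = ('contact-name', 'contact-email', 'contact-phone',
                  'foi-name', 'foi-email', 'foi-web', 'foi-phone')

def package_is_effected_sketch(package, group):
    # "effected" here means the package merely repeats the group's
    # details, so blanking them lets it inherit from the organization
    return all(package.extras.get(field) == group.extras.get(field)
               for field in CONTACT_EXTRAS)
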
def command(cls, config_ini, write): common.load_config(config_ini) common.register_translator() rev = model.repo.new_revision() rev.author = "script-fix_mandate.py" for package in model.Session.query(model.Package).filter(model.Package.state == "active"): if "mandate" in package.extras: mandate = package.extras.get("mandate") try: mandate = json.loads(mandate) if isinstance(mandate, list): stats.add("Already list", package.name) elif isinstance(mandate, basestring): stats.add("Fixing JSON string", package.name) package.extras["mandate"] = json.dumps([mandate]) else: stats.add("Problem JSON", package.name) except ValueError: if mandate != "": stats.add("Fixing string", package.name) package.extras["mandate"] = json.dumps([mandate]) else: stats.add("Deleting empty string", package.name) del package.extras["mandate"] else: stats.add("No mandate field", package.name) print stats.report() if write: print "Writing" model.Session.commit()
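# For reference, the normalization above maps every legacy 'mandate'
# value to a JSON list (or deletes it). A self-contained sketch that
# mirrors the same branches, for illustration only:
import json

def normalize_mandate_sketch(mandate):
    # returns the value to store, or None when the extra should be deleted
    try:
        value = json.loads(mandate)
    except ValueError:
        # plain string such as 'http://example.com/act'
        return json.dumps([mandate]) if mandate != '' else None
    if isinstance(value, list):
        return mandate                 # already a JSON list - unchanged
    if isinstance(value, basestring):  # Python 2, as in the script
        return json.dumps([value])     # bare JSON string -> one-item list
    return mandate                     # 'Problem JSON' - left for review

assert normalize_mandate_sketch('http://example.com/act') == \
    '["http://example.com/act"]'
assert normalize_mandate_sketch('') is None
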
def command(cls, config_ini, org_names):
    common.load_config(config_ini)
    common.register_translator()
    from ckan.plugins import toolkit
    from ckan import model

    orgs = [toolkit.get_action('organization_show')(data_dict={'id': org_name})
            for org_name in org_names]
    source_org, dest_org = orgs
    assert source_org
    assert dest_org

    search_results = toolkit.get_action('package_search')(
        data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
    print 'Datasets: %s' % search_results['count']
    stats = Stats()
    if len(search_results['results']) != search_results['count']:
        assert 0, 'need to implement paging'

    #context = {
    #    'user': get_script_user(__name__)['name'],
    #    'ignore_auth': True,
    #    'model': model}
    rev = model.repo.new_revision()
    rev.author = 'script-%s.py' % __file__
    for dataset in search_results['results']:
        model.Package.get(dataset['id']).owner_org = dest_org['id']
        #dataset_ = toolkit.get_action('package_patch')(
        #    context=context,
        #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
        print stats.add('Changed owner_org', dataset['name'])
    print stats.report()
    print 'Writing'
    model.Session.commit()

def command(cls, config_ini, write, options):
    common.load_config(config_ini)
    common.register_translator()
    rev = model.repo.new_revision()
    rev.author = 'script-delete_cache_filepath.py'

    process_all = True
    if options.resource:
        cls.process_resource(model.Resource.get(options.resource))
        process_all = False
    else:
        # Get each dataset,
        counter = 0
        datasets_q = model.Session.query(model.Package) \
            .filter_by(state='active')
        rounded = int(math.ceil(datasets_q.count() / 100.0)) * 100
        for x in xrange(0, rounded, 100):
            datasets = datasets_q.offset(x).limit(100)
            updated = False
            for dataset in datasets.all():
                counter += 1
                print "Processing dataset %d\r" % counter,
                for resource in dataset.resources:
                    if cls.process_resource(resource):
                        updated = True

                for key in dataset_properties_to_make_null:
                    if getattr(dataset, key):
                        stats_dp.add('Making property null: %s' % key,
                                     dataset.name)
                        setattr(dataset, key, None)
                        updated = True
                    else:
                        stats_dp.add('Property has no value: %s' % key,
                                     dataset.name)

                for key in dataset_extras_to_remove:
                    if key in dataset.extras:
                        #stats_de.add('Removing: %s' % key, dataset.name)
                        del dataset.extras[key]
                        updated = True
                    else:
                        stats_de.add('No field to remove: %s' % key,
                                     dataset.name)

            # We will be committing 100 at a time
            if updated and write:
                print "\nCommitting changes"
                import time
                s = time.time()
                model.Session.commit()
                print "Committed in ", time.time() - s

    print 'Resource Properties:\n', stats_rp.report(show_time_taken=False)
    print 'Resource Extras:\n', stats_re.report()
    print 'Dataset Properties:\n', stats_dp.report(show_time_taken=False)
    print 'Dataset Extras:\n', stats_de.report()

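# The ceiling arithmetic above rounds the dataset count up to the next
# multiple of 100 so the offset loop still covers a partial final page;
# e.g. with 250 active datasets:
#   int(math.ceil(250 / 100.0)) * 100  ->  300
#   xrange(0, 300, 100)                ->  offsets 0, 100, 200
# so datasets 200-249 are still fetched by the last batch.
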
def get_datasets_from_ckan(domain):
    common.load_config(config_ini)
    common.register_translator()
    from pylons import config

    apikey = config['dgu.merge_datasets.apikey']
    ckan = ckanapi.RemoteCKAN('https://%s' % domain, apikey=apikey)
    datasets = ckan.action.package_search(q='organogram', rows=400)
    return datasets

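# Example use (assuming the loaded config defines dgu.merge_datasets.apikey
# and the domain serves the CKAN API). package_search returns a dict with
# a 'count' and a 'results' list:
#   datasets = get_datasets_from_ckan('data.gov.uk')
#   print '%s organogram datasets found' % datasets['count']
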
def command(cls, config_ini):
    common.load_config(config_ini)
    common.register_translator()

    from ckanext.dgu.model.feedback import Feedback

    comment_hashes = []

    headers = ["user_id", "package_id", "timestamp", "title", "comment"]
    writer = csv.DictWriter(sys.stdout, headers)

    for fb in model.Session.query(Feedback)\
            .filter(Feedback.visible == True)\
            .filter(Feedback.active == True)\
            .order_by(Feedback.created):
        if not any([fb.economic, fb.social, fb.effective, fb.linked,
                    fb.other]):
            stats.add('Missing any content', fb.id)
            continue

        user = model.User.get(fb.user_id)
        pkg = model.Package.get(fb.package_id)

        data = {
            u"timestamp": fb.created.isoformat(),
            u"package": pkg.name,
            u"item": fb
        }
        content = render_template(TEMPLATE, data)
        comment = content.replace(u'\r', u'').replace(u'\n', u'')\
            .replace(u' ', u'')

        # Check for identical comments ... we want to catch users
        # duplicating comments on the same package (by mistake, most often).
        hashkey = u'{}.{}.{}'.format(comment, fb.package_id, fb.user_id)\
            .encode('utf8', 'ignore')
        comment_hash = hashlib.md5(hashkey).hexdigest()
        if comment_hash in comment_hashes:
            stats.add('Duplicate post', fb.id)
            continue
        comment_hashes.append(comment_hash)

        row = {
            u"user_id": user.name[len("user_d"):],
            u"package_id": pkg.name,
            u"timestamp": fb.created.isoformat(),
            u"title": "Feedback on the value of this dataset ",
            u"comment": comment.encode('utf-8', 'ignore')
        }
        writer.writerow(row)
        stats.add('Processed', fb.id)

def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()

    def new_revision():
        rev = model.repo.new_revision()
        rev.author = 'script_delete_duplicate_datasets.py'
    if write:
        new_revision()

    publisher = model.Group.get(options.publisher)
    if publisher is None:
        print "Publisher could not be found"
        sys.exit(0)

    guids = defaultdict(list)
    for package in publisher.packages():
        guids[package.extras.get('guid')].append(package)

    for guid, packages in guids.items():
        if guid is None:
            for package in packages:
                stats.add('Skip package not harvested', package.name)
            continue
        if len(packages) == 1:
            stats.add('Skip guid without duplicates', guid)
            continue

        best_name = None
        for i, package in enumerate(
                sorted(packages,
                       key=lambda x: x.metadata_modified,
                       reverse=options.keep_last)):
            if (not best_name or
                    len(package.name) < len(best_name) or
                    (len(package.name) == len(best_name) and
                     package.name < best_name)):
                best_name = package.name
            if i == 0:
                kept_package = package
            else:
                stats.add('Deleting', package.name)
                package.name = package.name + '_'
                package.state = 'deleted'

        # Write the name changes, so that we can reuse the best_name.
        stats.add('Keep', '%s->%s' % (kept_package.name, best_name))
        if write:
            model.Session.commit()
            new_revision()
        kept_package.name = best_name

    if write:
        model.Session.commit()

    print stats.report()

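# The naming rule above keeps, among a guid's duplicates, the shortest
# name, breaking ties alphabetically. An equivalent one-liner, for
# illustration only:
names = ['spend-data_', 'spend-data', 'spend-data2']
assert min(names, key=lambda n: (len(n), n)) == 'spend-data'
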
def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()

    rev = model.repo.new_revision()
    rev.author = 'fix_secondary_theme.py'

    for package in model.Session.query(model.Package):
        if 'theme-secondary' in package.extras:
            stats.add('Fixing', package.name)
            secondary_theme = package.extras.get('theme-secondary')

            if secondary_theme.startswith('['):
                theme_list = ast.literal_eval(secondary_theme)
                package.extras['theme-secondary'] = json.dumps(theme_list)
            else:
                package.extras['theme-secondary'] = json.dumps(secondary_theme)

    if write:
        model.Session.commit()

    print stats.report()

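# Illustration of the two branches above: a Python-repr'd list is parsed
# with ast.literal_eval and re-serialized as JSON, while a bare string is
# dumped as-is (note: not wrapped in a list):
import ast
import json
assert json.dumps(ast.literal_eval("['Crime', 'Health']")) == \
    '["Crime", "Health"]'
assert json.dumps('Crime') == '"Crime"'
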
def command(cls, config_ini, options, submissions_csv_filepath):
    # Incentive CSV. Columns:
    # applicationnumber, applicationdate, jobrole, laname, officerauthorised,
    # theme, responsedate, acceptancestatus, odicertificateurl, dguurl,
    # inventoryurl, localcodes, dataseturl, schemaurl, guidanceurl,
    # frequencyofpublishing, foinumberest, submissioncomplete, lastlaupdate,
    # techreviewstatus, lasttechupdate, adminreviewstatus, paymentamount,
    # closed, lastadminupdate, applicantnotes, administrationnotes,
    # technicalnotes, lastupdated
    with open(submissions_csv_filepath, 'rb') as f:
        csv = UnicodeCsvReader(f, encoding='iso-8859-1')
        header = csv.next()
        header = [col_name.strip().lower().replace(' ', '_')
                  for col_name in header]
        Submission = namedtuple('Submission', header)
        submissions = [Submission(*row) for row in csv]

    if config_ini:
        # this is only for when running from the command-line
        #print 'Loading CKAN config...'
        common.load_config(config_ini)
        common.register_translator()
        #print '...done'

    from ckan import model
    from ckan.plugins import toolkit
    from ckanext.dgu.lib import helpers as dgu_helpers
    from ckanext.dgu.model.schema_codelist import Schema

    log = __import__('logging').getLogger(__name__)

    # Match the organizations in the submissions
    lga_orgs_by_dgu_org_name = {}
    accepted_submission_dgu_orgs = set()
    for submission in submissions:
        la_title = la_map.get(submission.laname, submission.laname)
        org = model.Session.query(model.Group) \
                   .filter_by(title=la_title) \
                   .first()
        assert org, 'Submission org title not found: %r' % la_title
        lga_orgs_by_dgu_org_name[org.name] = submission.laname
        if submission.acceptancestatus == 'Accepted':
            accepted_submission_dgu_orgs.add(org.name)

    stats = Stats()
    stats_incentive = Stats()
    results = []

    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__

    # Iterate over organizations
    if options.dataset:
        dataset = toolkit.get_action('package_show')(
            data_dict={'id': options.dataset})
        org_names = [dataset['organization']['name']]
    elif options.organization:
        org_names = [options.organization]
    elif options.incentive_only:
        org_names = sorted(accepted_submission_dgu_orgs)
    else:
        org_names = dgu_helpers.all_la_org_names()
    #print '%s organizations' % len(org_names)
    for org_name in org_names:
        org_title = model.Group.by_name(org_name).title
        lga_org = lga_orgs_by_dgu_org_name.get(org_name)

        # Iterate over the schemas
        if options.schema:
            schema = all_schemas_by_dgu_name[options.schema]
            if options.incentive_only and not schema.lga_name:
                # not an incentive schema, so no results
                schemas = []
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org and
                           submission.theme == schema.lga_name and
                           submission.acceptancestatus == 'Accepted']
            else:
                schemas = [all_schemas_by_lga_name.get(options.schema,
                                                       schema)]
        elif options.incentive_only:
            schemas = [all_schemas_by_lga_name[submission.theme]
                       for submission in submissions
                       if submission.laname == lga_org and
                       submission.acceptancestatus == 'Accepted']
        else:
            schemas = all_schemas
        #print '%s schemas' % len(schemas)
        for schema in schemas:

            # Find the relevant incentive submission
            if lga_org:
                for submission in submissions:
                    if submission.laname == lga_org and \
                            submission.theme == schema.lga_name:
                        break
                else:
                    submission = None
            else:
                submission = None

            result = dict(
                org_name=org_name,
                org_title=org_title,
                org_name_lga=submission.laname if submission else '',
                schema_dgu_title=schema.dgu_schema_name,
                schema_lga=schema.lga_name,
                lga_application_number=(submission.applicationnumber
                                        if submission else ''),
                lga_application_acceptance_status=(
                    submission.acceptancestatus if submission else ''),
                dataset_names=[],
                dataset_titles=[],
                dataset_schema_applied=[],
                )
            stat_id = '%s %s' % (org_name, schema.lga_name)
            if submission:
                stat_id += ' %s' % submission.applicationnumber

            def add_datasets_to_results(datasets, result):
                for dataset in datasets:
                    if dataset['name'] not in result['dataset_names']:
                        result['dataset_names'].append(dataset['name'])
                        result['dataset_titles'].append(dataset['title'])
                        schema_applied = True if schema.dgu_schema_name in \
                            [s['title'] for s in dataset.get('schema', [])] \
                            else False
                        result['dataset_schema_applied'].append(schema_applied)
                        if not schema_applied and options.write:
                            pkg = model.Package.get(dataset['name'])
                            schema_obj = Schema.by_title(schema.dgu_schema_name)
                            assert schema_obj, schema.dgu_schema_name
                            try:
                                schema_ids = json.loads(
                                    pkg.extras.get('schema') or '[]')
                            except ValueError:
                                log.error('Not valid JSON in schema field: %s %r',
                                          dataset['name'],
                                          pkg.extras.get('schema'))
                                schema_ids = []
                            schema_ids.append(schema_obj.id)
                            pkg.extras['schema'] = json.dumps(schema_ids)

            # Already a schema?
            data_dict = {'fq': 'publisher:%s ' % org_name +
                         'schema_multi:"%s"' % schema.dgu_schema_name}
            datasets = toolkit.get_action('package_search')(data_dict=data_dict)
            if datasets['count'] > 0:
                add_datasets_to_results(datasets['results'], result)
                stats.add('OK - Dataset with schema',
                          stat_id + ' %s' % ';'.join(result['dataset_names']))
                found_schema = True
            else:
                found_schema = False

            # Submission specifies DGU dataset
            if submission and submission.dguurl:
                match = re.match('http://data.gov.uk/dataset/(.*)',
                                 submission.dguurl)
                if match:
                    dataset_name = dataset_name_original = match.groups()[0]
                    # some have trailing /
                    dataset_name = dataset_name.strip('/')
                    # hampshire have a hash appended
                    if '#' in dataset_name:
                        dataset_name = dataset_name.split('#')[0]
                    # poole have a resource name appended
                    if '/resource' in dataset_name:
                        dataset_name = dataset_name.split('/resource')[0]
                    # manual corrections
                    if dataset_name in dataset_name_corrections:
                        dataset_name = dataset_name_corrections[dataset_name]
                    dataset = model.Package.by_name(dataset_name)
                    # salford ones added a '1'
                    if not dataset:
                        dataset = model.Package.by_name(dataset_name + '1')
                        if dataset:
                            dataset_name += '1'
                    if dataset and dataset.state == 'active':
                        dataset_dict = toolkit.get_action('package_show')(
                            data_dict={'id': dataset.id})
                        add_datasets_to_results([dataset_dict], result)
                        if dataset_name != dataset_name_original:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and with corrections it checks out',
                                stat_id + ' %s' % dataset_name)
                        else:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and it checks out',
                                stat_id + ' %s' % dataset_name)
                    elif dataset:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is deleted!',
                            '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is not found',
                            '%s %s' % (stat_id, submission.dguurl))
                else:
                    stats_incentive.add(
                        'ERROR - DGU Dataset listed BUT the URL is not the correct format',
                        '%s %s' % (stat_id, submission.dguurl))

            # Submission mentions dataset on LA site - maybe it is in DGU already?
            elif submission and submission.dataseturl:
                datasets = model.Session.query(model.Package) \
                    .join(model.ResourceGroup) \
                    .join(model.Resource) \
                    .filter(model.Resource.url == submission.dataseturl) \
                    .filter(model.Package.state == 'active') \
                    .filter(model.Resource.state == 'active') \
                    .all()
                dataset_dicts = [
                    toolkit.get_action('package_show')(
                        data_dict={'id': dataset.id})
                    for dataset in datasets]
                add_datasets_to_results(dataset_dicts, result)
                if len(datasets) > 1:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                        '%s %s' % (stat_id, datasets[0].name))
                elif len(datasets) == 0:
                    stats_incentive.add(
                        'No DGU Dataset and Dataset URL not found on DGU',
                        stat_id)
                else:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches DGU dataset',
                        '%s %s' % (stat_id, datasets[0].name))

            # Search for datasets in the catalogue
            datasets = cls.find_dataset_for_schema(schema=schema,
                                                   org_name=org_name)
            if datasets is None:
                if not found_schema:
                    stats.add('Search revealed none', stat_id)
            elif len(datasets) > 1:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found datasets (multiple) in search',
                              '%s %r' % (stat_id,
                                         [d['name'] for d in datasets]))
            elif datasets:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found dataset in search',
                              '%s %s' % (stat_id, datasets[0]['name']))
            else:
                if not found_schema:
                    stats.add('No dataset for submission', stat_id)

            results.append(result)

    rows_with_datasets_count = \
        len([result for result in results
             if any(result['dataset_schema_applied'])])
    rows_with_datasets_or_candidate_datasets_count = \
        len([result for result in results
             if result['dataset_schema_applied']])

    if options.print_:
        print '\n Incentive stats\n' + stats_incentive.report()
        print '\n Overall stats\n' + stats.report()

    if options.write:
        print 'Writing'
        model.Session.commit()

    return {'table': results,
            'rows_with_datasets_count': rows_with_datasets_count,
            'rows_with_datasets_or_candidate_datasets_count':
                rows_with_datasets_or_candidate_datasets_count}

log.info('----------------------------')


TASKS_TO_RUN = ['analytics', 'openspending', 'dump', 'dump_analysis',
                'backup']

if __name__ == '__main__':
    USAGE = '''Daily script for government
    Usage: python %s [config.ini]

    You may provide an optional argument at the end which is the tasks
    to run, and you can choose from %s or run multiple by separating by
    a comma.
    ''' % (sys.argv[0], ','.join(TASKS_TO_RUN))

    if len(sys.argv) < 2 or sys.argv[1] in ('--help', '-h'):
        err = 'Error: Please specify config file.'
        print USAGE, err
        logging.error('%s\n%s' % (USAGE, err))
        sys.exit(1)
    config_file = sys.argv[1]
    config_ini_filepath = os.path.abspath(config_file)

    if len(sys.argv) == 3:
        TASKS_TO_RUN = sys.argv[2].split(',')

    load_config(config_ini_filepath)
    register_translator()
    logging.config.fileConfig(config_ini_filepath)
    command(config_file)

if __name__ == '__main__': usage = """Tool to migrate QA data from TaskStatus to QA table usage: %prog [options] <ckan.ini> """ parser = OptionParser(usage=usage) parser.add_option("-w", "--write", action="store_true", dest="write", help="write the changes") parser.add_option('-p', '--publisher', dest='publisher') parser.add_option('-d', '--dataset', dest='dataset') parser.add_option('-r', '--resource', dest='resource') (options, args) = parser.parse_args() if len(args) != 1: parser.error('Wrong number of arguments (%i)' % len(args)) config_ini = args[0] print 'Loading CKAN config...' common.load_config(config_ini) common.register_translator() print 'Done' # Setup logging to print debug out for local only rootLogger = logging.getLogger() rootLogger.setLevel(logging.WARNING) localLogger = logging.getLogger(__name__) localLogger.setLevel(logging.DEBUG) handler = logging.StreamHandler() handler.setFormatter(logging.Formatter('%(message)s')) localLogger.addHandler(handler) migrate(options)
def command(cls, config_ini, options):
    common.load_config(config_ini)
    common.register_translator()

    from ckan import model
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES)

    rev = model.repo.new_revision()
    rev.author = 'script-fix_themes.py'

    datasets = common.get_datasets(state='active',
                                   dataset_name=options.dataset,
                                   organization_ref=options.organization)

    def fix_theme(theme_str):
        '''Returns (fixed_theme_str, outcome)'''
        if not theme_str:
            return '', 'Blank'
        elif theme_str == 'null':
            return '', '"null"->""'
        elif theme_str in THEMES:
            return theme_str, 'Ok'
        else:
            fixed_theme = THEME_MAP.get(theme_str)
            if fixed_theme is None:
                return theme_str, 'Unknown theme %s - recategorizing' % theme_str
            else:
                assert fixed_theme != theme_str
                return fixed_theme, 'Changed to long form'

    def recategorize(pkg):
        themes = categorize_package(pkg, stats_recategorize)
        print 'Recategorize: %s' % themes
        if themes:
            pkg.extras[PRIMARY_THEME] = themes[0]
        elif PRIMARY_THEME in pkg.extras:
            pkg.extras[PRIMARY_THEME] = ''
        if len(themes) > 1:
            pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1]
        elif SECONDARY_THEMES in pkg.extras:
            pkg.extras[SECONDARY_THEMES] = '[]'

    for package in datasets:
        if PRIMARY_THEME in package.extras:
            primary = package.extras.get(PRIMARY_THEME)
            new_primary, outcome = fix_theme(primary)
            if new_primary != primary:
                package.extras[PRIMARY_THEME] = new_primary
            output = stats_primary.add(outcome, package.name)
            if outcome != 'Ok':
                print output
            if outcome.startswith('Unknown theme'):
                recategorize(package)
                continue
        else:
            stats_primary.add('No theme', package.name)

        if SECONDARY_THEMES in package.extras:
            secondary = package.extras.get(SECONDARY_THEMES)
            try:
                secondary = json.loads(secondary)
            except ValueError:
                if secondary.startswith('{') and secondary.endswith('}'):
                    # '{Crime}' -> 'Crime'
                    secondary = secondary[1:-1].strip('\"')
                    print stats_secondary.add('Tidied {}', package.name)
                else:
                    print stats_secondary.add('Error decoding JSON',
                                              package.name)

            if secondary == {}:
                secondary = []

            new_secondary = []
            do_recategorize = False

            if not isinstance(secondary, list):
                secondary = [secondary]
            for theme_str in secondary:
                if not isinstance(theme_str, basestring):
                    print stats_secondary.add(
                        'Not a list of strings %s' % type(theme_str),
                        package.name)
                    continue
                new_theme, outcome = fix_theme(theme_str)
                if new_theme:
                    new_secondary.append(new_theme)
                if outcome != 'Ok':
                    print stats_secondary.add(outcome, package.name)
                if outcome.startswith('Unknown theme'):
                    do_recategorize = True
            if do_recategorize:
                recategorize(package)
                continue

            if json.dumps(new_secondary) != package.extras.get(SECONDARY_THEMES):
                stats_secondary.add('Fixed', package.name)
                package.extras[SECONDARY_THEMES] = json.dumps(new_secondary)
            else:
                stats_secondary.add('Ok', package.name)
        else:
            stats_secondary.add('No theme', package.name)

        if 'themes-secondary' in package.extras:
            print stats_secondary.add(
                'Old key removed: themes-secondary',
                '%s %s' % (package.name,
                           package.extras['themes-secondary']))
            del package.extras['themes-secondary']

    print "\nPrimary theme:"
    print stats_primary.report()
    print "\nSecondary theme:"
    print stats_secondary.report()
    print "\nRecategorizations:"
    print stats_recategorize.report()

    if options.write:
        print 'Writing'
        model.Session.commit()

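# fix_theme() outcomes at a glance, assuming module-level THEMES (the
# canonical theme names) and THEME_MAP (alternate form -> canonical form),
# both defined elsewhere in this script; the 'health' entry below is a
# hypothetical mapping for illustration:
#   fix_theme('')        -> ('', 'Blank')
#   fix_theme('null')    -> ('', '"null"->""')
#   fix_theme('Health')  -> ('Health', 'Ok')            # already in THEMES
#   fix_theme('health')  -> ('Health', 'Changed to long form')
#   fix_theme('Wibble')  -> ('Wibble', 'Unknown theme Wibble - recategorizing')
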
def command(cls, config_ini, dataset_names, options):
    common.load_config(config_ini)
    common.register_translator()
    from pylons import config

    apikey = config['dgu.merge_datasets.apikey']
    ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
    #ckan = ckanapi.LocalCKAN()

    if options.publisher:
        org_name = common.name_stripped_of_url(options.publisher)
        if options.search:
            results = ckan.action.package_search(
                q=options.search, fq='publisher:%s' % org_name, rows=100)
            dataset_names.extend(
                [dataset['name'] for dataset in results['results']])
        else:
            org = ckan.action.organization_show(id=org_name,
                                                include_datasets=True)
            dataset_names.extend([d['name'] for d in org['packages']])

    datasets = []
    datasets_by_name = {}

    def get_extra(dataset, key):
        for extra in dataset['extras']:
            if extra['key'] == key:
                return extra['value']

    for dataset_name in dataset_names:
        print 'Dataset: %s' % dataset_name
    for dataset_name in dataset_names:
        # strip off the url part of the dataset name, if there is one
        dataset_name = common.name_stripped_of_url(dataset_name)
        dataset = ckan.action.package_show(id=dataset_name)
        harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
        if harvest_source_ref:
            print '** Discarding dataset %s due to harvest source: %s **' \
                % (dataset_name, harvest_source_ref)
            continue
        datasets.append(dataset)
        datasets_by_name[dataset['name']] = dataset
    datasets.sort(key=lambda x: x['metadata_modified'])

    # aggregate resources
    def resource_identity(res_dict, dataset_name):
        return (res_dict.get('date'), res_dict['url'],
                res_dict.get('title') or res_dict['description'],
                res_dict.get('format'),
                dataset_name)

    combined_resources = {}  # identity
    res_stats = Stats()
    for dataset in datasets:
        for resource in dataset['resources']:
            identity = resource_identity(resource, dataset['name'])
            resource['dataset_name'] = dataset['name']
            if identity in combined_resources:
                print res_stats.add(
                    'Discarding duplicate',
                    '\n%s duplicate of \n%s'
                    % (resource, combined_resources[identity]))
            else:
                combined_resources[identity] = resource
    resources = combined_resources.values()

    # find dates for resources
    # NB This has been pulled out into timeseries_convert.py -
    # TODO call that instead of having the code here too.
    if options.frequency:
        url_munge_re = re.compile('(%20|-|_|\.)')

        def fields_to_hunt_for_date(res):
            date = res.get('date')
            if date:
                yield 'date', date
            title = res.get('title')
            if title:
                yield 'title', title
            yield 'description', res['description']
            yield 'url', url_munge_re.sub(' ', res['url'])
            if not options.update:
                dataset = datasets_by_name[res['dataset_name']]
                yield 'dataset-title', dataset['title']
                yield 'dataset-notes', dataset['notes']

        ensure_regexes_are_initialized()
        global regexes
        for resource in resources:
            for field_name, field_value in fields_to_hunt_for_date(resource):
                if options.frequency in ('monthly', 'quarterly',
                                         'twice annually'):
                    month, year = hunt_for_month_and_year(field_value)
                    if year and month:
                        resource['date'] = '%02d/%s' % (month, year)
                        res_stats.add('Found date in %s' % field_name,
                                      '%s %r' % (resource['date'], resource))
                        if resource.get('resource_type') == 'documentation':
                            resource['resource_type'] = 'file'
                            res_stats.add('Converted additional resource',
                                          resource)
                        break
                elif options.frequency == 'annually':
                    year = regexes['year'].search(field_value)
                    if year:
                        resource['date'] = year.groups()[0]
                        res_stats.add('Found date in %s' % field_name,
                                      '%s %r' % (resource['date'], resource))
                        if resource.get('resource_type') == 'documentation':
                            resource['resource_type'] = 'file'
                            res_stats.add('Converted additional resource',
                                          resource)
                        break
            else:
                if resource.get('resource_type') == 'documentation':
                    print res_stats.add(
                        'Could not find date but it\'s Additional Resource',
                        resource)
                    continue
                print res_stats.add('Could not find date', resource)
                continue

        print 'Resources: \n', res_stats

        resources_without_date = [
            res for res in resources
            if not res.get('date') and
            res.get('resource_type') != 'documentation']
        for i, res in enumerate(resources_without_date):
            print 'Resources without dates %s/%s' % (
                i + 1, len(resources_without_date))
            for field_name, field_value in fields_to_hunt_for_date(res):
                print '  %s: %s' % (
                    field_name, field_value.encode('latin-1', 'ignore'))
            print 'https://data.gov.uk/dataset/%s/resource/%s' % (
                res['dataset_name'], res['id'])
            date_format = {'annually': 'YYYY',
                           'monthly': 'MM/YYYY',
                           'twice annually': 'MM/YYYY',
                           'quarterly': 'MM/YYYY'}
            input_ = raw_input(
                'Date (%s) or DOCS to make it an Additional Resource: '
                % date_format[options.frequency])
            if input_.strip().lower() == 'docs':
                res['date'] = ''
                res['resource_type'] = 'documentation'
            else:
                res['date'] = input_

        resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

    # Ensure there is not a mixture of resources with and without a date
    have_dates = None
    for res in resources:
        if res.get('resource_type') == 'documentation':
            continue
        if have_dates is None:
            have_dates = bool(res.get('date'))
        else:
            has_date = bool(res.get('date'))
            if has_date != have_dates:
                print [res.get('date') for res in resources]
                print 'Cannot mix resources with dates and others without!'
                import pdb
                pdb.set_trace()

    # Remove 'dataset_name' and others fields from resources
    ignore_res_fields = set(('dataset_name', 'created', 'position',
                             'revision_id', 'id', 'tracking_summary',
                             'qa', 'archiver'))
    for res in resources:
        for field in ignore_res_fields & set(res.keys()):
            del res[field]

    # Merge dataset fields
    def get_all_fields_and_values(datasets):
        ignore_fields = set((
            'id', 'resources', 'last_major_modification', 'data_dict',
            'revision_timestamp', 'num_tags', 'metadata_created',
            'metadata_modified', 'odi_certificate',
            'extras',  # they are at top level already
            'timeseries_resources', 'individual_resources',
            'additional_resources',
            'revision_id', 'organization', 'tracking_summary',
            'num_resources', 'license_title',
            'author', 'author_email',
            'maintainer', 'maintainer_email',
            'temporal_granularity', 'geographic_granularity',
            'state', 'isopen', 'url', 'date_update_future', 'date_updated',
            'date_released', 'precision', 'taxonomy_url',
            'temporal_coverage-from', 'temporal_coverage-to',
            'published_via', 'creator_user_id', 'qa', 'archiver',
            ))
        first_fields = ['title', 'name', 'notes', 'theme-primary',
                        'theme-secondary']
        all_field_values = defaultdict(list)
        for dataset in datasets:
            for field in dataset:
                if field not in ignore_fields and dataset[field]:
                    all_field_values[field].append(dataset[field])
        for field in first_fields:
            yield field, all_field_values.get(field, [])
        for field in all_field_values:
            if field not in first_fields:
                yield field, all_field_values[field]

    spend_data_defaults = {
        'geographic_coverage': None,
        'theme-primary': 'Government Spending',
        'theme-secondary': None,
        'update_frequency': 'monthly',
        }
    combined_dataset = {'resources': resources}
    all_fields_and_values = get_all_fields_and_values(datasets)
    for field, values in all_fields_and_values:
        if field == 'notes':
            values = [value.strip() for value in values]
        if field == 'tags':
            # just merge them up-front and
            # dont offer user any choice
            tags_by_name = {}
            for dataset_tags in values:
                for tag in dataset_tags:
                    if tag['name'] not in tags_by_name:
                        tags_by_name[tag['name']] = tag
            values = [tags_by_name.values()]
        if field in ('codelist', 'schema'):
            # just merge them up-front
            # And convert the dict into just an id string
            ids = set()
            for dataset_values in values:
                for value_dict in dataset_values:
                    ids.add(value_dict['id'])
            values = [list(ids)]
        print '\n%s:' % field
        pprint(list(enumerate(values)))
        if options.spend and field in spend_data_defaults:
            value = spend_data_defaults[field]
            print 'Spend data defaults to: %s' % value
            values = [value] if value is not None else None
        # dont be case-sensitive for boolean fields
        if field == 'core-dataset':
            values = [v.lower() for v in values]
        try:
            values_identicle = len(set(values)) == 1
        except TypeError:
            if values and len(values):
                val1 = values[0]
                for val in values[1:]:
                    if val != val1:
                        values_identicle = False
                        break
                else:
                    values_identicle = True
        if (not values) or (not len(values)):
            pass
        elif values_identicle:
            value = values[0]
        elif field == 'name':
            while True:
                from ckan.lib.munge import munge_title_to_name
                munged_title = munge_title_to_name(combined_dataset['title'])
                print munge_title_to_name(
                    datasets[0]['organization']['title'])
                value = raw_input('Type new value (%s): ' % munged_title)
                if not value:
                    value = munged_title
                if len(value) < 3:
                    print 'Too short'
                    continue
                if value in values:
                    print 'That name is taken'
                    continue
                existing = ckan.action.package_autocomplete(q=value)
                if value in existing:
                    print 'That name is taken on CKAN'
                    continue
                break
        else:
            while True:
                response = raw_input('%s: value (number) or type new one: '
                                     % field)
                try:
                    value_index = int(response)
                    value = values[value_index]
                    print value
                except ValueError:
                    # fix pound signs if the user pasted from the repr'd version
                    response = re.sub(r'\\xa3', u'\xa3', response)
                    value = response
                if not value and field in ('title', 'owner_org', 'notes',
                                           'license_id'):
                    print 'You must have a value for this field!'
                    continue
                break
        if value:
            combined_dataset[field] = value

    # Store
    print '\nMerged dataset:\n'
    pprint(combined_dataset)

    response = raw_input('Press enter to write or pdb to edit in pdb first: ')
    if response == 'pdb':
        import pdb
        pdb.set_trace()
    try:
        if options.update:
            ckan.action.dataset_update(**combined_dataset)
        else:
            ckan.action.dataset_create(**combined_dataset)
    except Exception, e:
        print e
        import pdb
        pdb.set_trace()

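# For reference, the resource sort used above relies on dates formatted
# as 'MM/YYYY' (or 'YYYY'): reversing the split puts the year before the
# month, so chronological order falls out of a plain string sort:
dates = ['03/2015', '11/2014', '01/2015']
assert sorted(dates, key=lambda d: d.split('/')[::-1]) == \
    ['11/2014', '01/2015', '03/2015']
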
if __name__ == '__main__':
    USAGE = '''Daily script for government
    Usage: python %s <config.ini> [task]
    Where:
      [task] - task to run (optional), picked from: %s
               or run multiple by separating by a comma.
    ''' % (sys.argv[0], ','.join(TASKS_TO_RUN))

    if set(sys.argv) & set(('--help', '-h')):
        print USAGE
        sys.exit(1)
    if len(sys.argv) < 2:
        err = 'Error: Please specify config file.'
        print USAGE, err
        logging.error('%s' % err)
        sys.exit(1)
    config_file = sys.argv[1]
    config_ini_filepath = os.path.abspath(config_file)

    if len(sys.argv) == 3:
        TASKS_TO_RUN = sys.argv[2].split(',')

    load_config(config_ini_filepath)
    register_translator()
    logging.config.fileConfig(config_ini_filepath)
    command(config_file)

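# Example invocations (script filename assumed for illustration; tasks
# come from TASKS_TO_RUN above):
#   python daily_script.py production.ini
#   python daily_script.py production.ini analytics,backup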