def command(cls, config_ini, org_names):
    common.load_config(config_ini)
    common.register_translator()

    from ckan.plugins import toolkit
    from ckan import model

    orgs = [toolkit.get_action('organization_show')(
                data_dict={'id': org_name})
            for org_name in org_names]
    source_org, dest_org = orgs
    assert source_org
    assert dest_org

    search_results = toolkit.get_action('package_search')(
        data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
    print 'Datasets: %s' % search_results['count']
    stats = Stats()
    if len(search_results['results']) != search_results['count']:
        assert 0, 'need to implement paging'

    #context = {
    #    'user': get_script_user(__name__)['name'],
    #    'ignore_auth': True,
    #    'model': model}
    rev = model.repo.new_revision()
    rev.author = 'script-%s.py' % __file__

    for dataset in search_results['results']:
        model.Package.get(dataset['id']).owner_org = dest_org['id']
        #dataset_ = toolkit.get_action('package_patch')(
        #    context=context,
        #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
        print stats.add('Changed owner_org', dataset['name'])
    print stats.report()

    print 'Writing'
    model.Session.commit()
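# --- Hedged sketch, not part of the original script ---
# The assertion above gives up beyond 1000 results because paging is not
# implemented. package_search takes standard `start`/`rows` parameters, so
# the search could be paged roughly as below; the helper name is invented
# for illustration and `toolkit` is passed in rather than imported.
def _search_all_datasets(toolkit, fq, page_size=1000):
    results = []
    while True:
        page = toolkit.get_action('package_search')(
            data_dict=dict(fq=fq, start=len(results), rows=page_size))
        results.extend(page['results'])
        # stop once everything is fetched, or the API returns no more rows
        if len(results) >= page['count'] or not page['results']:
            return results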
            if options.write:
                try:
                    self.ckan.action.package_update(**pkg)
                    ds_stats.add('Dataset updated', pkg['name'])
                except ValidationError as ve:
                    print ds_stats.add('Validation error on update',
                                       pkg['name'])
                    print ve
                except IntegrityError:
                    print ds_stats.add('Integrity error on update',
                                       pkg['name'])
            else:
                ds_stats.add('Dataset would be updated', pkg['name'])
        else:
            ds_stats.add('No change', pkg['name'])

        print '\nResources:\n', res_stats.report(show_time_taken=True)
        print '\nDatasets:\n', ds_stats.report(show_time_taken=True)


usage = __doc__ + '''
Usage:
    python set_missing_resource_formats.py <CKAN config.ini or URL> [-d DATASET_NAME] [-o ORGANISATION_NAME] -w'''

if __name__ == '__main__':
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-o', '--organization', dest='organization')
    parser.add_option("-w", "--write",
                      action="store_true", dest="write", default=False,
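# --- Hedged note, not part of the original script ---
# `self.ckan` above looks like a ckanapi client (the .action.package_update
# call and ValidationError match ckanapi's interface). For reference, such a
# client can be built from a remote URL or, server-side, inside the CKAN
# process; the apikey variable is invented for illustration:
#
#     import ckanapi
#     ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
#     # or, when running inside the CKAN process:
#     ckan = ckanapi.LocalCKAN()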
            if options.write:
                try:
                    self.ckan.action.package_update(**dataset)
                    print stats.add('Dataset updated', dataset['name'])
                except ValidationError as ve:
                    print stats.add('Validation error on update',
                                    dataset['name'])
                    print ve
                except IntegrityError:
                    print stats.add('Integrity error on update',
                                    dataset['name'])
            else:
                stats.add('Dataset would be updated', dataset['name'])
        else:
            stats.add('No change', dataset['name'])

        print '\nDatasets:\n', stats.report(show_time_taken=True)


def get_tidied_dataset(self, dataset):
    is_dataset_updated = False
    license_id = dataset['license_id'] or ''
    extras = dict((extra['key'], extra['value'])
                  for extra in dataset['extras'])
    licence = extras.get('licence') or ''
    if licence:
        # INSPIRE datasets are a python list repr'd
        # ast.literal_eval() is safer than eval()
        try:
            licence_bits = ast.literal_eval(licence) or []
            is_dataset_updated = True
        except (ValueError, SyntaxError):
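# --- Hedged example, not part of the original script ---
# The 'licence' extra on INSPIRE datasets holds the repr of a Python list,
# i.e. the *string* "['Open Government Licence']" rather than a list (the
# value here is invented for illustration):
#
#     >>> import ast
#     >>> ast.literal_eval("['Open Government Licence']")
#     ['Open Government Licence']
#     >>> ast.literal_eval("ogl")          # not a literal: raises ValueError
#     >>> ast.literal_eval("['unclosed")   # bad syntax: raises SyntaxError
#
# which is why the except clause above catches (ValueError, SyntaxError).
# Unlike eval(), literal_eval() never executes arbitrary code.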
def command(cls, config_ini, options, submissions_csv_filepath):
    # Incentive CSV. Columns:
    # applicationnumber, applicationdate, jobrole, laname, officerauthorised,
    # theme, responsedate, acceptancestatus, odicertificateurl, dguurl,
    # inventoryurl, localcodes, dataseturl, schemaurl, guidanceurl,
    # frequencyofpublishing, foinumberest, submissioncomplete, lastlaupdate,
    # techreviewstatus, lasttechupdate, adminreviewstatus, paymentamount,
    # closed, lastadminupdate, applicantnotes, administrationnotes,
    # technicalnotes, lastupdated
    with open(submissions_csv_filepath, 'rb') as f:
        csv = UnicodeCsvReader(f, encoding='iso-8859-1')
        header = csv.next()
        header = [col_name.strip().lower().replace(' ', '_')
                  for col_name in header]
        Submission = namedtuple('Submission', header)
        submissions = [Submission(*row) for row in csv]

    if config_ini:
        # this is only for when running from the command-line
        #print 'Loading CKAN config...'
        common.load_config(config_ini)
        common.register_translator()
        #print '...done'

    from ckan import model
    from ckan.plugins import toolkit
    from ckanext.dgu.lib import helpers as dgu_helpers
    from ckanext.dgu.model.schema_codelist import Schema

    log = __import__('logging').getLogger(__name__)

    # Match the organizations in the submissions
    lga_orgs_by_dgu_org_name = {}
    accepted_submission_dgu_orgs = set()
    for submission in submissions:
        la_title = la_map.get(submission.laname, submission.laname)
        org = model.Session.query(model.Group) \
                   .filter_by(title=la_title) \
                   .first()
        assert org, 'Submission org title not found: %r' % la_title
        lga_orgs_by_dgu_org_name[org.name] = submission.laname
        if submission.acceptancestatus == 'Accepted':
            accepted_submission_dgu_orgs.add(org.name)

    stats = Stats()
    stats_incentive = Stats()
    results = []

    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__

    # Iterate over organizations
    if options.dataset:
        dataset = toolkit.get_action('package_show')(
            data_dict={'id': options.dataset})
        org_names = [dataset['organization']['name']]
    elif options.organization:
        org_names = [options.organization]
    elif options.incentive_only:
        org_names = sorted(accepted_submission_dgu_orgs)
    else:
        org_names = dgu_helpers.all_la_org_names()
    #print '%s organizations' % len(org_names)
    for org_name in org_names:
        org_title = model.Group.by_name(org_name).title
        lga_org = lga_orgs_by_dgu_org_name.get(org_name)

        # Iterate over the schemas
        if options.schema:
            schema = all_schemas_by_dgu_name[options.schema]
            if options.incentive_only and not schema.lga_name:
                # not an incentive schema, so no results
                schemas = []
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org
                           and submission.theme == schema.lga_name
                           and submission.acceptancestatus == 'Accepted']
            else:
                schemas = [all_schemas_by_lga_name.get(options.schema,
                                                       schema)]
        elif options.incentive_only:
            schemas = [all_schemas_by_lga_name[submission.theme]
                       for submission in submissions
                       if submission.laname == lga_org
                       and submission.acceptancestatus == 'Accepted']
        else:
            schemas = all_schemas
        #print '%s schemas' % len(schemas)

        for schema in schemas:
            # Find the relevant incentive submission (for-else: submission
            # ends up None when no match is found)
            if lga_org:
                for submission in submissions:
                    if submission.laname == lga_org and \
                            submission.theme == schema.lga_name:
                        break
                else:
                    submission = None
            else:
                submission = None

            result = dict(
                org_name=org_name,
                org_title=org_title,
                org_name_lga=submission.laname if submission else '',
                schema_dgu_title=schema.dgu_schema_name,
                schema_lga=schema.lga_name,
                lga_application_number=submission.applicationnumber if submission else '',
                lga_application_acceptance_status=submission.acceptancestatus if submission else '',
                dataset_names=[],
                dataset_titles=[],
                dataset_schema_applied=[],
                )
            stat_id = '%s %s' % (org_name, schema.lga_name)
            if submission:
                stat_id += ' %s' % submission.applicationnumber

            def add_datasets_to_results(datasets, result):
                for dataset in datasets:
                    if dataset['name'] not in result['dataset_names']:
                        result['dataset_names'].append(dataset['name'])
                        result['dataset_titles'].append(dataset['title'])
                        schema_applied = schema.dgu_schema_name in \
                            [s['title'] for s in dataset.get('schema', [])]
                        result['dataset_schema_applied'].append(schema_applied)
                        if not schema_applied and options.write:
                            pkg = model.Package.get(dataset['name'])
                            schema_obj = Schema.by_title(schema.dgu_schema_name)
                            assert schema_obj, schema.dgu_schema_name
                            # the 'schema' extra holds a JSON list of schema
                            # ids - see the worked example after this command
                            try:
                                schema_ids = json.loads(
                                    pkg.extras.get('schema') or '[]')
                            except ValueError:
                                log.error('Not valid JSON in schema field: %s %r',
                                          dataset['name'],
                                          pkg.extras.get('schema'))
                                schema_ids = []
                            schema_ids.append(schema_obj.id)
                            pkg.extras['schema'] = json.dumps(schema_ids)

            # Already a schema?
            data_dict = {'fq': 'publisher:%s ' % org_name +
                               'schema_multi:"%s"' % schema.dgu_schema_name}
            datasets = toolkit.get_action('package_search')(
                data_dict=data_dict)
            if datasets['count'] > 0:
                add_datasets_to_results(datasets['results'], result)
                stats.add('OK - Dataset with schema',
                          stat_id + ' %s' % ';'.join(result['dataset_names']))
                found_schema = True
            else:
                found_schema = False

            # Submission specifies DGU dataset
            if submission and submission.dguurl:
                match = re.match('http://data.gov.uk/dataset/(.*)',
                                 submission.dguurl)
                if match:
                    dataset_name = dataset_name_original = match.groups()[0]
                    # some have trailing /
                    dataset_name = dataset_name.strip('/')
                    # hampshire have a hash appended
                    if '#' in dataset_name:
                        dataset_name = dataset_name.split('#')[0]
                    # poole have a resource name appended
                    if '/resource' in dataset_name:
                        dataset_name = dataset_name.split('/resource')[0]
                    # manual corrections
                    if dataset_name in dataset_name_corrections:
                        dataset_name = dataset_name_corrections[dataset_name]
                    dataset = model.Package.by_name(dataset_name)
                    # salford ones added a '1'
                    if not dataset:
                        dataset = model.Package.by_name(dataset_name + '1')
                        if dataset:
                            dataset_name += '1'
                    if dataset and dataset.state == 'active':
                        dataset_dict = toolkit.get_action('package_show')(
                            data_dict={'id': dataset.id})
                        add_datasets_to_results([dataset_dict], result)
                        if dataset_name != dataset_name_original:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and with corrections it checks out',
                                stat_id + ' %s' % dataset_name)
                        else:
                            stats_incentive.add(
                                'OK - DGU Dataset listed and it checks out',
                                stat_id + ' %s' % dataset_name)
                    elif dataset:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is deleted!',
                            '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT it is not found',
                            '%s %s' % (stat_id, submission.dguurl))
                else:
                    stats_incentive.add(
                        'ERROR - DGU Dataset listed BUT the URL is not the correct format',
                        '%s %s' % (stat_id, submission.dguurl))

            # Submission mentions dataset on LA site - maybe it is in DGU already?
            elif submission and submission.dataseturl:
                datasets = model.Session.query(model.Package) \
                    .join(model.ResourceGroup) \
                    .join(model.Resource) \
                    .filter(model.Resource.url == submission.dataseturl) \
                    .filter(model.Package.state == 'active') \
                    .filter(model.Resource.state == 'active') \
                    .all()
                dataset_dicts = [
                    toolkit.get_action('package_show')(
                        data_dict={'id': dataset.id})
                    for dataset in datasets]
                add_datasets_to_results(dataset_dicts, result)
                if len(datasets) > 1:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                        '%s %s' % (stat_id, datasets[0].name))
                elif len(datasets) == 0:
                    stats_incentive.add(
                        'No DGU Dataset and Dataset URL not found on DGU',
                        stat_id)
                else:
                    stats_incentive.add(
                        'No DGU Dataset, but Dataset URL matches DGU dataset',
                        '%s %s' % (stat_id, datasets[0].name))

            # Search for datasets in the catalogue
            datasets = cls.find_dataset_for_schema(schema=schema,
                                                   org_name=org_name)
            if datasets is None:
                if not found_schema:
                    stats.add('Search revealed none', stat_id)
            elif len(datasets) > 1:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found datasets (multiple) in search',
                              '%s %r' % (stat_id,
                                         [d['name'] for d in datasets]))
            elif datasets:
                add_datasets_to_results(datasets, result)
                if not found_schema:
                    stats.add('Found dataset in search',
                              '%s %s' % (stat_id, datasets[0]['name']))
            else:
                if not found_schema:
                    stats.add('No dataset for submission', stat_id)

            results.append(result)

    rows_with_datasets_count = \
        len([result for result in results
             if any(result['dataset_schema_applied'])])
    rows_with_datasets_or_candidate_datasets_count = \
        len([result for result in results
             if result['dataset_schema_applied']])

    if options.print_:
        print '\n Incentive stats\n' + stats_incentive.report()
        print '\n Overall stats\n' + stats.report()

    if options.write:
        print 'Writing'
        model.Session.commit()

    return {'table': results,
            'rows_with_datasets_count': rows_with_datasets_count,
            'rows_with_datasets_or_candidate_datasets_count':
                rows_with_datasets_or_candidate_datasets_count}
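# --- Hedged example, not part of the original script ---
# add_datasets_to_results() above keeps schema ids as a JSON-encoded list
# inside the package's 'schema' extra (a plain string field), so attaching
# a schema is a json round-trip; the ids are invented for illustration:
#
#     >>> import json
#     >>> extras = {}  # package with no 'schema' extra yet
#     >>> schema_ids = json.loads(extras.get('schema') or '[]')
#     >>> schema_ids.append('schema-1')
#     >>> json.dumps(schema_ids)
#     '["schema-1"]'
#
# The `or '[]'` guard makes a missing or empty extra decode as an empty list.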