def set_initial_value(self): log = global_log stats = StatsList() from ckan import model import ckan.plugins as p from ckan.logic import ActionError from ckanext.dgu.lib.helpers import upsert_extra site_user = p.toolkit.get_action('get_site_user')({ 'model': model, 'ignore_auth': True }, {}) c = {'model': model, 'user': site_user['name']} packages = p.toolkit.get_action('package_list')(c, data_dict={}) log.info('Processing %d packages', len(packages)) for pkg_name in packages: pkg = model.Package.by_name(pkg_name) last_mod = self.determine_last_major_modification(pkg).isoformat() log.info('%s: %s %s', pkg_name, pkg.extras.get('last_major_modification'), last_mod) if pkg.extras.get('last_major_modification') != last_mod: log.info(stats.add('Adding modification date', pkg.name)) model.repo.new_revision() pkg.extras['last_major_modification'] = last_mod model.repo.commit_and_remove() else: log.info(stats.add('No change needed', pkg.name)) print stats.report()
def set_initial_value(self): log = global_log stats = StatsList() from ckan import model import ckan.plugins as p from ckan.logic import ActionError from ckanext.dgu.lib.helpers import upsert_extra site_user = p.toolkit.get_action('get_site_user')({'model': model,'ignore_auth': True}, {}) c = {'model': model, 'user': site_user['name']} packages = p.toolkit.get_action('package_list')(c, data_dict={}) log.info('Processing %d packages', len(packages)) for pkg_name in packages: pkg = model.Package.by_name(pkg_name) last_mod = self.determine_last_major_modification(pkg).isoformat() log.info('%s: %s %s', pkg_name, pkg.extras.get('last_major_modification'), last_mod) if pkg.extras.get('last_major_modification') != last_mod: log.info(stats.add('Adding modification date', pkg.name)) model.repo.new_revision() pkg.extras['last_major_modification'] = last_mod model.repo.commit_and_remove() else: log.info(stats.add('No change needed', pkg.name)) print stats.report()
def command(dry_run=False): from ckan import model from ckanext.dgu.lib.resource_formats import match from running_stats import StatsList # Register a translator in this thread so that # the _() functions in logic layer can work from ckan.lib.cli import MockTranslator registry = Registry() registry.prepare() translator_obj = MockTranslator() registry.register(translator, translator_obj) if not dry_run: model.repo.new_revision() # Add canonised formats to map for format_ in res_type_map.keys(): res_type_map[canonise(format_)] = res_type_map[format_] log.info('Tidying resource types') stats = StatsList() res_query = model.Session.query(model.Resource) log.info('Tidying formats. Resources=%i Canonised formats=%i', res_query.count(), len(set(res_type_map.values()))) for res in res_query: canonised_fmt = canonise(res.format or '') if canonised_fmt in res_type_map: improved_fmt = res_type_map[canonised_fmt] else: improved_fmt = tidy(res.format) match_ = match(improved_fmt) if match_: improved_fmt = match_ if (improved_fmt or '') != (res.format or ''): if not dry_run: res.format = improved_fmt stats.add(improved_fmt, res.format) else: stats.add('No change', res.format) if not dry_run: model.repo.commit_and_remove() log.info('Stats report: %r', stats.report()) print stats.report() log.info('Warnings (%i): %r', len(warnings), warnings)
def command(dry_run=False): from ckan import model from ckanext.dgu.lib.resource_formats import match from running_stats import StatsList # Register a translator in this thread so that # the _() functions in logic layer can work from ckan.lib.cli import MockTranslator registry=Registry() registry.prepare() translator_obj=MockTranslator() registry.register(translator, translator_obj) if not dry_run: model.repo.new_revision() # Add canonised formats to map for format_ in res_type_map.keys(): res_type_map[canonise(format_)] = res_type_map[format_] log.info('Tidying resource types') stats = StatsList() res_query = model.Session.query(model.Resource) log.info('Tidying formats. Resources=%i Canonised formats=%i', res_query.count(), len(set(res_type_map.values()))) for res in res_query: canonised_fmt = canonise(res.format or '') if canonised_fmt in res_type_map: improved_fmt = res_type_map[canonised_fmt] else: improved_fmt = tidy(res.format) match_ = match(improved_fmt) if match_: improved_fmt = match_ if (improved_fmt or '') != (res.format or ''): if not dry_run: res.format = improved_fmt stats.add(improved_fmt, res.format) else: stats.add('No change', res.format) if not dry_run: model.repo.commit_and_remove() log.info('Stats report: %r', stats.report()) print stats.report() log.info('Warnings (%i): %r', len(warnings), warnings)
def add_missing_onshub_extra(self): '''Some ONSHUB datasets were edited manually and due to a bug, many of the extras got lost. Here we restore the external_reference=ONSHUB extra. ''' stats = StatsList() res = self.client.action( 'package_search', q='!external_reference:ONSHUB \"Source agency\"', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets missing extras: %i', res['count']) source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) match = source_agency_re.search(pkg['notes']) if not match: log.error( stats.add( 'Could not find "Source agency: " line after all', pkg['name'])) continue # Add the extra pkg['extras']['external_reference'] = 'ONSHUB' if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add( 'Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue log.info(stats.add('Added extra', pkg['name'])) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def correct_home_office_titles(self): '''Home Office edited their ONSHUB titles to be prefixed with "UK National Statistics Publication Hub: ". These cannot be added to by the ons_loader in the future because of this title change so remove the prefix. e.g. scientific_procedures_on_living_animals_great_britain ''' stats = StatsList() prefix = 'UK National Statistics Publication Hub: ' res = self.client.action('package_search', q='external_reference:ONSHUB \"%s\"' % prefix, sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count']) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) if not pkg['title'].startswith(prefix): log.error(stats.add('Prefix not there after all', pkg['name'])) continue # Remove the prefix pkg['title'] = pkg['title'][len(prefix):] if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add( 'Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue log.info(stats.add('Remove prefix', pkg['name'])) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def learn(options): '''Analyse datasets that are already categorise to find out which words associate with which theme. ''' from ckanext.dgu.lib.theme import Themes level = 1 freq_dists = {} fd_by_fraction = defaultdict(list) count = 0 for theme in Themes.instance().data: count += 1 if count == 30: break options.theme = theme freq_dist = get_freq_dist(options, level) print '%s: %r' % (theme, freq_dist) freq_dists[theme] = freq_dist if not len(freq_dist): continue max_freq = freq_dist[freq_dist.max()] freq_fraction_threshold = 0.0 for word, freq in freq_dist.items(): freq_fraction = float(freq) / max_freq if freq_fraction < freq_fraction_threshold: break fd_by_fraction[word].append((freq_fraction, theme, freq)) stats = StatsList() stats.report_value_limit = 1000 unique_words = defaultdict(list) # theme: [word, ...] for word, counts in fd_by_fraction.items(): if len(counts) == 1: print stats.add('unique', '%s %s' % (word, counts[0][1])) unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2])) continue sorted_counts = sorted(counts, key=lambda tup: -tup[0]) winning_margin = sorted_counts[0][0] - sorted_counts[1][0] print stats.add( 'margin %.1f' % winning_margin, '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1])) print 'Unique words:' for theme, words in unique_words.items(): print '%s: %s' % (theme, ' '.join(words)) print 'Summary:' print stats.report()
def fetch(cls, site_url_filter, since_datetime):
    '''Import ODI certificate badges from the ODI Atom feed into package
    extras ('odi-certificate'), for entries newer than since_datetime whose
    "about" URL matches site_url_filter.
    '''
    import ckan.model as model
    from running_stats import StatsList
    log = logging.getLogger(__name__)
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    import ckanext.certificates.client as client
    for entry in client.generate_entries(since=since_datetime):

        # We have to handle the case where the rel='about' might be
        # missing, if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            log.debug(stats.add(
                'Ignore - no rel="about" specifying the dataset',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not site_url_filter.search(about):
            log.debug(stats.add(
                'Ignore - "about" field does not reference this site',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            log.debug(stats.add(
                'Ignore - is "about" DGU but not a dataset',
                '%s "%s" %s' % (about, entry['about'], entry['id'])))
            continue

        pkg = cls._get_package_from_url(entry.get('about'))
        if not pkg:
            log.error(stats.add(
                'Unable to find the package',
                '%s "%s" %s %r' % (about, entry['about'], entry['id'],
                                   entry.get('about'))))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(entry['alternate'])
        if not badge_data:
            log.info(stats.add(
                'Error fetching badge data - skipped',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        badge_data['cert_title'] = entry.get('content', '')  # e.g.
        # 'Basic Level Certificate'

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            log.debug(stats.add('Certificate unchanged',
                                badge_data['certificate_url']))
        else:
            operation = 'updated' if 'odi-certificate' in pkg.extras \
                else 'added'
            model.repo.new_revision()
            # Store the serialization we just compared against, rather
            # than re-dumping badge_data a second time.
            pkg.extras['odi-certificate'] = badge_json
            log.debug(stats.add(
                'Certificate %s' % operation,
                '"%s" %s' % (badge_data['title'],
                             badge_data['certificate_url'])))
    model.Session.commit()
    log.info('Summary:\n' + stats.report())
def recategorize(options): from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME, SECONDARY_THEMES, Themes) stats = StatsList() stats.report_value_limit = 1000 if options.dataset: pkg = model.Package.get(options.dataset) assert pkg packages = [pkg] else: packages = get_packages(publisher=options.publisher, theme=None, uncategorized=options.uncategorized, limit=options.limit) # process the list of themes we are interested in setting on packages themes = Themes.instance() if options.theme: theme_filter = set(options.theme.split(',')) for theme in theme_filter: assert theme in themes.data, '"%s" not in %r' % ( theme, themes.data.keys()) else: theme_filter = themes.data themes_to_write = {} # pkg_name:themes for pkg in packages: print 'Dataset: %s' % pkg.name themes = categorize_package2(pkg) existing_theme = pkg.extras.get(PRIMARY_THEME) pkg_identity = '%s (%s)' % (pkg.name, existing_theme) if not themes: print stats.add('Cannot decide theme', pkg_identity) continue if themes[0]['name'] not in theme_filter: print stats.add('Not interested in theme', pkg_identity) continue if existing_theme == themes[0]['name']: print stats.add('Theme unchanged %s' % themes[0]['name'], pkg_identity) continue print stats.add('Recategorized to %s' % themes[0]['name'], pkg_identity) if options.write: themes_to_write[pkg.name] = themes print 'Recategorize summary:' print stats.report() if options.write: write_themes(themes_to_write)
def learn(options): '''Analyse datasets that are already categorise to find out which words associate with which theme. ''' from ckanext.dgu.lib.theme import Themes level = 1 freq_dists = {} fd_by_fraction = defaultdict(list) count = 0 for theme in Themes.instance().data: count += 1 if count == 30: break options.theme = theme freq_dist = get_freq_dist(options, level) print '%s: %r' % (theme, freq_dist) freq_dists[theme] = freq_dist if not len(freq_dist): continue max_freq = freq_dist[freq_dist.max()] freq_fraction_threshold = 0.0 for word, freq in freq_dist.items(): freq_fraction = float(freq)/max_freq if freq_fraction < freq_fraction_threshold: break fd_by_fraction[word].append((freq_fraction, theme, freq)) stats = StatsList() stats.report_value_limit = 1000 unique_words = defaultdict(list) # theme: [word, ...] for word, counts in fd_by_fraction.items(): if len(counts) == 1: print stats.add('unique', '%s %s' % (word, counts[0][1])) unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2])) continue sorted_counts = sorted(counts, key=lambda tup: -tup[0]) winning_margin = sorted_counts[0][0] - sorted_counts[1][0] print stats.add('margin %.1f' % winning_margin, '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1])) print 'Unique words:' for theme, words in unique_words.items(): print '%s: %s' % (theme, ' '.join(words)) print 'Summary:' print stats.report()
def recategorize(options): from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME, SECONDARY_THEMES, Themes) stats = StatsList() stats.report_value_limit = 1000 if options.dataset: pkg = model.Package.get(options.dataset) assert pkg packages = [pkg] else: packages = get_packages(publisher=options.publisher, theme=None, uncategorized=options.uncategorized, limit=options.limit) # process the list of themes we are interested in setting on packages themes = Themes.instance() if options.theme: theme_filter = set(options.theme.split(',')) for theme in theme_filter: assert theme in themes.data, '"%s" not in %r' % (theme, themes.data.keys()) else: theme_filter = themes.data themes_to_write = {} # pkg_name:themes for pkg in packages: print 'Dataset: %s' % pkg.name themes = categorize_package(pkg) existing_theme = pkg.extras.get(PRIMARY_THEME) pkg_identity = '%s (%s)' % (pkg.name, existing_theme) if not themes: print stats.add('Cannot decide theme', pkg_identity) continue if themes[0] not in theme_filter: print stats.add('Not interested in theme', pkg_identity) continue if existing_theme == themes[0]: print stats.add('Theme unchanged %s' % themes[0], pkg_identity) continue print stats.add('Recategorized to %s' % themes[0], pkg_identity) if options.write: themes_to_write[pkg.name] = themes print 'Recategorize summary:' print stats.report() if options.write: write_themes(themes_to_write)
def merge_duplicates(self): merge_stats = StatsList() onshub_packages_search_options = { 'external_reference': 'ONSHUB', 'state': 'active' } res = self.loader._package_search(onshub_packages_search_options) log.info('ONSHUB records: %i', res['count']) pkgs_already_merged = set() for pkg_ref in res['results']: pkg = self.loader._get_package(pkg_ref) if pkg['name'] in pkgs_already_merged: log.info(merge_stats.add('Already merged', pkg['name'])) continue if not self.loader._pkg_matches_search_options( pkg, onshub_packages_search_options): log.error( merge_stats.add('Did not match ONSHUB search after all', pkg['name'])) continue # look for duplicates dupe_search_options = { 'title': pkg['title'], 'groups': pkg['groups'][0] if pkg['groups'] else '', 'external_reference': 'ONSHUB', 'state': 'active' } res = self.loader._package_search(dupe_search_options) if not res['count']: log.error(merge_stats.add('Could not find itself', pkg['name'])) continue dupe_pkgs = [] for dupe_pkg_ref in res['results']: dupe_pkg = self.loader._get_package(dupe_pkg_ref) if dupe_pkg['name'] == pkg['name']: continue if not self.loader._pkg_matches_search_options( dupe_pkg, dupe_search_options): log.warn('Did not match duplicate search after all %s %s', pkg['name'], dupe_pkg['name']) continue dupe_pkgs.append(dupe_pkg) if dupe_pkgs: log.info('Found duplicates for %s: %r', pkg['name'], [pkg_['name'] for pkg_ in dupe_pkgs]) # Fix duplicates merge_stats.add( '%i duplicates found and merged' % len(dupe_pkgs), pkg['name']) for dupe_pkg in dupe_pkgs: pkgs_already_merged.add(dupe_pkg['name']) self.do_merge(pkg, dupe_pkgs) else: log.info(merge_stats.add('No duplicates', pkg['name'])) print merge_stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def reconcile_aliases_that_match_entities_exactly(): '''When adding entities using this tool, they might also currently be in the recon queue. In cases there the alias name matches exactly the entity name, link them up. (Ideally we'd just delete the alias from the recon queue, but there is no delete_alias API.) ''' stats = StatsList() nomen_data = NomenData() nk_dataset = nk_connect(nk_dataset_name) for alias in nomen_data.unmatched_aliases: try: entity_or_alias = nk_dataset.lookup_detailed(alias.name, readonly=True) except (nk_dataset.Invalid, nk_dataset.NoMatch): entity_or_alias = None if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity): try: nk_dataset.match(alias_id=alias.id, entity_id=entity_or_alias.id) except requests.exceptions.HTTPError, e: # Seem to get occasional 502s due to overloading print stats.add('Server error linking the alias to an entity: %s' % e, alias.name) continue print stats.add('Matched alias to an entity of the same name', alias.name) else: print stats.add('No matching entity', alias.name)
def reconcile_aliases_that_match_entities_exactly(): '''When adding entities using this tool, they might also currently be in the recon queue. In cases there the alias name matches exactly the entity name, link them up. (Ideally we'd just delete the alias from the recon queue, but there is no delete_alias API.) ''' stats = StatsList() nomen_data = NomenData() nk_dataset = nk_connect(nk_dataset_name) for alias in nomen_data.unmatched_aliases: try: entity_or_alias = nk_dataset.lookup_detailed(alias.name, readonly=True) except (nk_dataset.Invalid, nk_dataset.NoMatch): entity_or_alias = None if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity): try: nk_dataset.match(alias_id=alias.id, entity_id=entity_or_alias.id) except requests.exceptions.HTTPError, e: # Seem to get occasional 502s due to overloading print stats.add( 'Server error linking the alias to an entity: %s' % e, alias.name) continue print stats.add('Matched alias to an entity of the same name', alias.name) else: print stats.add('No matching entity', alias.name)
def correct_home_office_titles(self): '''Home Office edited their ONSHUB titles to be prefixed with "UK National Statistics Publication Hub: ". These cannot be added to by the ons_loader in the future because of this title change so remove the prefix. e.g. scientific_procedures_on_living_animals_great_britain ''' stats = StatsList() prefix = 'UK National Statistics Publication Hub: ' res = self.client.action('package_search', q='external_reference:ONSHUB \"%s\"' % prefix, sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count']) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) if not pkg['title'].startswith(prefix): log.error(stats.add('Prefix not there after all', pkg['name'])) continue # Remove the prefix pkg['title'] = pkg['title'][len(prefix):] if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue log.info(stats.add('Remove prefix', pkg['name'])) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def add_missing_publisher(self): stats = StatsList() res = self.client.action( 'package_search', q='external_reference:ONSHUB !groups:["" TO *]', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets missing publisher: %i', res['count']) source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) if pkg['groups']: log.error(stats.add('Package had a publisher', pkg['name'])) continue match = source_agency_re.search(pkg['notes']) if not match: log.error( stats.add('Could not match source agency', pkg['name'])) continue # Find equivalent publisher source_agency = match.groups()[0] publisher_name = OnsImporter._source_to_publisher_( source_agency, self.client) if not publisher_name: log.error( stats.add('Could not map source agency %s' % source_agency, pkg['name'])) continue pkg['groups'] = [publisher_name] if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add( 'Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue stats.add('Added publisher %s' % publisher_name, pkg['name']) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def add_missing_onshub_extra(self): '''Some ONSHUB datasets were edited manually and due to a bug, many of the extras got lost. Here we restore the external_reference=ONSHUB extra. ''' stats = StatsList() res = self.client.action('package_search', q='!external_reference:ONSHUB \"Source agency\"', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets missing extras: %i', res['count']) source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) match = source_agency_re.search(pkg['notes']) if not match: log.error(stats.add('Could not find "Source agency: " line after all', pkg['name'])) continue # Add the extra pkg['extras']['external_reference'] = 'ONSHUB' if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue log.info(stats.add('Added extra', pkg['name'])) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def merge_duplicates(self): merge_stats = StatsList() onshub_packages_search_options = {'external_reference': 'ONSHUB', 'state': 'active'} res = self.loader._package_search(onshub_packages_search_options) log.info('ONSHUB records: %i', res['count']) pkgs_already_merged = set() for pkg_ref in res['results']: pkg = self.loader._get_package(pkg_ref) if pkg['name'] in pkgs_already_merged: log.info(merge_stats.add('Already merged', pkg['name'])) continue if not self.loader._pkg_matches_search_options(pkg, onshub_packages_search_options): log.error(merge_stats.add('Did not match ONSHUB search after all', pkg['name'])) continue # look for duplicates dupe_search_options = {'title': pkg['title'], 'groups': pkg['groups'][0] if pkg['groups'] else '', 'external_reference': 'ONSHUB', 'state': 'active'} res = self.loader._package_search(dupe_search_options) if not res['count']: log.error(merge_stats.add('Could not find itself', pkg['name'])) continue dupe_pkgs = [] for dupe_pkg_ref in res['results']: dupe_pkg = self.loader._get_package(dupe_pkg_ref) if dupe_pkg['name'] == pkg['name']: continue if not self.loader._pkg_matches_search_options(dupe_pkg, dupe_search_options): log.warn('Did not match duplicate search after all %s %s', pkg['name'], dupe_pkg['name']) continue dupe_pkgs.append(dupe_pkg) if dupe_pkgs: log.info('Found duplicates for %s: %r', pkg['name'], [pkg_['name'] for pkg_ in dupe_pkgs]) # Fix duplicates merge_stats.add('%i duplicates found and merged' % len(dupe_pkgs), pkg['name']) for dupe_pkg in dupe_pkgs: pkgs_already_merged.add(dupe_pkg['name']) self.do_merge(pkg, dupe_pkgs) else: log.info(merge_stats.add('No duplicates', pkg['name'])) print merge_stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def canada_extras(): keys_changed = StatsCount() unmapped_keys = StatsList() licenses_changed = StatsCount() unmapped_licenses = StatsList() licenses = StatsList() key_mapping = { 'Level of Government': 'level_of_government', } license_mapping = { # CS: bad_spelling ignore 'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis', 'Crown Copyright': 'canada-crown', } from ckan import model rev = RevisionManager('Standardize extra keys', 10) for pkg in model.Session.query(model.Package): for old_key, new_key in key_mapping.items(): if pkg.extras.has_key(old_key): rev.before_change() pkg.extras[new_key] = pkg.extras[old_key] del pkg.extras[old_key] keys_changed.increment(old_key) rev.after_change() for license_key in ('License', 'License URL'): if pkg.extras.has_key(license_key): old_license = pkg.extras[license_key] if old_license in license_mapping: rev.before_change() pkg.license_id = unicode(license_mapping[old_license]) del pkg.extras[license_key] licenses_changed.increment(old_license) rev.after_change() else: unmapped_licenses.add(old_license, pkg.name) licenses.add(pkg.license_id, pkg.name) for key in pkg.extras.keys(): if key not in key_mapping.keys() and \ key not in key_mapping.values(): unmapped_keys.add(key, pkg.name) rev.finished() print 'Packages: %i' % model.Session.query(model.Package).count() print 'Changed keys:\n', keys_changed.report() print 'Unmapped keys:\n', unmapped_keys.report() print 'Changed licenses:\n', licenses_changed.report() print 'Unmapped licenses:\n', unmapped_licenses.report() print 'Licenses:\n', licenses.report()
def canada_extras(): keys_changed = StatsCount() unmapped_keys = StatsList() licenses_changed = StatsCount() unmapped_licenses = StatsList() licenses = StatsList() key_mapping = { 'Level of Government':'level_of_government', } license_mapping = { # CS: bad_spelling ignore 'http://geogratis.ca/geogratis/en/licence.jsp':'geogratis', 'Crown Copyright':'canada-crown', } from ckan import model rev = RevisionManager('Standardize extra keys', 10) for pkg in model.Session.query(model.Package): for old_key, new_key in key_mapping.items(): if pkg.extras.has_key(old_key): rev.before_change() pkg.extras[new_key] = pkg.extras[old_key] del pkg.extras[old_key] keys_changed.increment(old_key) rev.after_change() for license_key in ('License', 'License URL'): if pkg.extras.has_key(license_key): old_license = pkg.extras[license_key] if old_license in license_mapping: rev.before_change() pkg.license_id = unicode(license_mapping[old_license]) del pkg.extras[license_key] licenses_changed.increment(old_license) rev.after_change() else: unmapped_licenses.add(old_license, pkg.name) licenses.add(pkg.license_id, pkg.name) for key in pkg.extras.keys(): if key not in key_mapping.keys() and \ key not in key_mapping.values(): unmapped_keys.add(key, pkg.name) rev.finished() print 'Packages: %i' % model.Session.query(model.Package).count() print 'Changed keys:\n', keys_changed.report() print 'Unmapped keys:\n', unmapped_keys.report() print 'Changed licenses:\n', licenses_changed.report() print 'Unmapped licenses:\n', unmapped_licenses.report() print 'Licenses:\n', licenses.report()
def add_missing_publisher(self): stats = StatsList() res = self.client.action('package_search', q='external_reference:ONSHUB !groups:["" TO *]', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False) log.info('ONSHUB datasets missing publisher: %i', res['count']) source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE) for pkg in res['results']: # solr data_dict is not the correct sort of pkg dictionary so # get it via the API pkg = self.loader._get_package(pkg['name']) if pkg['groups']: log.error(stats.add('Package had a publisher', pkg['name'])) continue match = source_agency_re.search(pkg['notes']) if not match: log.error(stats.add('Could not match source agency', pkg['name'])) continue # Find equivalent publisher source_agency = match.groups()[0] publisher_name = OnsImporter._source_to_publisher_(source_agency, self.client) if not publisher_name: log.error(stats.add('Could not map source agency %s' % source_agency, pkg['name'])) continue pkg['groups'] = [publisher_name] if not self.dry_run: try: self.client.package_entity_put(pkg) except CkanApiError: log.error('Error (%s) adding publisher over API: %s' % \ (self.client.last_status, self.client.last_message)) stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name']) continue stats.add('Added publisher %s' % publisher_name, pkg['name']) print stats.report() if self.dry_run: print 'NB: No packages changed - dry run.'
def no_current_packages(options): pkgs = _get_packages('active', options) stats = StatsList() need_to_commit = False for pkg in pkgs: latest_pkg_rev = \ model.Session.query(model.PackageRevision) \ .filter_by(id=pkg.id) \ .order_by(model.PackageRevision.revision_timestamp.desc()) \ .first() # sometimes a revision_timestamp is null for some reason if latest_pkg_rev.revision_timestamp is None: # in which case, join them to the revision table and order by those # timestamps instead latest_pkg_rev = \ model.Session.query(model.PackageRevision) \ .filter_by(id=pkg.id) \ .join(model.Revision) \ .order_by(model.Revision.timestamp.desc()) \ .first() if not latest_pkg_rev.current: print stats.add('No current revision', pkg.name) if options.write: latest_pkg_rev.current = True need_to_commit = True else: stats.add('Ok', pkg.name) if latest_pkg_rev.revision_id != pkg.revision_id: print stats.add('Revision ID of package too old', pkg.name) if options.write: pkg.revision_id = latest_pkg_rev.revision_id need_to_commit = True print 'Summary', stats.report() if options.write and need_to_commit: model.repo.commit_and_remove() print 'Written' print
def fetch(cls, site_url_filter, since_datetime):
    """Import ODI certificate badges for datasets referenced by the ODI
    Atom feed, storing each badge JSON in the 'odi-certificate' extra."""
    import ckan.model as model
    from running_stats import StatsList
    log = logging.getLogger(__name__)
    stats = StatsList()

    # Use the generate_entries generator to get all of the entries from
    # the ODI Atom feed - it handles the feed's paging for us.
    import ckanext.certificates.client as client
    for entry in client.generate_entries(since=since_datetime):
        # We have to handle the case where the rel='about' might be
        # missing, if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        entry_desc = '%s "%s" %s' % (about, entry['title'], entry['id'])
        if not about:
            log.debug(stats.add(
                'Ignore - no rel="about" specifying the dataset',
                entry_desc))
            continue
        if not site_url_filter.search(about):
            log.debug(stats.add(
                'Ignore - "about" field does not reference this site',
                entry_desc))
            continue
        if not '/dataset/' in entry['about']:
            log.debug(stats.add(
                'Ignore - is "about" DGU but not a dataset',
                '%s "%s" %s' % (about, entry['about'], entry['id'])))
            continue

        pkg = cls._get_package_from_url(entry.get('about'))
        if not pkg:
            log.error(stats.add(
                'Unable to find the package',
                '%s "%s" %s %r' % (about, entry['about'], entry['id'],
                                   entry.get('about'))))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(entry['alternate'])
        if not badge_data:
            log.info(stats.add(
                'Error fetching badge data - skipped', entry_desc))
            continue
        badge_data['cert_title'] = entry.get('content', '')  # e.g. 'Basic Level Certificate'

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            log.debug(stats.add('Certificate unchanged',
                                badge_data['certificate_url']))
        else:
            # Decide added/updated before writing the extra
            operation = 'updated' if 'odi-certificate' in pkg.extras \
                else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = json.dumps(badge_data)
            log.debug(stats.add(
                'Certificate %s' % operation,
                '"%s" %s' % (badge_data['title'],
                             badge_data['certificate_url'])))
            model.Session.commit()
    log.info('Summary:\n' + stats.report())
def update_entities_from_dgu(publishers=None): dgu_client = ckan_client() if not publishers: # Get list of publishers from DGU publishers = dgu_client.action('organization_list') stats = StatsList() nk_dataset = nk_connect(nk_dataset_name) for publisher_name in publishers: publisher = dgu_client.action('organization_show', id=publisher_name) # Match each publisher with a Nomen entity try: entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True) except (nk_dataset.Invalid, nk_dataset.NoMatch): entity_or_alias = None data = {'dgu_name': publisher_name} if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity): # Matched an entity entity = entity_or_alias if entity.data.get('dgu_name') == publisher_name: # Matching ID, ensure Nomen still has the title as per DGU print stats.add('Matching ID. Title match: %s' % \ (entity.name == publisher['title']), publisher_name) elif 'dgu_name' in entity.data: print stats.add('Wrong ID - ignoring', publisher_name) elif entity.name == publisher['title']: nk_dataset.update_entity(entity.id, entity.name, data) print stats.add('Matching title, just added ID', publisher_name) else: # The title differs because of canonization? Hasn't happened yet. 
print stats.add('Title differs - ignoring', publisher_name) elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias): # Matched an alias alias_ = entity_or_alias if alias_.is_matched: entity = nk_dataset.get_entity(id=alias_.entity['id']) if entity.data.get('dgu_name'): print stats.add('Matched an alias for an entity which already has an ID - ignoring', publisher_name) else: nk_dataset.update_entity(entity.id, publisher['title'], data) # we can't delete the existing alias (that is now the same # as the entity) but we can create a new alias for the old # entity try: new_alias = nk_dataset.lookup(entity.name) except nk_dataset.NoMatch: nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id) print stats.add('Matched an alias for an entity - swapped them over', publisher_name) except nk_dataset.Invalid: # This is not expected, but still fine print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name) else: # This is not expected, but still fine print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name) else: new_entity = nk_dataset.add_entity(publisher['title'], data) nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id) print stats.add('Matched an alias without a matching entity - created the entity') else: # No match - create Nomen entity nk_dataset.add_entity(publisher['title'], data) print stats.add('No match - added to Nomen', publisher_name) print 'Summary' print stats.report()
def update_entities_from_dgu(publishers=None): dgu_client = ckan_client() if not publishers: # Get list of publishers from DGU publishers = dgu_client.action('organization_list') stats = StatsList() nk_dataset = nk_connect(nk_dataset_name) for publisher_name in publishers: publisher = dgu_client.action('organization_show', id=publisher_name) # Match each publisher with a Nomen entity try: entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True) except (nk_dataset.Invalid, nk_dataset.NoMatch): entity_or_alias = None data = {'dgu_name': publisher_name} if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity): # Matched an entity entity = entity_or_alias if entity.data.get('dgu_name') == publisher_name: # Matching ID, ensure Nomen still has the title as per DGU print stats.add('Matching ID. Title match: %s' % \ (entity.name == publisher['title']), publisher_name) elif 'dgu_name' in entity.data: print stats.add('Wrong ID - ignoring', publisher_name) elif entity.name == publisher['title']: nk_dataset.update_entity(entity.id, entity.name, data) print stats.add('Matching title, just added ID', publisher_name) else: # The title differs because of canonization? Hasn't happened yet. 
print stats.add('Title differs - ignoring', publisher_name) elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias): # Matched an alias alias_ = entity_or_alias if alias_.is_matched: entity = nk_dataset.get_entity(id=alias_.entity['id']) if entity.data.get('dgu_name'): print stats.add( 'Matched an alias for an entity which already has an ID - ignoring', publisher_name) else: nk_dataset.update_entity(entity.id, publisher['title'], data) # we can't delete the existing alias (that is now the same # as the entity) but we can create a new alias for the old # entity try: new_alias = nk_dataset.lookup(entity.name) except nk_dataset.NoMatch: nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id) print stats.add( 'Matched an alias for an entity - swapped them over', publisher_name) except nk_dataset.Invalid: # This is not expected, but still fine print stats.add( 'Matched an alias for an entity - overwrote the entity', publisher_name) else: # This is not expected, but still fine print stats.add( 'Matched an alias for an entity - overwrote the entity', publisher_name) else: new_entity = nk_dataset.add_entity(publisher['title'], data) nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id) print stats.add( 'Matched an alias without a matching entity - created the entity' ) else: # No match - create Nomen entity nk_dataset.add_entity(publisher['title'], data) print stats.add('No match - added to Nomen', publisher_name) print 'Summary' print stats.report()
def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True): nomen_data = NomenData() nk_dataset = nk_connect(nk_dataset_name) # Gather the list of entities & aliases from the file and command-line entities_or_aliases = [] def find_name(name, stats): if not name.strip(): print stats.add('blank', name) elif entities and name in nomen_data.entities_dict_by_name: entities_or_aliases.append( nomen_data.entities_dict_by_name[name]['entity']) print stats.add('Entity found', name) elif aliases and name in nomen_data.aliases_by_name: entities_or_aliases.append(nomen_data.aliases_by_name[name]) print stats.add('Alias found', name) else: print stats.add('Not found', name) if entity_or_alias_names: stats = StatsList() for name in entity_or_alias_names: find_name(name, stats) print 'Given names:' print stats.report() if filepath: if not os.path.exists(filepath): raise Exception('Filepath not found: %s' % filepath) with open(filepath, 'r') as f: stats = StatsList() for line in f: name = line.rstrip('\n\r') find_name(name, stats) #try: # entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True) #except nk_dataset.NoMatch: # print stats.add('Not found', publisher['title']) # continue #except nk_dataset.Invalid: # pass #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name) #entities_or_aliases.append(entity_or_alias) print 'File names:' print stats.report() # Do the action to each entity stats = StatsList() for entity_or_alias in entities_or_aliases: name = entity_or_alias.name if action == 'invalidate': if isinstance(entity_or_alias, nomenklatura.Entity): print stats.add('Cannot invalidate an Entity', name) continue alias = entity_or_alias if alias.is_invalid: print stats.add('Already invalid', name) continue try: nk_dataset.match(alias_id=alias.id, entity_id='INVALID') except requests.exceptions.HTTPError, e: # Seem to get occasional 502s due to overloading print stats.add('Server 
error: %s' % e, alias.name) continue print stats.add('Invalidated', name) else: raise NotImplemented
def command(self):
    """Sync ODI certificate badges from the Atom feed into each dataset's
    'odi-certificate' extra for datasets on this site."""
    # Load configuration
    self._load_config()
    # Initialise database access
    import ckan.model as model
    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)
    # Logging, post-config
    self.setup_logging()

    from pylons import config
    site_url = config.get('ckan.site_url')
    # Handling of sites that support www. but don't use it.
    full_site_url = site_url
    if not '//www.' in full_site_url:
        full_site_url = full_site_url.replace('//', '//www.')

    from running_stats import StatsList
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    for entry in client.generate_entries(self.log):

        # We have to handle the case where the rel='about' might be
        # missing, if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            self.log.debug(stats.add(
                'Ignore - no rel="about" specifying the dataset',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not about.startswith(site_url) and \
                not about.startswith(full_site_url):
            self.log.debug(stats.add(
                'Ignore - "about" field does not reference this site',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            self.log.debug(stats.add(
                'Ignore - is "about" DGU but not a dataset',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        pkg = self._get_package_from_url(entry.get('about'))
        if not pkg:
            self.log.error(stats.add(
                'Unable to find the package',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(self.log, entry['alternate'])
        if not badge_data:
            # ROBUSTNESS FIX: guard a failed badge fetch (the classmethod
            # importer variant does this) instead of crashing on
            # badge_data['cert_title'] below.
            self.log.info(stats.add(
                'Error fetching badge data - skipped',
                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        badge_data['cert_title'] = entry.get('content', '')
        badge_json = json.dumps(badge_data)

        if pkg.extras.get('odi-certificate') == badge_json:
            self.log.debug(stats.add('Certificate unchanged',
                                     badge_data['certificate_url']))
        else:
            # BUG FIX: decide added/updated *before* writing the extra -
            # previously the extra was written first, so this always
            # reported 'updated'.
            operation = 'updated' if 'odi-certificate' in pkg.extras \
                else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = badge_json
            self.log.debug(stats.add(
                'Certificate %s' % operation,
                '"%s" %s' % (badge_data['title'],
                             badge_data['certificate_url'])))
            model.Session.commit()

    self.log.info('Summary:\n' + stats.report())
else: users_without_email.append(user) if not users_with_email: if editors: warning += 'There is an editor(s) but not email addresses for them.' else: warning += 'There are no editors.' else: warning = None emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user)) \ for user in users_with_email]) names_without_email = ', '.join([user.fullname or user.name\ for user in users_without_email]) if warning: print pub_stats.add( '%s without emails: %s' % ('PCT' if is_pct else 'Trust', warning), pub.title) else: print pub_stats.add('%s with emails' % 'PCT' if is_pct else 'Trust', pub.title) row = ('PCT' if is_pct else '', pub.title, pub.name, emails, warning) if is_pct: pct_rows.append(row) else: non_pct_rows.append(row) print pub_stats.report() filename = 'nhs_emails.csv' with open(filename, 'wb') as csvfile: csv_writer = csv.writer(csvfile,
def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True): nomen_data = NomenData() nk_dataset = nk_connect(nk_dataset_name) # Gather the list of entities & aliases from the file and command-line entities_or_aliases = [] def find_name(name, stats): if not name.strip(): print stats.add('blank', name) elif entities and name in nomen_data.entities_dict_by_name: entities_or_aliases.append(nomen_data.entities_dict_by_name[name]['entity']) print stats.add('Entity found', name) elif aliases and name in nomen_data.aliases_by_name: entities_or_aliases.append(nomen_data.aliases_by_name[name]) print stats.add('Alias found', name) else: print stats.add('Not found', name) if entity_or_alias_names: stats = StatsList() for name in entity_or_alias_names: find_name(name, stats) print 'Given names:' print stats.report() if filepath: if not os.path.exists(filepath): raise Exception('Filepath not found: %s' % filepath) with open(filepath, 'r') as f: stats = StatsList() for line in f: name = line.rstrip('\n\r') find_name(name, stats) #try: # entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True) #except nk_dataset.NoMatch: # print stats.add('Not found', publisher['title']) # continue #except nk_dataset.Invalid: # pass #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name) #entities_or_aliases.append(entity_or_alias) print 'File names:' print stats.report() # Do the action to each entity stats = StatsList() for entity_or_alias in entities_or_aliases: name = entity_or_alias.name if action=='invalidate': if isinstance(entity_or_alias, nomenklatura.Entity): print stats.add('Cannot invalidate an Entity', name) continue alias = entity_or_alias if alias.is_invalid: print stats.add('Already invalid', name) continue try: nk_dataset.match(alias_id=alias.id, entity_id='INVALID') except requests.exceptions.HTTPError, e: # Seem to get occasional 502s due to overloading print stats.add('Server error: 
%s' % e, alias.name) continue print stats.add('Invalidated', name) else: raise NotImplemented
users_with_email.append(user) else: users_without_email.append(user) if not users_with_email: if editors: warning += 'There is an editor(s) but not email addresses for them.' else: warning += 'There are no editors.' else: warning = None emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user)) \ for user in users_with_email]) names_without_email = ', '.join([user.fullname or user.name\ for user in users_without_email]) if warning: print pub_stats.add('%s without emails: %s' % ('PCT' if is_pct else 'Trust', warning), pub.title) else: print pub_stats.add('%s with emails' % 'PCT' if is_pct else 'Trust', pub.title) row = ('PCT' if is_pct else '', pub.title, pub.name, emails, warning) if is_pct: pct_rows.append(row) else: non_pct_rows.append(row) print pub_stats.report() filename = 'nhs_emails.csv' with open(filename, 'wb') as csvfile: csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
def command(dry_run=False):
    """Migrate legacy package fields into their canonical extras, move
    pkg.url into a 'documentation' resource, and delete obsolete extras.

    With dry_run=True nothing is written, but stats are still reported.
    """
    from ckan import model
    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    global_log.info('Tidying package fields')
    stats = StatsList()
    if not dry_run:
        rev = model.repo.new_revision()
        rev.message = 'Package fields migration'
    for pkg in model.Session.query(model.Package) \
            .filter_by(state='active') \
            .order_by(model.Package.name):
        # field map: merge each group of legacy fields into its
        # destination extra
        for existing_fields, destination_field in field_map.items():
            value = pkg.extras.get(destination_field)
            if value:
                # destination already populated - leave it alone
                continue
            for existing_field in existing_fields:
                if hasattr(pkg, existing_field):
                    value = getattr(pkg, existing_field)
                else:
                    value = pkg.extras.get(existing_field)
                if value:
                    value = value.strip()
                if value:
                    # take the first hit
                    # BUG FIX: was `continue`, which kept scanning and let
                    # later (possibly empty) fields overwrite the value
                    break
            if not dry_run:
                pkg.extras[destination_field] = value or ''
                # delete existing field values
                for existing_field in existing_fields:
                    if hasattr(pkg, existing_field):
                        setattr(pkg, existing_field, '')
                    elif existing_field in pkg.extras:
                        del pkg.extras[existing_field]
            if value:
                stats.add('Merged to field "%s"' % destination_field,
                          pkg.name)
            else:
                stats.add('Not merged to field "%s"' % destination_field,
                          pkg.name)

        # move url to additional resource
        if pkg.url:
            # BUG FIX: dropped a duplicate
            # stats.add('Url moved to additional resource', value) here -
            # it double-counted the move and recorded the merged field
            # value instead of the package name.
            if not dry_run:
                if not pkg.resource_groups:
                    res_group = model.ResourceGroup(label="default")
                    pkg.resource_groups.append(res_group)
                res_group = pkg.resource_groups[0]
                res = model.Resource(format='HTML',
                                     resource_type='documentation',
                                     url=pkg.url,
                                     description='Web page about the data')
                res_group.resources.append(res)
                model.Session.add(res)
                #pkg.url = ''
            stats.add('URL moved to additional resource', pkg.name)
        else:
            stats.add('No URL to move to additional resource', pkg.name)

        # delete fields
        for field in delete_fields:
            if field in pkg.extras:
                if not dry_run:
                    del pkg.extras[field]
                stats.add('Deleted field "%s"' % field, pkg.name)
            else:
                stats.add('No field to delete "%s"' % field, pkg.name)

    if not dry_run:
        model.repo.commit_and_remove()
    global_log.info(stats.report())
def command(self):
    """Sync ODI certificate badges from the Atom feed into each dataset's
    'odi-certificate' extra for datasets on this site."""
    # Load configuration
    self._load_config()
    # Initialise database access
    import ckan.model as model
    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)
    # Logging, post-config
    self.setup_logging()

    from pylons import config
    site_url = config.get('ckan.site_url')
    # Handling of sites that support www. but don't use it.
    full_site_url = site_url
    if not '//www.' in full_site_url:
        full_site_url = full_site_url.replace('//', '//www.')

    from running_stats import StatsList
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    for entry in client.generate_entries(self.log):

        # We have to handle the case where the rel='about' might be
        # missing, if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            self.log.debug(
                stats.add(
                    'Ignore - no rel="about" specifying the dataset',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not about.startswith(site_url) and not about.startswith(
                full_site_url):
            self.log.debug(
                stats.add(
                    'Ignore - "about" field does not reference this site',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            self.log.debug(
                stats.add(
                    'Ignore - is "about" DGU but not a dataset',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        pkg = self._get_package_from_url(entry.get('about'))
        if not pkg:
            self.log.error(
                stats.add(
                    'Unable to find the package',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(self.log, entry['alternate'])
        if not badge_data:
            # ROBUSTNESS FIX: guard a failed badge fetch (the classmethod
            # importer variant does this) instead of crashing on
            # badge_data['cert_title'] below.
            self.log.info(
                stats.add(
                    'Error fetching badge data - skipped',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        badge_data['cert_title'] = entry.get('content', '')
        badge_json = json.dumps(badge_data)

        if pkg.extras.get('odi-certificate') == badge_json:
            self.log.debug(
                stats.add('Certificate unchanged',
                          badge_data['certificate_url']))
        else:
            # BUG FIX: decide added/updated *before* writing the extra -
            # previously the extra was written first, so this always
            # reported 'updated'.
            operation = 'updated' if 'odi-certificate' in pkg.extras \
                else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = badge_json
            self.log.debug(
                stats.add(
                    'Certificate %s' % operation,
                    '"%s" %s' % (badge_data['title'],
                                 badge_data['certificate_url'])))
            model.Session.commit()

    self.log.info('Summary:\n' + stats.report())