def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME
    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        if test:
            theme = True
        else:
            theme = False
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name: themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)

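# All of these scripts lean on StatsList/StatsCount from the running_stats
# module, which is not included in this section. The class below is only a
# minimal sketch of the interface as it is used here (add() records a value
# under a category and returns a printable line, report() summarises,
# report_value_limit caps how many values are listed) - it is an assumption
# inferred from the call sites, not the real implementation.
from collections import defaultdict

class StatsList(object):
    def __init__(self):
        self.report_value_limit = 150  # assumed default; the real value may differ
        self._categories = defaultdict(list)

    def add(self, category, value):
        # Record the value and return a printable line, which is why callers
        # can do `print stats.add(...)` or `log.info(stats.add(...))`.
        self._categories[category].append(value)
        return '%s: %s' % (category, value)

    def report(self):
        lines = []
        for category, values in sorted(self._categories.items()):
            shown = values[:self.report_value_limit]
            lines.append('%s: %i %r' % (category, len(values), shown))
        return '\n'.join(lines)

# Usage mirroring the functions in this section:
#   stats = StatsList()
#   print stats.add('Theme unchanged', 'some-dataset')
#   print stats.report()
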
def set_initial_value(self):
    log = global_log
    stats = StatsList()
    from ckan import model
    import ckan.plugins as p
    from ckan.logic import ActionError
    from ckanext.dgu.lib.helpers import upsert_extra
    site_user = p.toolkit.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    c = {'model': model, 'user': site_user['name']}
    packages = p.toolkit.get_action('package_list')(c, data_dict={})
    log.info('Processing %d packages', len(packages))
    for pkg_name in packages:
        pkg = model.Package.by_name(pkg_name)
        last_mod = self.determine_last_major_modification(pkg).isoformat()
        log.info('%s: %s %s', pkg_name,
                 pkg.extras.get('last_major_modification'), last_mod)
        if pkg.extras.get('last_major_modification') != last_mod:
            log.info(stats.add('Adding modification date', pkg.name))
            model.repo.new_revision()
            pkg.extras['last_major_modification'] = last_mod
            model.repo.commit_and_remove()
        else:
            log.info(stats.add('No change needed', pkg.name))
    print stats.report()

def merge_duplicates(self):
    merge_stats = StatsList()
    onshub_packages_search_options = {'external_reference': 'ONSHUB',
                                      'state': 'active'}
    res = self.loader._package_search(onshub_packages_search_options)
    log.info('ONSHUB records: %i', res['count'])
    pkgs_already_merged = set()
    for pkg_ref in res['results']:
        pkg = self.loader._get_package(pkg_ref)
        if pkg['name'] in pkgs_already_merged:
            log.info(merge_stats.add('Already merged', pkg['name']))
            continue
        if not self.loader._pkg_matches_search_options(
                pkg, onshub_packages_search_options):
            log.error(merge_stats.add('Did not match ONSHUB search after all',
                                      pkg['name']))
            continue
        # look for duplicates
        dupe_search_options = {'title': pkg['title'],
                               'groups': pkg['groups'][0] if pkg['groups'] else '',
                               'external_reference': 'ONSHUB',
                               'state': 'active'}
        res = self.loader._package_search(dupe_search_options)
        if not res['count']:
            log.error(merge_stats.add('Could not find itself', pkg['name']))
            continue
        dupe_pkgs = []
        for dupe_pkg_ref in res['results']:
            dupe_pkg = self.loader._get_package(dupe_pkg_ref)
            if dupe_pkg['name'] == pkg['name']:
                continue
            if not self.loader._pkg_matches_search_options(dupe_pkg,
                                                           dupe_search_options):
                log.warn('Did not match duplicate search after all %s %s',
                         pkg['name'], dupe_pkg['name'])
                continue
            dupe_pkgs.append(dupe_pkg)
        if dupe_pkgs:
            log.info('Found duplicates for %s: %r',
                     pkg['name'], [pkg_['name'] for pkg_ in dupe_pkgs])
            # Fix duplicates
            merge_stats.add('%i duplicates found and merged' % len(dupe_pkgs),
                            pkg['name'])
            for dupe_pkg in dupe_pkgs:
                pkgs_already_merged.add(dupe_pkg['name'])
            self.do_merge(pkg, dupe_pkgs)
        else:
            log.info(merge_stats.add('No duplicates', pkg['name']))
    print merge_stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'

def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)
    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name: themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package2(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0]['name'] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]['name']:
            print stats.add('Theme unchanged %s' % themes[0]['name'], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0]['name'], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)

def add_missing_publisher(self):
    stats = StatsList()
    res = self.client.action('package_search',
                             q='external_reference:ONSHUB !groups:["" TO *]',
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json',
                             rows=100,
                             escape_q=False)
    log.info('ONSHUB datasets missing publisher: %i', res['count'])
    source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        if pkg['groups']:
            log.error(stats.add('Package had a publisher', pkg['name']))
            continue
        match = source_agency_re.search(pkg['notes'])
        if not match:
            log.error(stats.add('Could not match source agency', pkg['name']))
            continue
        # Find equivalent publisher
        source_agency = match.groups()[0]
        publisher_name = OnsImporter._source_to_publisher_(source_agency,
                                                           self.client)
        if not publisher_name:
            log.error(stats.add('Could not map source agency %s' % source_agency,
                                pkg['name']))
            continue
        pkg['groups'] = [publisher_name]
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        stats.add('Added publisher %s' % publisher_name, pkg['name'])
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'

def command(dry_run=False):
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    from paste.registry import Registry
    from pylons import translator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map
    for format_ in res_type_map.keys():
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))
    for res in res_query:
        canonised_fmt = canonise(res.format or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(res.format)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (res.format or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, res.format)
        else:
            stats.add('No change', res.format)

    if not dry_run:
        model.repo.commit_and_remove()
    log.info('Stats report: %r', stats.report())
    print stats.report()
    log.info('Warnings (%i): %r', len(warnings), warnings)

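# canonise, tidy, res_type_map, warnings and log above are module-level names
# in the original script and are not shown in this section. The two helpers
# below are plausible, purely illustrative stand-ins (assumptions, not the
# real code) so the loop above can be read end to end.
def tidy(fmt):
    # Strip whitespace and a leading dot, e.g. '.csv ' -> 'csv'
    return (fmt or '').strip().lstrip('.')

def canonise(fmt):
    # Lower-case the tidied format so lookups in res_type_map are case-insensitive
    return tidy(fmt).lower()
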
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)
    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name: themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)

def add_missing_onshub_extra(self):
    '''Some ONSHUB datasets were edited manually and due to a bug, many of
    the extras got lost. Here we restore the external_reference=ONSHUB extra.
    '''
    stats = StatsList()
    res = self.client.action('package_search',
                             q='!external_reference:ONSHUB \"Source agency\"',
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json',
                             rows=100,
                             escape_q=False)
    log.info('ONSHUB datasets missing extras: %i', res['count'])
    source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        match = source_agency_re.search(pkg['notes'])
        if not match:
            log.error(stats.add('Could not find "Source agency: " line after all',
                                pkg['name']))
            continue
        # Add the extra
        pkg['extras']['external_reference'] = 'ONSHUB'
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        log.info(stats.add('Added extra', pkg['name']))
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'

def undelete(options):
    resources = _get_resources('deleted', options)
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'current_revision_fixer2'
    need_to_commit = False
    for res in resources:
        # When viewing an old revision of the dataset, there is one where the
        # resources are not deleted but they don't show up. This is seen where
        # resource_revision has an expired_timestamp with no corresponding
        # revision_timestamp - i.e. a gap between them (and it is not 9999-12-31).
        # e.g. select revision_timestamp,expired_timestamp,current
        #        from resource_revision
        #       where id='373bb814-7a49-4f53-8a0e-762002b2529c'
        #    order by revision_timestamp;
        #      revision_timestamp      |     expired_timestamp      | current
        # ----------------------------+----------------------------+---------
        #  2013-06-19 00:50:28.880058 | 2014-01-18 01:03:47.500041 | f
        #  2014-01-18 01:03:47.500041 | 2014-01-18 01:03:48.296204 | f
        #  2014-01-18 01:03:50.612196 | 9999-12-31 00:00:00        | t
        # Clearly there is a gap from the 2nd to the 3rd, indicating the problem.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by('revision_timestamp').all()
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        if res_revs[-2].expired_timestamp == res_revs[-1].revision_timestamp:
            add_stat('Ok', res, stats)
            continue
        print add_stat('Timestamp gap', res, stats)
        if options.write:
            res.state = 'active'
            need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'

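# add_stat is used by several of these fixers (undelete, refix,
# no_current_resources, migrate, wms_revisions) but is not defined in this
# section. This is a minimal sketch of what it is assumed to do - record the
# outcome against the resource and return the printable line - inferred from
# its call sites, not the original definition.
def add_stat(outcome, res, stats, extra_info=None):
    # Identify the resource by a short id plus its URL for readability,
    # append any extra detail, and delegate to StatsList.add, which returns
    # a printable string.
    value = '%s %s' % (res.id[:4], res.url)
    if extra_info:
        value += ' %s' % extra_info
    return stats.add(outcome, value)
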
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government': 'level_of_government',
    }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis',
        'Crown Copyright': 'canada-crown',
    }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()

def fetch(cls, site_url_filter, since_datetime):
    import ckan.model as model
    from running_stats import StatsList
    log = logging.getLogger(__name__)
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    import ckanext.certificates.client as client
    for entry in client.generate_entries(since=since_datetime):

        # We have to handle the case where the rel='about' might be
        # missing, if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not site_url_filter.search(about):
            log.debug(stats.add('Ignore - "about" field does not reference this site',
                                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                '%s "%s" %s' % (about, entry['about'], entry['id'])))
            continue

        pkg = cls._get_package_from_url(entry.get('about'))
        if not pkg:
            log.error(stats.add('Unable to find the package',
                                '%s "%s" %s %r' % (about, entry['about'],
                                                   entry['id'], entry.get('about'))))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(entry['alternate'])
        if not badge_data:
            log.info(stats.add('Error fetching badge data - skipped',
                               '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        badge_data['cert_title'] = entry.get('content', '')  # e.g. 'Basic Level Certificate'

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            log.debug(stats.add('Certificate unchanged',
                                badge_data['certificate_url']))
        else:
            operation = 'updated' if 'odi-certificate' in pkg.extras \
                else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = json.dumps(badge_data)
            log.debug(stats.add('Certificate %s' % operation,
                                '"%s" %s' % (badge_data['title'],
                                             badge_data['certificate_url'])))
        model.Session.commit()

    log.info('Summary:\n' + stats.report())

def correct_home_office_titles(self):
    '''Home Office edited their ONSHUB titles to be prefixed with
    "UK National Statistics Publication Hub: ". These cannot be added to by
    the ons_loader in the future because of this title change so remove
    the prefix.
    e.g. scientific_procedures_on_living_animals_great_britain
    '''
    stats = StatsList()
    prefix = 'UK National Statistics Publication Hub: '
    res = self.client.action('package_search',
                             q='external_reference:ONSHUB \"%s\"' % prefix,
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json',
                             rows=100,
                             escape_q=False)
    log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        if not pkg['title'].startswith(prefix):
            log.error(stats.add('Prefix not there after all', pkg['name']))
            continue
        # Remove the prefix
        pkg['title'] = pkg['title'][len(prefix):]
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        log.info(stats.add('Remove prefix', pkg['name']))
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'

def learn(options):
    '''Analyse datasets that are already categorised to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add('margin %.1f' % winning_margin,
                        '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()

def refix(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        # The old uncommit command would set the wrong resource_revision to
        # be current.
        # e.g. select revision_timestamp,expired_timestamp,current
        #        from resource_revision
        #       where id='b2972b35-b6ae-4096-b8cc-40dab3927a71'
        #    order by revision_timestamp;
        #      revision_timestamp      |     expired_timestamp      | current
        # ----------------------------+----------------------------+---------
        #  2013-04-13 01:47:30.18897  | 2013-06-18 19:01:45.910899 | f
        #  2013-06-18 19:01:45.910899 | 2014-01-18 08:55:41.443349 | t
        #  2014-01-18 08:55:41.443349 | 2014-01-18 08:55:41.566383 | f
        # Clearly only the latest should be current.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by('revision_timestamp').all()
        fix_needed = False
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        for res_rev in res_revs[:-1]:
            if res_rev.current:
                print add_stat('Early revision is current', res, stats)
                fix_needed = True
                if options.write:
                    res_rev.current = False
                    need_to_commit = True
        if not res_revs[-1].current:
            print add_stat('Last revision is not current', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].current = True
                need_to_commit = True
        if res_revs[-1].expired_timestamp != END_OF_TIME:
            print add_stat('Last revision is not 9999', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].expired_timestamp = END_OF_TIME
                need_to_commit = True
        if not fix_needed:
            add_stat('Ok', res, stats)
            continue
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'

def no_current_packages(options):
    pkgs = _get_packages('active', options)
    stats = StatsList()
    need_to_commit = False
    for pkg in pkgs:
        latest_pkg_rev = \
            model.Session.query(model.PackageRevision) \
                 .filter_by(id=pkg.id) \
                 .order_by(model.PackageRevision.revision_timestamp.desc()) \
                 .first()
        # sometimes a revision_timestamp is null for some reason
        if latest_pkg_rev.revision_timestamp is None:
            # in which case, join them to the revision table and order by
            # those timestamps instead
            latest_pkg_rev = \
                model.Session.query(model.PackageRevision) \
                     .filter_by(id=pkg.id) \
                     .join(model.Revision) \
                     .order_by(model.Revision.timestamp.desc()) \
                     .first()
        if not latest_pkg_rev.current:
            print stats.add('No current revision', pkg.name)
            if options.write:
                latest_pkg_rev.current = True
                need_to_commit = True
        else:
            stats.add('Ok', pkg.name)
        if latest_pkg_rev.revision_id != pkg.revision_id:
            print stats.add('Revision ID of package too old', pkg.name)
            if options.write:
                pkg.revision_id = latest_pkg_rev.revision_id
                need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    print

def no_current_resources(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        latest_res_rev = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp.desc()) \
            .first()
        if not latest_res_rev.current:
            print add_stat('No current revision', res, stats)
            if options.write:
                latest_res_rev.current = True
                need_to_commit = True
        else:
            add_stat('Ok', res, stats)
        if latest_res_rev.revision_id != res.revision_id:
            print add_stat('Revision ID of resource too old', res, stats)
            if options.write:
                res.revision_id = latest_res_rev.revision_id
                need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'

def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')
    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)
        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' %
                                (entity.name == publisher['title']),
                                publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring',
                                    publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over',
                                        publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity',
                                        publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity',
                                        publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - created the entity',
                                publisher_name)
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()

def command(self):
    # Load configuration
    self._load_config()

    # Initialise database access
    import ckan.model as model
    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)

    # Logging, post-config
    self.setup_logging()

    from pylons import config
    site_url = config.get('ckan.site_url')

    # Handling of sites that support www. but don't use it.
    full_site_url = site_url
    if not '//www.' in full_site_url:
        full_site_url = full_site_url.replace('//', '//www.')

    from running_stats import StatsList
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    for entry in client.generate_entries(self.log):

        # We have to handle the case where the rel='about' might be missing,
        # if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not about.startswith(site_url) and not about.startswith(full_site_url):
            self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        pkg = self._get_package_from_url(entry.get('about'))
        if not pkg:
            self.log.error(stats.add('Unable to find the package',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(self.log, entry['alternate'])
        badge_data['cert_title'] = entry.get('content', '')

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            self.log.debug(stats.add('Certificate unchanged',
                                     badge_data['certificate_url']))
        else:
            # Decide whether this is an add or an update before the extra is
            # written, otherwise it always reports 'updated'
            operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = json.dumps(badge_data)
            self.log.debug(stats.add('Certificate %s' % operation,
                                     '"%s" %s' % (badge_data['title'],
                                                  badge_data['certificate_url'])))
        model.Session.commit()

    self.log.info('Summary:\n' + stats.report())

                warning += 'There is an editor(s) but no email addresses for them.'
            else:
                warning += 'There are no editors.'
        else:
            warning = None
        emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user))
                            for user in users_with_email])
        names_without_email = ', '.join([user.fullname or user.name
                                         for user in users_without_email])
        if warning:
            print pub_stats.add('%s without emails: %s' % ('PCT' if is_pct else 'Trust', warning),
                                pub.title)
        else:
            print pub_stats.add('%s with emails' % ('PCT' if is_pct else 'Trust'), pub.title)
        row = ('PCT' if is_pct else '', pub.title, pub.name, emails, warning)
        if is_pct:
            pct_rows.append(row)
        else:
            non_pct_rows.append(row)
    print pub_stats.report()

    filename = 'nhs_emails.csv'
    with open(filename, 'wb') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['PCT?', 'Publisher title', 'Publisher name',
                             'Emails', 'Warnings'])
        for row in pct_rows + non_pct_rows:
            csv_writer.writerow(row)
    print filename

def command(dry_run=False):
    from ckan import model

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    from paste.registry import Registry
    from pylons import translator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    global_log.info('Tidying package fields')

    stats = StatsList()

    if not dry_run:
        rev = model.repo.new_revision()
        rev.message = 'Package fields migration'

    for pkg in model.Session.query(model.Package) \
                   .filter_by(state='active') \
                   .order_by(model.Package.name):
        # field map
        for existing_fields, destination_field in field_map.items():
            value = pkg.extras.get(destination_field)
            if value:
                continue
            for existing_field in existing_fields:
                if hasattr(pkg, existing_field):
                    value = getattr(pkg, existing_field)
                else:
                    value = pkg.extras.get(existing_field)
                if value:
                    value = value.strip()
                    if value:
                        # take the first hit
                        break
            if not dry_run:
                pkg.extras[destination_field] = value or ''
                # delete existing field values
                for existing_field in existing_fields:
                    if hasattr(pkg, existing_field):
                        setattr(pkg, existing_field, '')
                    elif existing_field in pkg.extras:
                        del pkg.extras[existing_field]
            if value:
                stats.add('Merged to field "%s"' % destination_field, pkg.name)
            else:
                stats.add('Not merged to field "%s"' % destination_field, pkg.name)

        # move url to additional resource
        if pkg.url:
            stats.add('Url moved to additional resource', value)
            if not dry_run:
                if not pkg.resource_groups:
                    res_group = model.ResourceGroup(label="default")
                    pkg.resource_groups.append(res_group)
                res_group = pkg.resource_groups[0]
                res = model.Resource(format='HTML',
                                     resource_type='documentation',
                                     url=pkg.url,
                                     description='Web page about the data')
                res_group.resources.append(res)
                model.Session.add(res)
                #pkg.url = ''
            stats.add('URL moved to additional resource', pkg.name)
        else:
            stats.add('No URL to move to additional resource', pkg.name)

        # delete fields
        for field in delete_fields:
            if field in pkg.extras:
                if not dry_run:
                    del pkg.extras[field]
                stats.add('Deleted field "%s"' % field, pkg.name)
            else:
                stats.add('No field to delete "%s"' % field, pkg.name)

    if not dry_run:
        model.repo.commit_and_remove()
    global_log.info(stats.report())

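# field_map and delete_fields are module-level in the original migration and
# are not shown here. The loop above expects field_map keys to be tuples of
# source field names mapping to a destination extra. The values below are
# hypothetical placeholders purely to illustrate the shape - the real
# mappings list different fields.
field_map = {
    ('taxonomy_url',): 'taxonomy_url',
    ('department', 'agency'): 'published_by',
}
delete_fields = ['import_source']
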
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus) \
            .filter_by(entity_id=res.id) \
            .filter_by(task_type='archiver') \
            .filter_by(key='status') \
            .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp) \
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(fields['updated'] or START_OF_TIME,
                                               fields['first_failure'] or START_OF_TIME,
                                               fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'

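# date_str_to_datetime_or_none, END_OF_TIME and START_OF_TIME are module-level
# helpers in the original migration script. The versions below are assumed,
# illustrative stand-ins based on how they are used above, not the real code.
import datetime

END_OF_TIME = datetime.datetime(9999, 12, 31)
START_OF_TIME = datetime.datetime(1, 1, 1)

def date_str_to_datetime_or_none(date_str):
    # TaskStatus error fields may hold an ISO-ish timestamp, an empty string
    # or None; only parse when there is something to parse.
    if date_str:
        return datetime.datetime.strptime(date_str.split('.')[0],
                                          '%Y-%m-%dT%H:%M:%S')
    return None
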
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions
    need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike('%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
                bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk',
                                                        'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue

        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print ' Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000
            # strings to append
            sql = ['''BEGIN;
ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                           % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision
                # if it is only connected to this one.
                if model.Session.query(model.ResourceRevision) \
                        .filter_by(revision_id=res_rev.revision_id) \
                        .count() == 1 and \
                        model.Session.query(model.PackageRevision) \
                        .filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n"
                               % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n"
                       % (latest_good_res_rev.revision_id, res.id))
            sql.append('''
ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
COMMIT;''')
            print ' sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # chunk revisions in chunks to cope when there are so many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
                   Bar(), ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print ' Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id) \
            .update({'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print ' ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'

def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
    needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id == options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id == pkg.id) \
                         .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None)) \
                     .join(model.Package, Archival.package_id == model.Package.id) \
                     .filter(model.Package.state == 'active') \
                     .join(model.Resource, Archival.resource_id == model.Resource.id) \
                     .filter(model.Resource.state == 'active') \
                     .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org == org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:
        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore'))

        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(archival.url_redirected_to):
            stats_add('Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'

def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')
    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)
        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None
        data = {'dgu_name': publisher_name}

        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' %
                                (entity.name == publisher['title']),
                                publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)

        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which '
                                    'already has an ID - ignoring',
                                    publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - '
                                        'swapped them over', publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - '
                                        'overwrote the entity', publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - '
                                        'overwrote the entity', publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - '
                                'created the entity', publisher_name)

        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)

    print 'Summary'
    print stats.report()
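# --- Hypothetical usage (not part of the original script) ---
# Sync a single publisher rather than the whole organization_list; the
# publisher name below is illustrative only.
#
# update_entities_from_dgu(publishers=['cabinet-office'])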
def command(self):
    # Load configuration
    self._load_config()
    # Initialise database access
    import ckan.model as model
    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)
    # Logging, post-config
    self.setup_logging()

    from pylons import config
    site_url = config.get('ckan.site_url')
    # Handling of sites that support www. but don't use it.
    full_site_url = site_url
    if '//www.' not in full_site_url:
        full_site_url = full_site_url.replace('//', '//www.')

    from running_stats import StatsList
    stats = StatsList()

    # Use the generate_entries generator to get all of the entries from the
    # ODI Atom feed. This should correctly handle all of the pages within
    # the feed.
    for entry in client.generate_entries(self.log):
        # We have to handle the case where rel='about' might be missing;
        # if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        if not about.startswith(site_url) and not about.startswith(full_site_url):
            self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        if '/dataset/' not in entry['about']:
            self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        pkg = self._get_package_from_url(entry.get('about'))
        if not pkg:
            self.log.error(stats.add('Unable to find the package',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(self.log, entry['alternate'])
        badge_data['cert_title'] = entry.get('content', '')
        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            self.log.debug(stats.add('Certificate unchanged',
                                     badge_data['certificate_url']))
        else:
            operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = badge_json
            self.log.debug(stats.add('Certificate %s' % operation,
                                     '"%s" %s' % (badge_data['title'],
                                                  badge_data['certificate_url'])))
            model.Session.commit()

    self.log.info('Summary:\n' + stats.report())
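# --- Illustration (not part of the original script) ---
# The check above is a plain string comparison of the stored 'odi-certificate'
# extra against json.dumps() of the freshly fetched badge data, so an unchanged
# certificate produces no new revision. Only the keys the command actually
# references are shown here; the values are made up.
#
# badge_data = {
#     'title': 'Example dataset title',
#     'certificate_url': 'https://certificates.example.org/datasets/1/certificate',
#     'cert_title': 'Basic Level Certificate',
# }
# pkg.extras['odi-certificate'] = json.dumps(badge_data)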
def bulk_action(action=None, filepath=None, entity_or_alias_names=None,
                entities=True, aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(
                nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplementedError(action)
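# --- Hypothetical usage (not part of the original script) ---
# Invalidate every alias listed one-per-line in a text file; the file name is
# illustrative only. Entities are skipped by the 'invalidate' action anyway,
# so entities=False just avoids looking them up.
#
# bulk_action(action='invalidate', filepath='aliases_to_invalidate.txt',
#             entities=False, aliases=True)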
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
                              .filter_by(entity_id=res.id)\
                              .filter_by(task_type='qa')\
                              .filter_by(key='status')\
                              .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the revision
        # time, so some timezone nonsense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # The state of the resource was as it was archived on the date of the
        # QA update, but we only know when the latest archival was. So if it
        # was archived before the QA update then we know that was the archival;
        # otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the QA table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
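# --- Assumption (not part of the original script) ---
# The migrate and wms_revisions functions here call an add_stat() helper that
# is defined elsewhere in these scripts. A minimal sketch consistent with how
# it is called - add_stat(outcome, res, stats, optional extra info), returning
# a printable message - might look like:
def add_stat(outcome, res, stats, extra_info=None):
    # Identify the resource by id and URL, append any extra detail, and record
    # the outcome against it, returning StatsList's message for printing.
    item = '%s %s' % (res.id, res.url)
    if extra_info:
        item = '%s %s' % (item, extra_info)
    return stats.add(outcome, item)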
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
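# --- Assumption (not part of the original script) ---
# date_str_to_datetime_or_none() is not defined in this section. Given that the
# archiver TaskStatus error blob stores its dates as strings which may be empty
# or missing, a minimal sketch could be:
def date_str_to_datetime_or_none(date_str):
    # Parse a date string into a datetime, tolerating empty/None values.
    import dateutil.parser
    if date_str:
        return dateutil.parser.parse(date_str)
    return None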
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions need
    removing.

    # Typical revision:
    id | timestamp | author | message | state | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike(
                '%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
                bad_res_revs[0].revision.author not in (
                    'co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue

        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print ' Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            sql = ['''BEGIN;
ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                           % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision if
                # it is only connected to this one.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n"
                               % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n"
                       % (latest_good_res_rev.revision_id, res.id))
            sql.append('''
ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
COMMIT;''')
            print ' sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # chunk revisions in chunks to cope when there are so many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
                   Bar(), ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print ' Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update(
            {'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print ' ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
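# --- Illustration (not part of the original script) ---
# A rough regex translation of the ilike pattern used above (SQL '%' wildcards
# become '.*', ilike becomes re.IGNORECASE), run against the example URLs from
# the wms_revisions docstring, shows which revisions get flagged as bad: the
# WMS/WFS GetCapabilities URLs match, the original CHttpHandler download does not.
def _demo_bad_url_pattern():
    import re
    bad_url_re = re.compile(r'\?service=W.*S&request=GetCapabilities&version=',
                            re.IGNORECASE)
    for example_url in (
            'http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0',
            'http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3',
            'http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0'):
        print example_url, 'bad' if bad_url_re.search(example_url) else 'ok'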