Example #1
def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        theme = bool(test)
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
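
All of these snippets report progress through StatsList (and occasionally StatsCount) from ckanext-dgu's running_stats module. The following is a rough sketch of the interface the examples assume, inferred purely from the usage above and below rather than taken from the real implementation: stats.add(category, value) records a value under a category and returns a printable one-line message (hence 'print stats.add(...)' and 'log.info(stats.add(...))'), and stats.report() summarises everything collected.

class StatsCount(object):
    '''Counts occurrences of each category (used by canada_extras below).'''
    def __init__(self):
        self._counts = {}

    def increment(self, category):
        self._counts[category] = self._counts.get(category, 0) + 1

    def report(self):
        return '\n'.join('%s: %i' % (category, count)
                         for category, count in sorted(self._counts.items()))

class StatsList(object):
    '''Collects the values seen for each category.'''
    report_value_limit = 150  # assumed default; examples override it to 1000

    def __init__(self):
        self._values = {}  # category -> [value, ...]

    def add(self, category, value):
        # record the value and hand back a printable one-line message
        self._values.setdefault(category, []).append(value)
        return '%s: %s' % (category, value)

    def report(self):
        lines = []
        for category, values in sorted(self._values.items()):
            values_str = ', '.join(values)[:self.report_value_limit]
            lines.append('%s (%i): %s' % (category, len(values), values_str))
        return '\n'.join(lines)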
Example #2
    def set_initial_value(self):
        log = global_log
        stats = StatsList()

        from ckan import model
        import ckan.plugins as p
        from ckan.logic import ActionError
        from ckanext.dgu.lib.helpers import upsert_extra

        site_user = p.toolkit.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})
        c = {'model': model, 'user': site_user['name']}
        packages = p.toolkit.get_action('package_list')(c, data_dict={})

        log.info('Processing %d packages', len(packages))

        for pkg_name in packages:
            pkg = model.Package.by_name(pkg_name)

            last_mod = self.determine_last_major_modification(pkg).isoformat()
            log.info('%s: %s %s', pkg_name,
                     pkg.extras.get('last_major_modification'), last_mod)
            if pkg.extras.get('last_major_modification') != last_mod:
                log.info(stats.add('Adding modification date', pkg.name))
                model.repo.new_revision()
                pkg.extras['last_major_modification'] = last_mod
                model.repo.commit_and_remove()
            else:
                log.info(stats.add('No change needed', pkg.name))
        print stats.report()
Example #4
    def merge_duplicates(self):
        merge_stats = StatsList()

        onshub_packages_search_options = {
            'external_reference': 'ONSHUB',
            'state': 'active'
        }
        res = self.loader._package_search(onshub_packages_search_options)
        log.info('ONSHUB records: %i', res['count'])
        pkgs_already_merged = set()
        for pkg_ref in res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in pkgs_already_merged:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue
            if not self.loader._pkg_matches_search_options(
                    pkg, onshub_packages_search_options):
                log.error(
                    merge_stats.add('Did not match ONSHUB search after all',
                                    pkg['name']))
                continue
            # look for duplicates
            dupe_search_options = {
                'title': pkg['title'],
                'groups': pkg['groups'][0] if pkg['groups'] else '',
                'external_reference': 'ONSHUB',
                'state': 'active'
            }
            res = self.loader._package_search(dupe_search_options)
            if not res['count']:
                log.error(merge_stats.add('Could not find itself',
                                          pkg['name']))
                continue
            dupe_pkgs = []
            for dupe_pkg_ref in res['results']:
                dupe_pkg = self.loader._get_package(dupe_pkg_ref)
                if dupe_pkg['name'] == pkg['name']:
                    continue
                if not self.loader._pkg_matches_search_options(
                        dupe_pkg, dupe_search_options):
                    log.warn('Did not match duplicate search after all %s %s',
                             pkg['name'], dupe_pkg['name'])
                    continue
                dupe_pkgs.append(dupe_pkg)
            if dupe_pkgs:
                log.info('Found duplicates for %s: %r', pkg['name'],
                         [pkg_['name'] for pkg_ in dupe_pkgs])
                # Fix duplicates
                merge_stats.add(
                    '%i duplicates found and merged' % len(dupe_pkgs),
                    pkg['name'])
                for dupe_pkg in dupe_pkgs:
                    pkgs_already_merged.add(dupe_pkg['name'])
                self.do_merge(pkg, dupe_pkgs)
            else:
                log.info(merge_stats.add('No duplicates', pkg['name']))

        print merge_stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #6
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package2(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0]['name'] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]['name']:
            print stats.add('Theme unchanged %s' % themes[0]['name'],
                            pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0]['name'],
                        pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
Example #7
    def add_missing_publisher(self):
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='external_reference:ONSHUB !groups:["" TO *]',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(
                    stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.groups()[0]
            publisher_name = OnsImporter._source_to_publisher_(
                source_agency, self.client)
            if not publisher_name:
                log.error(
                    stats.add('Could not map source agency %s' % source_agency,
                              pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #8
def command(dry_run=False):
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in the logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map
    for format_ in res_type_map.keys():
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))

    for res in res_query:
        canonised_fmt = canonise(res.format or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(res.format)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (res.format or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, res.format)
        else:
            stats.add('No change', res.format)

    if not dry_run:
        model.repo.commit_and_remove()

    log.info('Stats report: %r', stats.report())
    print stats.report()

    log.info('Warnings (%i): %r', len(warnings), warnings)
Example #10
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
            SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
Example #12
    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.
        '''
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='!external_reference:ONSHUB \"Source agency\"',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(
                    stats.add(
                        'Could not find "Source agency: " line after all',
                        pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #13
def undelete(options):
    resources = _get_resources('deleted', options)
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'current_revision_fixer2'
    need_to_commit = False
    for res in resources:
        # When viewing an old revision of the dataset, there is one where the
        # resources are not deleted but they don't show up. This is seen where
        # resource_revision has an expired_timestamp with no corresponding
        # revision_timestamp - i.e. a gap between them (and that is not
        # 9999-12-31).
        # e.g. select revision_timestamp,expired_timestamp,current
        #      from resource_revision
        #      where id='373bb814-7a49-4f53-8a0e-762002b2529c'
        #      order by revision_timestamp;
        #      revision_timestamp     |     expired_timestamp      | current
        # ----------------------------+----------------------------+---------
        #  2013-06-19 00:50:28.880058 | 2014-01-18 01:03:47.500041 | f
        #  2014-01-18 01:03:47.500041 | 2014-01-18 01:03:48.296204 | f
        #  2014-01-18 01:03:50.612196 | 9999-12-31 00:00:00        | t
        # Clearly there is a gap from the 2nd to the 3rd row, indicating the
        # problem.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id).order_by('revision_timestamp').all()
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        if res_revs[-2].expired_timestamp == res_revs[-1].revision_timestamp:
            add_stat('Ok', res, stats)
            continue
        print add_stat('Timestamp gap', res, stats)
        if options.write:
            res.state = 'active'
            need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
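
The revision-fixing commands (undelete above, plus refix and no_current_resources below) also call an add_stat helper that is not shown in these snippets. Judging from the call sites, it is presumably a thin wrapper along these lines (hypothetical, for illustration only):

def add_stat(outcome, res, stats):
    # assumed helper: record the resource against the outcome category and
    # pass back the printable message from StatsList.add()
    return stats.add(outcome, res.id)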
Example #14
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government': 'level_of_government',
    }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis',
        'Crown Copyright': 'canada-crown',
    }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
Example #16
    def fetch(cls, site_url_filter, since_datetime):
        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not site_url_filter.search(about):
                log.debug(stats.add('Ignore - "about" field does not reference this site',
                                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                    '%s "%s" %s' % (about, entry['about'], entry['id'])))
                continue

            pkg = cls._get_package_from_url(entry.get('about'))
            if not pkg:
                log.error(stats.add('Unable to find the package',
                                    '%s "%s" %s %r' % (about, entry['about'], entry['id'], entry.get('about'))))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                log.info(stats.add('Error fetching badge data - skipped',
                                   '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            badge_data['cert_title'] = entry.get('content', '')  # e.g. 'Basic Level Certificate'

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                log.debug(stats.add('Certificate unchanged',
                                    badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras \
                    else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = json.dumps(badge_data)
                log.debug(stats.add('Certificate %s' % operation,
                                    '"%s" %s' % (badge_data['title'],
                                                 badge_data['certificate_url'])))
                model.Session.commit()

        log.info('Summary:\n' + stats.report())
Example #17
    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". These cannot be added
        to by the ons_loader in the future because of this title change, so
        remove the prefix.
        e.g. scientific_procedures_on_living_animals_great_britain
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '

        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB \"%s\"' % prefix,
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if not pkg['title'].startswith(prefix):
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = pkg['title'][len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Remove prefix', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #18
def learn(options):
    '''Analyse datasets that are already categorised to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add(
            'margin %.1f' % winning_margin,
            '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
Example #23
def refix(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        # The old uncommit command would set the wrong resource_revision to
        # be current.
        # e.g. select revision_timestamp,expired_timestamp,current
        #      from resource_revision
        #      where id='b2972b35-b6ae-4096-b8cc-40dab3927a71'
        #      order by revision_timestamp;
        #     revision_timestamp     |     expired_timestamp      | current
        # ---------------------------+----------------------------+---------
        # 2013-04-13 01:47:30.18897  | 2013-06-18 19:01:45.910899 | f
        # 2013-06-18 19:01:45.910899 | 2014-01-18 08:55:41.443349 | t
        # 2014-01-18 08:55:41.443349 | 2014-01-18 08:55:41.566383 | f
        # Clearly only the latest should be current.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id).order_by('revision_timestamp').all()
        fix_needed = False
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        for res_rev in res_revs[:-1]:
            if res_rev.current:
                print add_stat('Early revision is current', res, stats)
                fix_needed = True
                if options.write:
                    res_rev.current = False
                    need_to_commit = True
        if not res_revs[-1].current:
            print add_stat('Last revision is not current', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].current = True
                need_to_commit = True
        if res_revs[-1].expired_timestamp != END_OF_TIME:
            print add_stat('Last revision is not 9999', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].expired_timestamp = END_OF_TIME
                need_to_commit = True
        if not fix_needed:
            add_stat('Ok', res, stats)
            continue
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
Example #24
def no_current_packages(options):
    pkgs = _get_packages('active', options)
    stats = StatsList()
    need_to_commit = False
    for pkg in pkgs:
        latest_pkg_rev = \
            model.Session.query(model.PackageRevision) \
            .filter_by(id=pkg.id) \
            .order_by(model.PackageRevision.revision_timestamp.desc()) \
            .first()
        # sometimes a revision_timestamp is null for some reason
        if latest_pkg_rev.revision_timestamp is None:
            # in which case, join them to the revision table and order by those
            # timestamps instead
            latest_pkg_rev = \
                model.Session.query(model.PackageRevision) \
                .filter_by(id=pkg.id) \
                .join(model.Revision) \
                .order_by(model.Revision.timestamp.desc()) \
                .first()

        if not latest_pkg_rev.current:
            print stats.add('No current revision', pkg.name)
            if options.write:
                latest_pkg_rev.current = True
                need_to_commit = True
        else:
            stats.add('Ok', pkg.name)
        if latest_pkg_rev.revision_id != pkg.revision_id:
            print stats.add('Revision ID of package too old', pkg.name)
            if options.write:
                pkg.revision_id = latest_pkg_rev.revision_id
                need_to_commit = True

    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    print
Example #25
def no_current_resources(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        latest_res_rev = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp.desc()) \
            .first()
        if not latest_res_rev.current:
            print add_stat('No current revision', res, stats)
            if options.write:
                latest_res_rev.current = True
                need_to_commit = True
        else:
            add_stat('Ok', res, stats)
        if latest_res_rev.revision_id != res.revision_id:
            print add_stat('Revision ID of resource too old', res, stats)
            if options.write:
                res.revision_id = latest_res_rev.revision_id
                need_to_commit = True

    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
Example #26
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring', publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
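                    # NB: as written, if lookup() raises NoMatch then
                    # new_alias below is referenced before assignment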
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over', publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity '
                                '- created the entity', publisher_name)
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
Example #27
    def command(self):
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        full_site_url = site_url
        if not '//www.' in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.')

        from running_stats import StatsList
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be missing, if so
            # we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(
                    stats.add(
                        'Ignore - no rel="about" specifying the dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not about.startswith(site_url) and not about.startswith(
                    full_site_url):
                self.log.debug(
                    stats.add(
                        'Ignore - "about" field does not reference this site',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                self.log.debug(
                    stats.add(
                        'Ignore - is "about" DGU but not a dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            pkg = self._get_package_from_url(entry.get('about'))
            if not pkg:
                self.log.error(
                    stats.add(
                        'Unable to find the package',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(
                    stats.add('Certificate unchanged',
                              badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras \
                    else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = json.dumps(badge_data)
                self.log.debug(
                    stats.add(
                        'Certificate %s' % operation, '"%s" %s' %
                        (badge_data['title'], badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())
Example #28
            warning += 'There is an editor(s) but no email addresses for them.'
        else:
            warning += 'There are no editors.'
    else:
        warning = None
    emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user))
                        for user in users_with_email])
    names_without_email = ', '.join([user.fullname or user.name
                                     for user in users_without_email])
    if warning:
        print pub_stats.add('%s without emails: %s' %
                            ('PCT' if is_pct else 'Trust', warning),
                            pub.title)
    else:
        print pub_stats.add('%s with emails' % ('PCT' if is_pct else 'Trust'),
                            pub.title)
    row = ('PCT' if is_pct else '',
           pub.title, pub.name, emails, warning)
    if is_pct:
        pct_rows.append(row)
    else:
        non_pct_rows.append(row)

print pub_stats.report()

filename = 'nhs_emails.csv'
with open(filename, 'wb') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['PCT?', 'Publisher title', 'Publisher name', 'Emails', 'Warnings'])
    for row in pct_rows + non_pct_rows:
        csv_writer.writerow(row)
print filename
def command(dry_run=False):
    from ckan import model

    # Register a translator in this thread so that
    # the _() functions in the logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    global_log.info('Tidying package fields')

    stats = StatsList()

    if not dry_run:
        rev = model.repo.new_revision()
        rev.message = 'Package fields migration'

    for pkg in model.Session.query(model.Package)\
            .filter_by(state='active')\
            .order_by(model.Package.name):
        # field map
        for existing_fields, destination_field in field_map.items():
            value = pkg.extras.get(destination_field)
            if value:
                continue
            for existing_field in existing_fields:
                if hasattr(pkg, existing_field):
                    value = getattr(pkg, existing_field)
                else:
                    value = pkg.extras.get(existing_field)
                if value:
                    value = value.strip()
                    if value:
                        # take the first hit
                        break
            if not dry_run:
                pkg.extras[destination_field] = value or ''
                # delete existing field values
                for existing_field in existing_fields:
                    if hasattr(pkg, existing_field):
                        setattr(pkg, existing_field, '')
                    elif existing_field in pkg.extras:
                        del pkg.extras[existing_field]
            if value:
                stats.add('Merged to field "%s"' % destination_field, pkg.name)
            else:
                stats.add('Not merged to field "%s"' % destination_field, pkg.name)

        # move url to additional resource
        if pkg.url:
            if not dry_run:
                if not pkg.resource_groups:
                    res_group = model.ResourceGroup(label="default")
                    pkg.resource_groups.append(res_group)
                res_group = pkg.resource_groups[0]
                res = model.Resource(format='HTML', resource_type='documentation',
                                     url=pkg.url, description='Web page about the data')
                res_group.resources.append(res)
                model.Session.add(res)
                #pkg.url = ''
            stats.add('URL moved to additional resource', pkg.name)
        else:
            stats.add('No URL to move to additional resource', pkg.name)

        # delete fields
        for field in delete_fields:
            if field in pkg.extras:
                if not dry_run:
                    del pkg.extras[field]
                stats.add('Deleted field "%s"' % field, pkg.name)
            else:
                stats.add('No field to delete "%s"' % field, pkg.name)

    if not dry_run:
        model.repo.commit_and_remove()

    global_log.info(stats.report())
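command() references field_map and delete_fields without defining them; they are module-level in the original and not part of this listing. A minimal sketch of the assumed shapes follows - the field names and values here are illustrative only. Each field_map key is a tuple of legacy fields, searched in order; the first non-blank value wins and is copied to the destination extra.

# Hypothetical module-level configuration for command() above
field_map = {
    ('taxonomy_url', 'external_reference'): 'taxonomy_url',
    ('department', 'agency'): 'provider',
}

# extras removed outright in the delete pass
delete_fields = ['import_source', 'update_frequency_other']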
    @classmethod
    def fetch(cls, site_url_filter, since_datetime):
        import json
        import logging

        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing; if so, we ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                log.debug(
                    stats.add(
                        'Ignore - no rel="about" specifying the dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not site_url_filter.search(about):
                log.debug(
                    stats.add(
                        'Ignore - "about" field does not reference this site',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if '/dataset/' not in entry['about']:
                log.debug(
                    stats.add(
                        'Ignore - is "about" DGU but not a dataset',
                        '%s "%s" %s' % (about, entry['about'], entry['id'])))
                continue

            pkg = cls._get_package_from_url(entry.get('about'))
            if not pkg:
                log.error(
                    stats.add(
                        'Unable to find the package',
                        '%s "%s" %s %r' % (about, entry['about'], entry['id'],
                                           entry.get('about'))))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                log.info(
                    stats.add(
                        'Error fetching badge data - skipped',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            badge_data['cert_title'] = entry.get(
                'content', '')  # e.g. 'Basic Level Certificate'

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                log.debug(
                    stats.add('Certificate unchanged',
                              badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras \
                    else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = badge_json
                log.debug(
                    stats.add(
                        'Certificate %s' % operation, '"%s" %s' %
                        (badge_data['title'], badge_data['certificate_url'])))
                model.Session.commit()

        log.info('Summary:\n' + stats.report())
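The enclosing class is not part of this listing, so the call below is only a sketch: CertificateFetcher is a stand-in name for whichever class carries this classmethod in ckanext-certificates, and site_url_filter can be any object with a .search() method, such as a compiled regex.

import re
import datetime

# Hypothetical invocation - CertificateFetcher is a stand-in name
site_url_filter = re.compile(r'data\.gov\.uk')
since = datetime.datetime.now() - datetime.timedelta(days=7)
CertificateFetcher.fetch(site_url_filter, since)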
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
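migrate() leans on several module-level helpers that this listing omits. The sketch below shows plausible implementations consistent with how they are called above; the bodies and sentinel values are assumptions, not the originals (date_str_to_datetime is CKAN's own parser from ckan.lib.helpers).

import datetime

# Sentinel timestamps used when the real dates are unknown (assumed values)
START_OF_TIME = datetime.datetime(1980, 1, 1)
END_OF_TIME = datetime.datetime(9999, 12, 31)

def date_str_to_datetime_or_none(date_str):
    # Parse an ISO-ish date string, tolerating None or blank input
    from ckan.lib.helpers import date_str_to_datetime
    if date_str:
        return date_str_to_datetime(date_str)
    return None

def add_stat(outcome, res, stats, extra_info=None):
    # Record an outcome against a resource and return a printable line
    detail = res.id
    if extra_info:
        detail = '%s %s' % (detail, extra_info)
    return stats.add(outcome, detail)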
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(
            res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike(
                '%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            sql = [
                '''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            '''
            ]
            for res_rev in res_revs:
                sql.append(
                    "DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                    % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision if
                # it is connected to this one alone.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" %
                               res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # process the revisions in chunks of 1000 to cope when there are very many
        widgets = [
            'Creating SQL: ',
            Counter(),
            'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
            Bar(), ' ',
            ETA()
        ]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0)
                                or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update(
            {'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
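The chunking above keeps each SQL batch to 1000 resource revisions, so a resource with tens of thousands of bad revisions never builds one enormous statement; chunks() itself is a plain slicing generator, as this standalone run shows:

>>> def chunks(l, n):
...     '''Yield successive n-sized chunks from l.'''
...     for i in xrange(0, len(l), n):
...         yield l[i:i + n]
>>> list(chunks(range(7), 3))
[[0, 1, 2], [3, 4, 5], [6]]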
Example no. 34
def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id==options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org==org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:
        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore'))
        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(archival.url_redirected_to):
            stats_add('Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
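fix_redirects relies on Python's for/else when scanning the page links: the else clause fires only if the loop finished without a break, i.e. no webarchive link was found among the hrefs. A standalone illustration of the idiom:

for url in ['http://example.com/a', 'http://example.com/b']:
    if url.startswith('http://webarchive.nationalarchives.gov.uk/'):
        break  # found a forward link - the else clause is skipped
else:
    print '410 Gone but no forward link'  # reached because no break happened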
Example no. 35
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias,
                                          nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID',
                                publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias,
                                            nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add(
                        'Matched an alias for an entity which already has an ID - ignoring',
                        publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'],
                                             data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        # the failed lookup registers entity.name as a new
                        # unmatched alias (assumed nomenklatura behaviour),
                        # so fetch it and point it at the old entity
                        new_alias = nk_dataset.lookup_detailed(entity.name)
                        nk_dataset.match(alias_id=new_alias.id,
                                         entity_id=entity.id)
                        print stats.add(
                            'Matched an alias for an entity - swapped them over',
                            publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add(
                    'Matched an alias without a matching entity - created the entity',
                    publisher_name)
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
Example no. 36
    def command(self):
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        full_site_url = site_url
        if '//www.' not in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.')

        from running_stats import StatsList
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be
            # missing; if so, we ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not about.startswith(site_url) and not about.startswith(full_site_url):
                self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if '/dataset/' not in entry['about']:
                self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            pkg = self._get_package_from_url(entry.get('about'))
            if not pkg:
                self.log.error(stats.add('Unable to find the package',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(stats.add('Certificate unchanged',
                                         badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = badge_json
                self.log.debug(stats.add('Certificate %s' % operation,
                               '"%s" %s' % (badge_data['title'],
                                            badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())
Example no. 37
def bulk_action(action=None,
                filepath=None,
                entity_or_alias_names=None,
                entities=True,
                aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(
                nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplementedError()
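Judging by the signature, bulk_action() takes names inline and/or from a file of one name per line, and 'invalidate' is the only implemented action. A hypothetical call - the names and filename here are illustrative only:

bulk_action(action='invalidate',
            entity_or_alias_names=['Dept of Example', 'Example Agency'])
bulk_action(action='invalidate',
            filepath='aliases_to_invalidate.txt',
            entities=False)  # only consider aliases from the file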
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='qa')\
                                    .filter_by(key='status')\
                                    .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the
        # revision time, so some timezone nonsense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update, but we only know when the latest archival was. So
        # if it was archived before the QA update then we know that was the
        # archival; otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp \
            if res_rev else None

        # Compare with any existing data in the Archival table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
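The date juggling above boils down to one rule: trust the archival timestamp only if it predates the QA run, otherwise fall back to the QA date. As a standalone illustration (not the original code):

def choose_base_date(qa_date, archival_date):
    # Pick the timestamp the QA fields should carry (illustrative only)
    if archival_date and qa_date >= archival_date:
        return archival_date  # resource state known as of this archival
    return qa_date  # archival ran later (or never) - best guess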
Example no. 39
def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []
    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)
    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplementedError()
Example no. 40
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to', 'updated',
                               'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME, fields['first_failure']
                or START_OF_TIME, fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res,
                         stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id == options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org == org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:

        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg,
                             ('%s/%s %s' % (pkg.name, res.id, res.url)).encode(
                                 'latin7', 'ignore'))

        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(
                archival.url_redirected_to):
            stats_add(
                'Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
Example no. 42
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
            resource_id=options.resource, dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(model.ResourceRevision.url.ilike('%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats, 'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' ' # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats, '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'
        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            sql = ['''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            ''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n" % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision if
                # it is connected to this one alone.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()
        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i+n]
        # process the revisions in chunks of 1000 to cope when there are very many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs))/1000.0), Bar(),
                   ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets, maxval=int(float(len(bad_res_revs))/1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i+1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i+1].revision_timestamp
                res_rev.expired_id = res_revs[i+1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update({'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'