def set_initial_value(self):
        '''Record each package's last major modification date as an extra.

        Every package named by the package_list action is loaded, its date
        is worked out by self.determine_last_major_modification() and the
        ISO formatted value is stored in the package's
        'last_major_modification' extra when it differs from the value
        already held. A summary of the outcomes is printed at the end.
        '''
        log = global_log
        stats = StatsList()

        from ckan import model
        import ckan.plugins as p
        from ckan.logic import ActionError
        from ckanext.dgu.lib.helpers import upsert_extra

        extra_key = 'last_major_modification'
        get_action = p.toolkit.get_action

        # package_list is called with the site user's name in its context
        site_user = get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {})
        context = {'model': model, 'user': site_user['name']}
        package_names = get_action('package_list')(context, data_dict={})

        log.info('Processing %d packages', len(package_names))

        for pkg_name in package_names:
            pkg = model.Package.by_name(pkg_name)

            new_value = self.determine_last_major_modification(pkg).isoformat()
            current_value = pkg.extras.get(extra_key)
            log.info('%s: %s %s', pkg_name, current_value, new_value)
            if current_value == new_value:
                log.info(stats.add('No change needed', pkg.name))
                continue

            log.info(stats.add('Adding modification date', pkg.name))
            # Each changed package is committed in a revision of its own
            model.repo.new_revision()
            pkg.extras[extra_key] = new_value
            model.repo.commit_and_remove()
        print(stats.report())
    def set_initial_value(self):
        '''Store the last major modification date on every package.

        For each package name returned by the package_list action, the date
        from self.determine_last_major_modification() is compared (as an ISO
        formatted string) with the package's 'last_major_modification'
        extra, and the extra is rewritten when the two differ. The outcome
        for each package is logged and a summary is printed at the end.
        '''
        log = global_log
        stats = StatsList()

        from ckan import model
        import ckan.plugins as p
        from ckan.logic import ActionError
        from ckanext.dgu.lib.helpers import upsert_extra

        # The site user's name goes into the context used for package_list
        admin_context = {'model': model, 'ignore_auth': True}
        site_user = p.toolkit.get_action('get_site_user')(admin_context, {})
        list_context = {'model': model, 'user': site_user['name']}
        names = p.toolkit.get_action('package_list')(list_context,
                                                     data_dict={})

        log.info('Processing %d packages', len(names))

        key = 'last_major_modification'
        for name in names:
            pkg = model.Package.by_name(name)

            last_mod = self.determine_last_major_modification(pkg).isoformat()
            stored = pkg.extras.get(key)
            log.info('%s: %s %s', name, stored, last_mod)

            needs_update = stored != last_mod
            if needs_update:
                outcome = 'Adding modification date'
            else:
                outcome = 'No change needed'
            log.info(stats.add(outcome, pkg.name))

            if needs_update:
                # One revision and commit per changed package
                model.repo.new_revision()
                pkg.extras[key] = last_mod
                model.repo.commit_and_remove()
        print(stats.report())
def command(dry_run=False):
    '''Tidy the format of every resource in the database.

    Each resource's format is canonised and looked up in res_type_map; when
    it is not in the map it is passed through tidy() instead. The result is
    then given to match() and replaced by the match if there is one.
    Resources whose format changes are updated (unless dry_run) and a
    summary is logged and printed.

    :param dry_run: if True, report what would change but write nothing.
    '''
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map. Iterate over a snapshot of the keys
    # because the map is added to inside the loop.
    for format_ in list(res_type_map):
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))

    for res in res_query:
        # Remember the format as stored, so that the stats record the
        # original value whether or not this is a dry run (previously a
        # real run recorded the already-overwritten format).
        original_fmt = res.format
        canonised_fmt = canonise(original_fmt or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(original_fmt)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (original_fmt or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, original_fmt)
        else:
            stats.add('No change', original_fmt)

    if not dry_run:
        model.repo.commit_and_remove()

    # Build the report once and use it for both the log and stdout
    report = stats.report()
    log.info('Stats report: %r', report)
    print(report)

    log.info('Warnings (%i): %r', len(warnings), warnings)
Exemplo n.º 4
0
def command(dry_run=False):
    '''Tidy the format of every resource in the database.

    Each resource's format is canonised and looked up in res_type_map; when
    it is not in the map it is passed through tidy() instead. The result is
    then given to match() and replaced by the match if there is one.
    Resources whose format changes are updated (unless dry_run) and a
    summary is logged and printed.

    :param dry_run: if True, report what would change but write nothing.
    '''
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map. Iterate over a snapshot of the keys
    # because the map is added to inside the loop.
    for format_ in list(res_type_map):
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))

    for res in res_query:
        # Remember the format as stored, so that the stats record the
        # original value whether or not this is a dry run (previously a
        # real run recorded the already-overwritten format).
        original_fmt = res.format
        canonised_fmt = canonise(original_fmt or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(original_fmt)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (original_fmt or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, original_fmt)
        else:
            stats.add('No change', original_fmt)

    if not dry_run:
        model.repo.commit_and_remove()

    # Build the report once and use it for both the log and stdout
    report = stats.report()
    log.info('Stats report: %r', report)
    print(report)

    log.info('Warnings (%i): %r', len(warnings), warnings)
Exemplo n.º 5
0
    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.

        Candidates are found by searching for datasets that lack the extra
        but mention "Source agency". The search asks for at most 100 rows
        and is not repeated, so a single run handles at most that many
        datasets. Nothing is written when self.dry_run is set.
        '''
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='!external_reference:ONSHUB \"Source agency\"',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            # Guard against missing (None) notes, which re cannot search
            if not source_agency_re.search(pkg['notes'] or ''):
                log.error(
                    stats.add(
                        'Could not find "Source agency: " line after all',
                        pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    # These messages used to talk about a "publisher",
                    # which is not what this method writes.
                    log.error('Error (%s) adding extra over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing extra over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
0
    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". These cannot be added
        to by the ons_loader in the future because of this title change so
        remove the prefix.
        e.g. scientific_procedures_on_living_animals_great_britain

        The search asks for at most 100 rows and is not repeated, so a
        single run handles at most that many datasets. Nothing is written
        when self.dry_run is set.
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '

        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB \"%s\"' % prefix,
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            # Guard against a missing (None) title
            title = pkg['title'] or ''
            if not title.startswith(prefix):
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = title[len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    # These messages used to talk about a "publisher",
                    # which is not what this method writes.
                    log.error('Error (%s) correcting title over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing title over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Remove prefix', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 7
0
def learn(options):
    '''Analyse datasets that are already categorised to find out which
    words associate with which theme.

    A word frequency distribution is obtained from get_freq_dist() for each
    theme in turn (options.theme is overwritten with the theme being
    processed). Words recorded for only one theme are reported as unique to
    it; for every other word the margin between the two themes with the
    highest relative frequency is reported.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    # word: [(fraction of the theme's top frequency, theme, frequency), ...]
    fd_by_fraction = defaultdict(list)
    for position, theme in enumerate(Themes.instance().data, 1):
        # Stops on reaching the 30th theme, so at most 29 are analysed
        if position == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print('%s: %r' % (theme, freq_dist))
        freq_dists[theme] = freq_dist
        if len(freq_dist) == 0:
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) != 1:
            # Highest fraction first; the margin is between the top two
            ranked = sorted(counts, key=lambda entry: -entry[0])
            winner, runner_up = ranked[0], ranked[1]
            winning_margin = winner[0] - runner_up[0]
            print(stats.add(
                'margin %.1f' % winning_margin,
                '%s %s-%s' % (word, winner[1], runner_up[1])))
            continue
        only_theme = counts[0][1]
        print(stats.add('unique', '%s %s' % (word, only_theme)))
        unique_words[only_theme].append('%s (%s)' % (word, counts[0][2]))
    print('Unique words:')
    for theme, words in unique_words.items():
        print('%s: %s' % (theme, ' '.join(words)))
    print('Summary:')
    print(stats.report())
    def fetch(cls, site_url_filter, since_datetime):
        '''Read certificate feed entries and store badge data on packages.

        Entries come from ckanext.certificates.client.generate_entries()
        for the period since since_datetime. An entry is skipped when it
        has no "about" value, when site_url_filter.search() does not match
        that value, when the value does not contain '/dataset/', when no
        package can be found for it, or when no badge data can be fetched.
        Otherwise the badge data is stored as JSON in the package's
        'odi-certificate' extra if it differs from the value held there.
        A summary of outcomes is logged at the end.
        '''
        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        extra_key = 'odi-certificate'
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                log.debug(stats.add(
                    'Ignore - no rel="about" specifying the dataset',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not site_url_filter.search(about):
                log.debug(stats.add(
                    'Ignore - "about" field does not reference this site',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # From here on "about" is non-empty, i.e. it is entry['about']
            if '/dataset/' not in about:
                log.debug(stats.add(
                    'Ignore - is "about" DGU but not a dataset',
                    '%s "%s" %s' % (about, about, entry['id'])))
                continue

            pkg = cls._get_package_from_url(about)
            if not pkg:
                log.error(stats.add(
                    'Unable to find the package',
                    '%s "%s" %s %r' % (about, about, entry['id'], about)))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                log.info(stats.add(
                    'Error fetching badge data - skipped',
                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            # e.g. 'Basic Level Certificate'
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get(extra_key) == badge_json:
                log.debug(stats.add('Certificate unchanged',
                                    badge_data['certificate_url']))
                continue

            if extra_key in pkg.extras:
                operation = 'updated'
            else:
                operation = 'added'
            model.repo.new_revision()
            pkg.extras[extra_key] = badge_json
            log.debug(stats.add('Certificate %s' % operation,
                                '"%s" %s' % (badge_data['title'],
                                             badge_data['certificate_url'])))
            model.Session.commit()

        log.info('Summary:\n' + stats.report())
Exemplo n.º 9
0
def recategorize(options):
    '''Work out the primary theme of datasets and report what would change.

    The datasets are either the single one named by options.dataset or
    those returned by get_packages() for options.publisher,
    options.uncategorized and options.limit. When options.theme is given
    (a comma separated list) only datasets whose first suggested theme is
    in that list are of interest. The suggested themes are passed to
    write_themes() only when options.write is set.
    '''
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]

    # process the list of themes we are interested in setting on packages
    all_themes = Themes.instance()
    if not options.theme:
        theme_filter = all_themes.data
    else:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in all_themes.data, '"%s" not in %r' % (
                theme, all_themes.data.keys())

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print('Dataset: %s' % pkg.name)
        suggested = categorize_package2(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not suggested:
            print(stats.add('Cannot decide theme', pkg_identity))
            continue
        # Only the first suggestion decides the outcome
        top_theme = suggested[0]['name']
        if top_theme not in theme_filter:
            print(stats.add('Not interested in theme', pkg_identity))
        elif existing_theme == top_theme:
            print(stats.add('Theme unchanged %s' % top_theme, pkg_identity))
        else:
            print(stats.add('Recategorized to %s' % top_theme, pkg_identity))
            if options.write:
                themes_to_write[pkg.name] = suggested

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(themes_to_write)
0
def learn(options):
    '''Analyse datasets that are already categorised to find out which
    words associate with which theme.

    For each theme, get_freq_dist() supplies word frequencies (with
    options.theme set to that theme) and every word is recorded with its
    frequency relative to the theme's most frequent word. Words recorded
    against a single theme are listed as unique; otherwise the gap between
    the two best-scoring themes is reported as the winning margin.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    for count, theme in enumerate(Themes.instance().data, 1):
        if count == 30:
            # The 30th and any later themes are not analysed
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print('%s: %r' % (theme, freq_dist))
        freq_dists[theme] = freq_dist
        if len(freq_dist) == 0:
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            record = (freq_fraction, theme, freq)
            fd_by_fraction[word].append(record)

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            single = counts[0]
            print(stats.add('unique', '%s %s' % (word, single[1])))
            unique_words[single[1]].append('%s (%s)' % (word, single[2]))
        else:
            # Order by descending fraction and compare the first two
            by_fraction = sorted(counts, key=lambda item: -item[0])
            first = by_fraction[0]
            second = by_fraction[1]
            winning_margin = first[0] - second[0]
            category = 'margin %.1f' % winning_margin
            value = '%s %s-%s' % (word, first[1], second[1])
            print(stats.add(category, value))
    print('Unique words:')
    for theme, words in unique_words.items():
        print('%s: %s' % (theme, ' '.join(words)))
    print('Summary:')
    print(stats.report())
Exemplo n.º 11
0
def recategorize(options):
    '''Suggest a theme for datasets and report which ones would change.

    Either the dataset named by options.dataset is examined, or the ones
    that get_packages() returns for options.publisher,
    options.uncategorized and options.limit. options.theme, when given, is
    a comma separated list restricting which suggested themes are of
    interest. write_themes() is called with the suggestions only when
    options.write is set.
    '''
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
            SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]

    # process the list of themes we are interested in setting on packages
    theme_registry = Themes.instance()
    if not options.theme:
        theme_filter = theme_registry.data
    else:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in theme_registry.data, \
                '"%s" not in %r' % (theme, theme_registry.data.keys())

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print('Dataset: %s' % pkg.name)
        suggestions = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not suggestions:
            print(stats.add('Cannot decide theme', pkg_identity))
            continue
        # The first suggestion is the one compared and reported
        best = suggestions[0]
        if best not in theme_filter:
            print(stats.add('Not interested in theme', pkg_identity))
        elif existing_theme == best:
            print(stats.add('Theme unchanged %s' % best, pkg_identity))
        else:
            print(stats.add('Recategorized to %s' % best, pkg_identity))
            if options.write:
                themes_to_write[pkg.name] = suggestions

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(themes_to_write)
Exemplo n.º 12
0
    def merge_duplicates(self):
        '''Find ONSHUB datasets that duplicate one another and merge them.

        Every active dataset with external_reference=ONSHUB is searched
        for again by its title and first group; any other datasets that
        come back (and really do match those search options) are treated
        as its duplicates and handed to self.do_merge() together with it.
        Datasets already passed to do_merge() as duplicates are skipped
        when they come up later. A summary is printed at the end.
        '''
        merge_stats = StatsList()

        onshub_search = {
            'external_reference': 'ONSHUB',
            'state': 'active'
        }
        onshub_res = self.loader._package_search(onshub_search)
        log.info('ONSHUB records: %i', onshub_res['count'])
        merged_names = set()
        for pkg_ref in onshub_res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in merged_names:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue
            if not self.loader._pkg_matches_search_options(pkg,
                                                           onshub_search):
                log.error(
                    merge_stats.add('Did not match ONSHUB search after all',
                                    pkg['name']))
                continue

            # look for duplicates
            if pkg['groups']:
                first_group = pkg['groups'][0]
            else:
                first_group = ''
            dupe_search = {
                'title': pkg['title'],
                'groups': first_group,
                'external_reference': 'ONSHUB',
                'state': 'active'
            }
            dupe_res = self.loader._package_search(dupe_search)
            if not dupe_res['count']:
                # The search should at least have returned pkg itself
                log.error(merge_stats.add('Could not find itself',
                                          pkg['name']))
                continue

            dupe_pkgs = []
            for dupe_ref in dupe_res['results']:
                candidate = self.loader._get_package(dupe_ref)
                if candidate['name'] == pkg['name']:
                    continue
                if self.loader._pkg_matches_search_options(candidate,
                                                           dupe_search):
                    dupe_pkgs.append(candidate)
                else:
                    log.warn('Did not match duplicate search after all %s %s',
                             pkg['name'], candidate['name'])

            if not dupe_pkgs:
                log.info(merge_stats.add('No duplicates', pkg['name']))
                continue

            log.info('Found duplicates for %s: %r', pkg['name'],
                     [dupe['name'] for dupe in dupe_pkgs])
            # Fix duplicates
            merge_stats.add(
                '%i duplicates found and merged' % len(dupe_pkgs),
                pkg['name'])
            for dupe in dupe_pkgs:
                merged_names.add(dupe['name'])
            self.do_merge(pkg, dupe_pkgs)

        print(merge_stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 13
0
def reconcile_aliases_that_match_entities_exactly():
    '''Link unmatched aliases to the entity that has exactly the same name.

    Entities added with this tool may also still be sitting in the recon
    queue as aliases. Each unmatched alias is looked up by name and, when
    the lookup returns an entity, the alias is matched to it.

    (Ideally we'd just delete the alias from the recon queue, but there is
    no delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        try:
            found = nk_dataset.lookup_detailed(alias.name, readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            found = None

        # Only a lookup result that is an entity can be linked to
        if not (found and isinstance(found, nomenklatura.Entity)):
            print(stats.add('No matching entity', alias.name))
            continue

        try:
            nk_dataset.match(alias_id=alias.id, entity_id=found.id)
        except requests.exceptions.HTTPError as e:
            # Seem to get occasional 502s due to overloading
            print(stats.add(
                'Server error linking the alias to an entity: %s' % e,
                alias.name))
        else:
            print(stats.add('Matched alias to an entity of the same name',
                            alias.name))
Exemplo n.º 14
0
def reconcile_aliases_that_match_entities_exactly():
    '''Match aliases in the recon queue to entities of the same name.

    When entities are added using this tool they might also currently be
    in the recon queue. For every unmatched alias whose name lookup
    returns an entity, the two are linked up.

    (Ideally we'd just delete the alias from the recon queue, but there is
    no delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        try:
            lookup = nk_dataset.lookup_detailed(alias.name, readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            lookup = None

        if lookup and isinstance(lookup, nomenklatura.Entity):
            try:
                nk_dataset.match(alias_id=alias.id, entity_id=lookup.id)
            except requests.exceptions.HTTPError as e:
                # Seem to get occasional 502s due to overloading
                outcome = 'Server error linking the alias to an entity: %s' % e
                print(stats.add(outcome, alias.name))
                continue
            outcome = 'Matched alias to an entity of the same name'
        else:
            outcome = 'No matching entity'
        print(stats.add(outcome, alias.name))
Exemplo n.º 15
0
    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". These cannot be added
        to by the ons_loader in the future because of this title change so
        remove the prefix.
        e.g. scientific_procedures_on_living_animals_great_britain

        The search asks for at most 100 rows and is not repeated, so a
        single run handles at most that many datasets. Nothing is written
        when self.dry_run is set.
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '

        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB \"%s\"' % prefix,
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            # Guard against a missing (None) title
            title = pkg['title'] or ''
            if not title.startswith(prefix):
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = title[len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    # These messages used to talk about a "publisher",
                    # which is not what this method writes.
                    log.error('Error (%s) correcting title over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing title over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Remove prefix', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 16
0
    def add_missing_publisher(self):
        '''Give ONSHUB datasets that have no publisher (group) one.

        The publisher is derived from the "Source agency: ..." line in the
        dataset's notes, mapped with OnsImporter._source_to_publisher_().
        The search asks for at most 100 rows and is not repeated, so a
        single run handles at most that many datasets. Nothing is written
        when self.dry_run is set.
        '''
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='external_reference:ONSHUB !groups:["" TO *]',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            # Guard against missing (None) notes, which re cannot search
            match = source_agency_re.search(pkg['notes'] or '')
            if not match:
                log.error(
                    stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.group(1)
            publisher_name = OnsImporter._source_to_publisher_(
                source_agency, self.client)
            if not publisher_name:
                log.error(
                    stats.add('Could not map source agency %s' % source_agency,
                              pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            # Log the success too, as is done for the other outcomes
            log.info(stats.add('Added publisher %s' % publisher_name,
                               pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 17
0
    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.

        Candidates are found by searching for datasets that lack the extra
        but mention "Source agency". The search asks for at most 100 rows
        and is not repeated, so a single run handles at most that many
        datasets. Nothing is written when self.dry_run is set.
        '''
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='!external_reference:ONSHUB \"Source agency\"',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            # Guard against missing (None) notes, which re cannot search
            if not source_agency_re.search(pkg['notes'] or ''):
                log.error(
                    stats.add(
                        'Could not find "Source agency: " line after all',
                        pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    # These messages used to talk about a "publisher",
                    # which is not what this method writes.
                    log.error('Error (%s) adding extra over API: %s',
                              self.client.last_status,
                              self.client.last_message)
                    stats.add(
                        'Error writing extra over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 18
0
    def merge_duplicates(self):
        '''Merge ONSHUB datasets that share a title and first group.

        Each active ONSHUB dataset is used as the basis of a second search
        (same title, its first group, external_reference=ONSHUB, active).
        Other datasets returned by that search which genuinely match its
        options are passed, along with the original, to self.do_merge().
        Names of datasets merged in this way are remembered so they are
        skipped if met later in the outer loop. A summary is printed at
        the end.
        '''
        merge_stats = StatsList()

        base_options = {'external_reference': 'ONSHUB',
                        'state': 'active'}
        base_res = self.loader._package_search(base_options)
        log.info('ONSHUB records: %i', base_res['count'])
        pkgs_already_merged = set()
        matches = self.loader._pkg_matches_search_options
        for pkg_ref in base_res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in pkgs_already_merged:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue
            if not matches(pkg, base_options):
                log.error(merge_stats.add(
                    'Did not match ONSHUB search after all', pkg['name']))
                continue

            # look for duplicates
            dupe_options = {
                'title': pkg['title'],
                'groups': pkg['groups'][0] if pkg['groups'] else '',
                'external_reference': 'ONSHUB',
                'state': 'active',
            }
            dupe_res = self.loader._package_search(dupe_options)
            if not dupe_res['count']:
                log.error(merge_stats.add('Could not find itself',
                                          pkg['name']))
                continue

            dupe_pkgs = []
            for dupe_pkg_ref in dupe_res['results']:
                dupe_pkg = self.loader._get_package(dupe_pkg_ref)
                # The dataset itself is expected among the results
                if dupe_pkg['name'] == pkg['name']:
                    continue
                if not matches(dupe_pkg, dupe_options):
                    log.warn('Did not match duplicate search after all %s %s',
                             pkg['name'], dupe_pkg['name'])
                    continue
                dupe_pkgs.append(dupe_pkg)

            if not dupe_pkgs:
                log.info(merge_stats.add('No duplicates', pkg['name']))
                continue

            dupe_names = [dupe_pkg['name'] for dupe_pkg in dupe_pkgs]
            log.info('Found duplicates for %s: %r', pkg['name'], dupe_names)
            # Fix duplicates
            merge_stats.add('%i duplicates found and merged' % len(dupe_pkgs),
                            pkg['name'])
            for dupe_name in dupe_names:
                pkgs_already_merged.add(dupe_name)
            self.do_merge(pkg, dupe_pkgs)

        print(merge_stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')
Exemplo n.º 19
0
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government': 'level_of_government',
    }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis',
        'Crown Copyright': 'canada-crown',
    }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
Exemplo n.º 20
0
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government':'level_of_government',
        }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp':'geogratis',
        'Crown Copyright':'canada-crown',
        }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
Exemplo n.º 21
0
    def add_missing_publisher(self):
        stats = StatsList()

        res = self.client.action('package_search', q='external_reference:ONSHUB !groups:["" TO *]', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False)
        
        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.groups()[0]
            publisher_name = OnsImporter._source_to_publisher_(source_agency, self.client)
            if not publisher_name:
                log.error(stats.add('Could not map source agency %s' % source_agency, pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Exemplo n.º 22
0
def no_current_packages(options):
    pkgs = _get_packages('active', options)
    stats = StatsList()
    need_to_commit = False
    for pkg in pkgs:
        latest_pkg_rev = \
            model.Session.query(model.PackageRevision) \
            .filter_by(id=pkg.id) \
            .order_by(model.PackageRevision.revision_timestamp.desc()) \
            .first()
        # sometimes a revision_timestamp is null for some reason
        if latest_pkg_rev.revision_timestamp is None:
            # in which case, join them to the revision table and order by those
            # timestamps instead
            latest_pkg_rev = \
                model.Session.query(model.PackageRevision) \
                .filter_by(id=pkg.id) \
                .join(model.Revision) \
                .order_by(model.Revision.timestamp.desc()) \
                .first()

        if not latest_pkg_rev.current:
            print stats.add('No current revision', pkg.name)
            if options.write:
                latest_pkg_rev.current = True
                need_to_commit = True
        else:
            stats.add('Ok', pkg.name)
        if latest_pkg_rev.revision_id != pkg.revision_id:
            print stats.add('Revision ID of package too old', pkg.name)
            if options.write:
                pkg.revision_id = latest_pkg_rev.revision_id
                need_to_commit = True

    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    print
    def fetch(cls, site_url_filter, since_datetime):
        """Read certificate entries from the feed and store them on datasets.

        Each entry yielded by ``client.generate_entries`` is skipped (and
        counted) unless its "about" link is present, matches
        ``site_url_filter``, contains '/dataset/' and resolves to a package.
        The badge data for the remaining entries is serialised to JSON and
        saved in the package's 'odi-certificate' extra when it differs from
        what is already stored.

        :param site_url_filter: object whose ``search(about)`` result is
            truthy for links pointing at this site (e.g. a compiled regex).
        :param since_datetime: passed to ``generate_entries`` as ``since``.
        """
        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                detail = '%s "%s" %s' % (about, entry['title'], entry['id'])
                log.debug(stats.add(
                    'Ignore - no rel="about" specifying the dataset', detail))
                continue

            if not site_url_filter.search(about):
                detail = '%s "%s" %s' % (about, entry['title'], entry['id'])
                log.debug(stats.add(
                    'Ignore - "about" field does not reference this site',
                    detail))
                continue

            if '/dataset/' not in about:
                detail = '%s "%s" %s' % (about, about, entry['id'])
                log.debug(stats.add(
                    'Ignore - is "about" DGU but not a dataset', detail))
                continue

            pkg = cls._get_package_from_url(about)
            if not pkg:
                detail = '%s "%s" %s %r' % (about, about, entry['id'], about)
                log.error(stats.add('Unable to find the package', detail))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                detail = '%s "%s" %s' % (about, entry['title'], entry['id'])
                log.info(stats.add(
                    'Error fetching badge data - skipped', detail))
                continue
            # e.g. 'Basic Level Certificate'
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                log.debug(stats.add('Certificate unchanged',
                                    badge_data['certificate_url']))
                continue

            # Decide the wording before the extra is written.
            if 'odi-certificate' in pkg.extras:
                operation = 'updated'
            else:
                operation = 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = badge_json
            detail = '"%s" %s' % (badge_data['title'],
                                  badge_data['certificate_url'])
            log.debug(stats.add('Certificate %s' % operation, detail))
            model.Session.commit()

        log.info('Summary:\n' + stats.report())
Exemplo n.º 24
0
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring', publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over', publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - created the entity')
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
Exemplo n.º 25
0
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias,
                                          nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID',
                                publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias,
                                            nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add(
                        'Matched an alias for an entity which already has an ID - ignoring',
                        publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'],
                                             data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id,
                                         entity_id=entity.id)
                        print stats.add(
                            'Matched an alias for an entity - swapped them over',
                            publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add(
                    'Matched an alias without a matching entity - created the entity'
                )
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
Exemplo n.º 26
0
def bulk_action(action=None,
                filepath=None,
                entity_or_alias_names=None,
                entities=True,
                aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(
                nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplemented
Exemplo n.º 27
0
    def command(self):
        """Fetch ODI certificate entries and store them on matching datasets.

        Loads the configuration, binds the database session and sets up
        logging, then walks every entry yielded by
        ``client.generate_entries``. Entries are skipped (and counted)
        unless their "about" link is present, starts with this site's URL
        (with or without "www."), contains '/dataset/' and resolves to a
        package. The badge data of the remaining entries is serialised to
        JSON and saved in the package's 'odi-certificate' extra when it
        differs from what is stored. A summary is logged at the end.
        """
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        full_site_url = site_url
        if '//www.' not in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.')

        from running_stats import StatsList
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be missing, if so
            # we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not about.startswith((site_url, full_site_url)):
                self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if '/dataset/' not in about:
                self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            pkg = self._get_package_from_url(about)
            if not pkg:
                self.log.error(stats.add('Unable to find the package',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            if not badge_data:
                # Without badge data there is nothing to store; skip the
                # entry instead of failing on the item assignment below.
                self.log.info(stats.add('Error fetching badge data - skipped',
                                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(stats.add('Certificate unchanged',
                                         badge_data['certificate_url']))
            else:
                # Work out the wording BEFORE writing the extra; checking
                # afterwards would always report 'updated'.
                operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = badge_json
                self.log.debug(stats.add('Certificate %s' % operation,
                               '"%s" %s' % (badge_data['title'],
                                            badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())
        else:
            users_without_email.append(user)
    if not users_with_email:
        if editors:
            warning += 'There is an editor(s) but not email addresses for them.'
        else:
            warning += 'There are no editors.'
    else:
        warning = None
    emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user)) \
                        for user in users_with_email])
    names_without_email = ', '.join([user.fullname or user.name\
                                     for user in users_without_email])
    if warning:
        print pub_stats.add(
            '%s without emails: %s' % ('PCT' if is_pct else 'Trust', warning),
            pub.title)
    else:
        print pub_stats.add('%s with emails' % 'PCT' if is_pct else 'Trust',
                            pub.title)
    row = ('PCT' if is_pct else '', pub.title, pub.name, emails, warning)
    if is_pct:
        pct_rows.append(row)
    else:
        non_pct_rows.append(row)

print pub_stats.report()

filename = 'nhs_emails.csv'
with open(filename, 'wb') as csvfile:
    csv_writer = csv.writer(csvfile,
Exemplo n.º 29
0
def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []
    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)
    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action=='invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplemented
Exemplo n.º 30
0
            users_with_email.append(user)
        else:
            users_without_email.append(user)
    if not users_with_email:
        if editors:
            warning += 'There is an editor(s) but not email addresses for them.'
        else:
            warning += 'There are no editors.'
    else:
        warning = None
    emails = ', '.join(['%s <%s>' % (user.fullname, get_email_for_user(user)) \
                        for user in users_with_email])
    names_without_email = ', '.join([user.fullname or user.name\
                                     for user in users_without_email])
    if warning:
        print pub_stats.add('%s without emails: %s' % ('PCT' if is_pct else 'Trust', warning), pub.title)
    else:
        print pub_stats.add('%s with emails' % 'PCT' if is_pct else 'Trust', pub.title)
    row = ('PCT' if is_pct else '',
           pub.title, pub.name, emails, warning)
    if is_pct:
        pct_rows.append(row)
    else:
        non_pct_rows.append(row)

print pub_stats.report()

filename = 'nhs_emails.csv'
with open(filename, 'wb') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
def command(dry_run=False):
    """Tidy up the fields of every active package.

    For each active package (processed in name order) this:

    * merges groups of existing fields into a single destination extra, as
      described by the module-level ``field_map`` (an iterable of existing
      field names mapped to the destination extra key);
    * adds the package URL as a documentation resource;
    * removes the extras listed in the module-level ``delete_fields``.

    All changes share one revision and are committed once at the end. A
    summary of what was done (or would have been done) is logged.

    :param dry_run: if true, nothing is written to the database - the
        actions are only counted and reported.
    """
    from ckan import model

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    global_log.info('Tidying package fields')

    stats = StatsList()

    if not dry_run:
        rev = model.repo.new_revision()
        rev.message = 'Package fields migration'

    packages = model.Session.query(model.Package)\
        .filter_by(state='active')\
        .order_by(model.Package.name)
    for pkg in packages:
        # field map
        for existing_fields, destination_field in field_map.items():
            if pkg.extras.get(destination_field):
                # Destination is already populated - leave it alone.
                continue

            # Take the first existing field that has a non-blank value.
            # (This used to `continue` rather than `break`, so a later,
            # empty field overwrote the hit and the value was lost when
            # the existing fields were deleted below.)
            value = ''
            for existing_field in existing_fields:
                if hasattr(pkg, existing_field):
                    candidate = getattr(pkg, existing_field)
                else:
                    candidate = pkg.extras.get(existing_field)
                candidate = candidate.strip() if candidate else ''
                if candidate:
                    value = candidate
                    break

            if not dry_run:
                pkg.extras[destination_field] = value
                # delete existing field values
                for existing_field in existing_fields:
                    if hasattr(pkg, existing_field):
                        setattr(pkg, existing_field, '')
                    elif existing_field in pkg.extras:
                        del pkg.extras[existing_field]
            if value:
                stats.add('Merged to field "%s"' % destination_field, pkg.name)
            else:
                stats.add('Not merged to field "%s"' % destination_field, pkg.name)

        # move url to additional resource
        if pkg.url:
            if not dry_run:
                if not pkg.resource_groups:
                    pkg.resource_groups.append(
                        model.ResourceGroup(label="default"))
                res_group = pkg.resource_groups[0]
                res = model.Resource(format='HTML', resource_type='documentation',
                                     url=pkg.url, description='Web page about the data')
                res_group.resources.append(res)
                model.Session.add(res)
                # NOTE: pkg.url itself is left in place; only a resource
                # pointing at the same URL is added.
            stats.add('URL moved to additional resource', pkg.name)
        else:
            stats.add('No URL to move to additional resource', pkg.name)

        # delete fields
        for field in delete_fields:
            if field in pkg.extras:
                if not dry_run:
                    del pkg.extras[field]
                stats.add('Deleted field "%s"' % field, pkg.name)
            else:
                stats.add('No field to delete "%s"' % field, pkg.name)

    if not dry_run:
        model.repo.commit_and_remove()

    global_log.info(stats.report())
Exemplo n.º 32
0
    def command(self):
        """Record ODI certificates against the datasets they describe.

        Iterates over every entry yielded by ``client.generate_entries``
        (the ODI Atom feed). Entries whose "about" link is missing, points
        at another site, is not a dataset URL, or cannot be matched to a
        package are counted and skipped. For the rest, the badge data is
        fetched and stored as JSON in the package's ``odi-certificate``
        extra if it differs from what is already stored. A summary of all
        outcomes is logged at the end.
        """
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        # Only the first '//' (the scheme separator) is rewritten.
        full_site_url = site_url
        if '//www.' not in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.', 1)

        from running_stats import StatsList
        stats = StatsList()

        def describe(entry, about):
            # Identifier used in the stats for entries that get skipped.
            return '%s "%s" %s' % (about, entry['title'], entry['id'])

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(
                    stats.add(
                        'Ignore - no rel="about" specifying the dataset',
                        describe(entry, about)))
                continue

            if not (about.startswith(site_url) or
                    about.startswith(full_site_url)):
                self.log.debug(
                    stats.add(
                        'Ignore - "about" field does not reference this site',
                        describe(entry, about)))
                continue

            if '/dataset/' not in about:
                self.log.debug(
                    stats.add(
                        'Ignore - is "about" DGU but not a dataset',
                        describe(entry, about)))
                continue

            pkg = self._get_package_from_url(about)
            if not pkg:
                self.log.error(
                    stats.add(
                        'Unable to find the package',
                        describe(entry, about)))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(
                    stats.add('Certificate unchanged',
                              badge_data['certificate_url']))
            else:
                # Decide added/updated BEFORE writing the extra, otherwise
                # the key is always present and every change is reported
                # as 'updated'.
                operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
                model.repo.new_revision()
                pkg.extras['odi-certificate'] = badge_json
                self.log.debug(
                    stats.add(
                        'Certificate %s' % operation, '"%s" %s' %
                        (badge_data['title'], badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())