Exemplo n.º 1
0
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.
    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title' 'CO2 monthly', 'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(
        model.PackageTag.id == None  # noqa: E711
    )
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        )) for pkg in q.slice(0, 100)
    ]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    packages_without_tags_percent = lib.percent(len(tagless_pkgs),
                                                num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
Exemplo n.º 2
0
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.
    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******', 'created': '2008-06-13T10:24:59.435631'},  # noqa
            {'name': 'co2-monthly', 'title' 'CO2 monthly', 'notes': '', 'user': '******', 'created': '2009-12-14T08:42:45.473827'},  # noqa
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
             .outerjoin(model.PackageTag) \
             .filter(model.PackageTag.id == None)
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    tagless_pkgs = [OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
            )) for pkg in q.slice(0, 100)]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    packages_without_tags_percent = lib.percent(
        len(tagless_pkgs), num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
Exemplo n.º 3
0
def broken_links_index(include_sub_organizations=False):
    '''Returns the count of broken links for all organizations.'''

    from ckanext.archiver.model import Archival

    counts = {}
    # Get all the broken datasets and build up the results by org
    orgs = model.Session.query(model.Group)\
        .filter(model.Group.type == 'organization')\
        .filter(model.Group.state == 'active').all()
    for org in add_progress_bar(
            orgs, 'Part 1/2' if include_sub_organizations else None):
        archivals = model.Session.query(Archival)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.owner_org == org.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')
        broken_resources = archivals.count()
        broken_datasets = archivals.distinct(model.Package.id).count()
        num_datasets = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')\
            .count()
        num_resources = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')
        if p.toolkit.check_ckan_version(max_version='2.2.99'):
            num_resources = num_resources.join(model.ResourceGroup)
        num_resources = num_resources \
            .join(model.Resource)\
            .filter_by(state='active')\
            .count()
        counts[org.name] = {
            'organization_title': org.title,
            'broken_packages': broken_datasets,
            'broken_resources': broken_resources,
            'packages': num_datasets,
            'resources': num_resources
        }

    counts_with_sub_orgs = copy.deepcopy(counts)  # new dict
    if include_sub_organizations:
        for org_name in add_progress_bar(counts_with_sub_orgs, 'Part 2/2'):
            org = model.Group.by_name(org_name)

            for sub_org_id, sub_org_name, sub_org_title, sub_org_parent_id \
                    in org.get_children_group_hierarchy(type='organization'):
                if sub_org_name not in counts:
                    # occurs only if there is an organization created since the last loop?
                    continue
                counts_with_sub_orgs[org_name]['broken_packages'] += \
                        counts[sub_org_name]['broken_packages']
                counts_with_sub_orgs[org_name]['broken_resources'] += \
                        counts[sub_org_name]['broken_resources']
                counts_with_sub_orgs[org_name]['packages'] += \
                        counts[sub_org_name]['packages']
                counts_with_sub_orgs[org_name]['resources'] += \
                        counts[sub_org_name]['resources']
        results = counts_with_sub_orgs
    else:
        results = counts

    data = []
    num_broken_packages = 0
    num_broken_resources = 0
    num_packages = 0
    num_resources = 0
    for org_name, org_counts in results.iteritems():
        data.append(OrderedDict((
            ('organization_title', results[org_name]['organization_title']),
            ('organization_name', org_name),
            ('package_count', org_counts['packages']),
            ('resource_count', org_counts['resources']),
            ('broken_package_count', org_counts['broken_packages']),
            ('broken_package_percent', lib.percent(org_counts['broken_packages'], org_counts['packages'])),
            ('broken_resource_count', org_counts['broken_resources']),
            ('broken_resource_percent', lib.percent(org_counts['broken_resources'], org_counts['resources'])),
            )))
        # Totals - always use the counts, rather than counts_with_sub_orgs, to
        # avoid counting a package in both its org and parent org
        org_counts_ = counts[org_name]
        num_broken_packages += org_counts_['broken_packages']
        num_broken_resources += org_counts_['broken_resources']
        num_packages += org_counts_['packages']
        num_resources += org_counts_['resources']

    data.sort(key=lambda x: (-x['broken_package_count'],
                             -x['broken_resource_count']))

    return {'table': data,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            }
Exemplo n.º 4
0
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the organization
    or if organization it returns the index page for all organizations.

    params:
      organization - name of an organization

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title:': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason', 'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]

    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's tree
        org_ids = ['%s' % organization.id for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group, model.Package.owner_org == model.Group.id)

    results = []

    for archival, pkg, org in archivals.all():
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        archived_resource = model.Session.query(model.ResourceRevision)\
                            .filter_by(id=resource.id)\
                            .filter_by(revision_timestamp=archival.resource_timestamp)\
                            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat() if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat() if archival.updated else None),
            ('last_success', archival.last_success.isoformat() if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
            ))

        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
                        .filter(model.Package.owner_org.in_(org_ids))\
                        .filter_by(state='active')\
                        .count()
    num_resources = model.Session.query(model.Resource)\
                         .filter_by(state='active')
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            'table': results}
Exemplo n.º 5
0
def broken_links_index(include_sub_organizations=False):
    '''Returns the count of broken links for all organizations.'''

    from ckanext.archiver.model import Archival

    counts = {}
    # Get all the broken datasets and build up the results by org
    orgs = model.Session.query(model.Group)\
        .filter(model.Group.type == 'organization')\
        .filter(model.Group.state == 'active').all()
    for org in add_progress_bar(
            orgs, 'Part 1/2' if include_sub_organizations else None):
        archivals = model.Session.query(Archival)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.owner_org == org.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')
        broken_resources = archivals.count()
        broken_datasets = archivals.distinct(model.Package.id).count()
        num_datasets = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')\
            .count()
        num_resources = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')
        if p.toolkit.check_ckan_version(max_version='2.2.99'):
            num_resources = num_resources.join(model.ResourceGroup)
        num_resources = num_resources \
            .join(model.Resource)\
            .filter_by(state='active')\
            .count()
        counts[org.name] = {
            'organization_title': org.title,
            'broken_packages': broken_datasets,
            'broken_resources': broken_resources,
            'packages': num_datasets,
            'resources': num_resources
        }

    counts_with_sub_orgs = copy.deepcopy(counts)  # new dict
    if include_sub_organizations:
        for org_name in add_progress_bar(counts_with_sub_orgs, 'Part 2/2'):
            org = model.Group.by_name(org_name)

            for sub_org_id, sub_org_name, sub_org_title, sub_org_parent_id \
                    in org.get_children_group_hierarchy(type='organization'):
                if sub_org_name not in counts:
                    # occurs only if there is an organization created since the last loop?
                    continue
                counts_with_sub_orgs[org_name]['broken_packages'] += \
                        counts[sub_org_name]['broken_packages']
                counts_with_sub_orgs[org_name]['broken_resources'] += \
                        counts[sub_org_name]['broken_resources']
                counts_with_sub_orgs[org_name]['packages'] += \
                        counts[sub_org_name]['packages']
                counts_with_sub_orgs[org_name]['resources'] += \
                        counts[sub_org_name]['resources']
        results = counts_with_sub_orgs
    else:
        results = counts

    data = []
    num_broken_packages = 0
    num_broken_resources = 0
    num_packages = 0
    num_resources = 0
    for org_name, org_counts in results.iteritems():
        data.append(OrderedDict((
            ('organization_title', results[org_name]['organization_title']),
            ('organization_name', org_name),
            ('package_count', org_counts['packages']),
            ('resource_count', org_counts['resources']),
            ('broken_package_count', org_counts['broken_packages']),
            ('broken_package_percent', lib.percent(org_counts['broken_packages'], org_counts['packages'])),
            ('broken_resource_count', org_counts['broken_resources']),
            ('broken_resource_percent', lib.percent(org_counts['broken_resources'], org_counts['resources'])),
            )))
        # Totals - always use the counts, rather than counts_with_sub_orgs, to
        # avoid counting a package in both its org and parent org
        org_counts_ = counts[org_name]
        num_broken_packages += org_counts_['broken_packages']
        num_broken_resources += org_counts_['broken_resources']
        num_packages += org_counts_['packages']
        num_resources += org_counts_['resources']

    data.sort(key=lambda x: (-x['broken_package_count'],
                             -x['broken_resource_count']))

    return {'table': data,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            }
Exemplo n.º 6
0
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the organization
    or if organization it returns the index page for all organizations.

    params:
      organization - name of an organization

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title:': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason', 'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]

    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's tree
        org_ids = ['%s' % organization.id for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group, model.Package.owner_org == model.Group.id)

    results = []

    for archival, pkg, org in archivals.all():
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        archived_resource = model.Session.query(model.ResourceRevision)\
                            .filter_by(id=resource.id)\
                            .filter_by(revision_timestamp=archival.resource_timestamp)\
                            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat() if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat() if archival.updated else None),
            ('last_success', archival.last_success.isoformat() if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
            ))

        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
                        .filter(model.Package.owner_org.in_(org_ids))\
                        .filter_by(state='active')\
                        .count()
    num_resources = model.Session.query(model.Resource)\
                         .filter_by(state='active')
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            'table': results}