def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    :param organization: organization to restrict the report to. A falsy
        value skips the organization filter on the tagless query.
    :param include_sub_organizations: forwarded to
        lib.filter_by_organizations together with the organization.

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels',
             'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly',
             'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }

    'table' holds at most the first 100 tagless packages, whereas
    'packages_without_tags_percent' is based on the full tagless count.
    '''
    # NOTE(review): this function is defined twice in this file; the later
    # definition replaces this one at import time - confirm which to keep.

    # Find the packages without tags: after the outer join, a package with
    # no PackageTag row has a NULL PackageTag.id.
    tagless_q = (
        model.Session.query(model.Package)
        .outerjoin(model.PackageTag)
        .filter(model.PackageTag.id == None)  # noqa: E711
    )
    # NOTE(review): the second query below calls the helper without this
    # guard; presumably it tolerates a falsy organization - confirm.
    if organization:
        tagless_q = lib.filter_by_organizations(
            tagless_q, organization, include_sub_organizations)

    # Count BEFORE slicing, so the percentage is not capped by the
    # 100-row limit applied to the table.
    num_tagless = tagless_q.count()

    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        ))
        for pkg in tagless_q.slice(0, 100)  # First 100 only for this demo
    ]

    # Average number of tags per package
    pkg_q = model.Session.query(model.Package)
    pkg_q = lib.filter_by_organizations(
        pkg_q, organization, include_sub_organizations)
    num_packages = pkg_q.count()
    # The inner join yields one row per (package, tag) pair
    num_taggings = pkg_q.join(model.PackageTag).count()
    if num_packages:
        average_tags_per_package = round(
            float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None

    packages_without_tags_percent = lib.percent(num_tagless, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    :param organization: organization to restrict the report to. A falsy
        value skips the organization filter on the tagless query.
    :param include_sub_organizations: forwarded to
        lib.filter_by_organizations together with the organization.

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels',
             'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly',
             'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }

    'table' holds at most the first 100 tagless packages, whereas
    'packages_without_tags_percent' is based on the full tagless count.
    '''
    # Find the packages without tags: after the outer join, a package with
    # no PackageTag row has a NULL PackageTag.id.
    tagless_query = (
        model.Session.query(model.Package)
        .outerjoin(model.PackageTag)
        .filter(model.PackageTag.id == None)  # noqa: E711
    )
    # NOTE(review): the second query below calls the helper without this
    # guard; presumably it tolerates a falsy organization - confirm.
    if organization:
        tagless_query = lib.filter_by_organizations(
            tagless_query, organization, include_sub_organizations)

    # The table is limited to 100 rows, so the total must be counted on
    # the unsliced query or the percentage would be capped as well.
    num_tagless_packages = tagless_query.count()

    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        ))
        for pkg in tagless_query.slice(0, 100)  # First 100 only for this demo
    ]

    # Average number of tags per package
    package_query = model.Session.query(model.Package)
    package_query = lib.filter_by_organizations(
        package_query, organization, include_sub_organizations)
    num_packages = package_query.count()
    # The inner join yields one row per (package, tag) pair
    num_taggings = package_query.join(model.PackageTag).count()
    if num_packages:
        average_tags_per_package = round(
            float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None

    packages_without_tags_percent = lib.percent(
        num_tagless_packages, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def broken_links_index(include_sub_organizations=False):
    '''Returns the count of broken links for all organizations.

    :param include_sub_organizations: if true, each organization's row also
        includes the counts of the organizations returned by its
        get_children_group_hierarchy(type='organization').

    Returns a dict with:
      'table' - one OrderedDict per active organization (title, name,
          package/resource counts, broken counts and percentages), sorted
          by broken package count then broken resource count, descending
      'num_broken_packages', 'num_broken_resources', 'num_packages',
      'num_resources', 'broken_package_percent',
      'broken_resource_percent' - totals over every organization's own
          counts (never the sub-organization roll-ups)
    '''
    from ckanext.archiver.model import Archival

    count_keys = ('broken_packages', 'broken_resources',
                  'packages', 'resources')

    # Loop-invariant: on CKAN <= 2.2 the Package -> Resource join has to go
    # via ResourceGroup.
    join_via_resource_group = \
        p.toolkit.check_ckan_version(max_version='2.2.99')

    counts = {}
    # Get all the broken datasets and build up the results by org
    orgs = (
        model.Session.query(model.Group)
        .filter(model.Group.type == 'organization')
        .filter(model.Group.state == 'active')
        .all()
    )
    for org in add_progress_bar(
            orgs, 'Part 1/2' if include_sub_organizations else None):
        archivals = (
            model.Session.query(Archival)
            .filter(Archival.is_broken == True)  # noqa: E712
            .join(model.Package, Archival.package_id == model.Package.id)
            .filter(model.Package.owner_org == org.id)
            .filter(model.Package.state == 'active')
            .join(model.Resource, Archival.resource_id == model.Resource.id)
            .filter(model.Resource.state == 'active')
        )
        broken_resources = archivals.count()
        broken_datasets = archivals.distinct(model.Package.id).count()

        num_datasets = (
            model.Session.query(model.Package)
            .filter_by(owner_org=org.id)
            .filter_by(state='active')
            .count()
        )

        resources_q = (
            model.Session.query(model.Package)
            .filter_by(owner_org=org.id)
            .filter_by(state='active')
        )
        if join_via_resource_group:
            resources_q = resources_q.join(model.ResourceGroup)
        # filter_by() after the join applies to the joined Resource
        num_resources = (
            resources_q
            .join(model.Resource)
            .filter_by(state='active')
            .count()
        )

        counts[org.name] = {
            'organization_title': org.title,
            'broken_packages': broken_datasets,
            'broken_resources': broken_resources,
            'packages': num_datasets,
            'resources': num_resources
        }

    counts_with_sub_orgs = copy.deepcopy(counts)  # new dict
    if include_sub_organizations:
        for org_name in add_progress_bar(counts_with_sub_orgs, 'Part 2/2'):
            org = model.Group.by_name(org_name)
            rolled_up = counts_with_sub_orgs[org_name]

            for _sub_id, sub_org_name, _sub_title, _sub_parent_id \
                    in org.get_children_group_hierarchy(type='organization'):
                if sub_org_name not in counts:
                    # occurs only if there is an organization created since
                    # the last loop?
                    continue
                sub_counts = counts[sub_org_name]
                for key in count_keys:
                    rolled_up[key] += sub_counts[key]
        results = counts_with_sub_orgs
    else:
        results = counts

    data = []
    num_broken_packages = 0
    num_broken_resources = 0
    num_packages = 0
    num_resources = 0
    # .items() (not .iteritems()) so this also runs on Python 3
    for org_name, org_counts in results.items():
        data.append(OrderedDict((
            ('organization_title', org_counts['organization_title']),
            ('organization_name', org_name),
            ('package_count', org_counts['packages']),
            ('resource_count', org_counts['resources']),
            ('broken_package_count', org_counts['broken_packages']),
            ('broken_package_percent',
             lib.percent(org_counts['broken_packages'],
                         org_counts['packages'])),
            ('broken_resource_count', org_counts['broken_resources']),
            ('broken_resource_percent',
             lib.percent(org_counts['broken_resources'],
                         org_counts['resources'])),
        )))
        # Totals - always use the counts, rather than counts_with_sub_orgs,
        # to avoid counting a package in both its org and parent org
        org_counts_ = counts[org_name]
        num_broken_packages += org_counts_['broken_packages']
        num_broken_resources += org_counts_['broken_resources']
        num_packages += org_counts_['packages']
        num_resources += org_counts_['resources']

    # Worst offenders first
    data.sort(key=lambda x: (-x['broken_package_count'],
                             -x['broken_resource_count']))

    return {'table': data,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages,
                                                  num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources,
                                                   num_resources),
            }
def broken_links_for_organization(organization,
                                  include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the
    organization.

    :param organization: reference passed to model.Group.get(), e.g. the
        name of an organization
    :param include_sub_organizations: if true, also report on packages
        owned by any organization returned by lib.go_down_tree() for the
        organization
    :raises ObjectNotFound: (p.toolkit.ObjectNotFound) if model.Group.get()
        does not find the organization

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title': 'Cabinet Office',
     'num_broken_packages': ..., 'num_broken_resources': ...,
     'num_packages': ..., 'num_resources': ...,
     'broken_package_percent': ..., 'broken_resource_percent': ...,
     'table': [
       {'dataset_title', 'dataset_name', 'dataset_notes',
        'organization_title', 'organization_name',
        'resource_position', 'resource_id', 'resource_url',
        'url_up_to_date', 'via', 'first_failure', 'last_updated',
        'last_success', 'url_redirected_to', 'reason', 'status',
        'failure_count'}
       ...]
    }
    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    archivals = (
        model.Session.query(Archival, model.Package, model.Group)
        .filter(Archival.is_broken == True)  # noqa: E712
        .join(model.Package, Archival.package_id == model.Package.id)
        .filter(model.Package.state == 'active')
        .join(model.Resource, Archival.resource_id == model.Resource.id)
        .filter(model.Resource.state == 'active')
    )

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's
        # tree
        org_ids = ['%s' % tree_org.id for tree_org in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(
        model.Group, model.Package.owner_org == model.Group.id)

    results = []
    # pkg comes straight from the query row (joined on
    # Archival.package_id == Package.id), so no separate lookup is needed.
    # pkg_org is the package's owning organization, which differs from
    # `org` when sub-organizations are included.
    for archival, pkg, pkg_org in archivals.all():
        resource = model.Resource.get(archival.resource_id)

        via = ''
        # `or ''` guards against the extra being present but set to None
        er = pkg.extras.get('external_reference') or ''
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        # Prefer the resource revision matching the archival's
        # resource_timestamp; fall back to the current resource.
        archived_resource = (
            model.Session.query(model.ResourceRevision)
            .filter_by(id=resource.id)
            .filter_by(revision_timestamp=archival.resource_timestamp)
            .first()
        ) or resource

        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', pkg_org.title),
            ('organization_name', pkg_org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure',
             archival.first_failure.isoformat()
             if archival.first_failure else None),
            ('last_updated',
             archival.updated.isoformat() if archival.updated else None),
            ('last_success',
             archival.last_success.isoformat()
             if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
        ))
        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = (
        model.Session.query(model.Package)
        .filter(model.Package.owner_org.in_(org_ids))
        .filter_by(state='active')
        .count()
    )
    resources_q = model.Session.query(model.Resource)\
        .filter_by(state='active')
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        # On CKAN <= 2.2 the Resource -> Package join goes via ResourceGroup
        resources_q = resources_q.join(model.ResourceGroup)
    # filter_by() after the join applies to the joined Package
    num_resources = (
        resources_q
        .join(model.Package)
        .filter(model.Package.owner_org.in_(org_ids))
        .filter_by(state='active')
        .count()
    )

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages,
                                                  num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources,
                                                   num_resources),
            'table': results}