def categorize(options, test=False):
    """Run the theme categorizer over datasets and optionally save results.

    :param options: parsed command options; this function reads ``dataset``,
        ``publisher``, ``uncategorized``, ``limit`` and ``write``.
    :param test: when truthy, ``get_packages`` is asked for ``theme=True``,
        otherwise ``theme=False``. Ignored when ``options.dataset`` is set.

    Themes are only queued for writing when ``options.write`` is set, the
    dataset has no primary theme yet and the categorizer suggested something.
    """
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        # A single named dataset was requested.
        dataset = model.Package.get(options.dataset)
        assert dataset
        datasets = [dataset]
    else:
        datasets = get_packages(publisher=options.publisher,
                                theme=bool(test),
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    pending = {}  # dataset name -> suggested themes
    for dataset in datasets:
        print('Dataset: %s' % dataset.name)
        suggested = categorize_package(dataset, stats)
        if not options.write:
            continue
        if dataset.extras.get(PRIMARY_THEME) or not suggested:
            # Already has a primary theme, or nothing was suggested.
            continue
        pending[dataset.name] = suggested

    print('Categorize summary:')
    print(stats.report())

    if options.write:
        write_themes(pending)
def categorize(options, test=False):
    """Suggest themes for datasets, print a summary and maybe write them.

    ``options`` supplies ``dataset``, ``publisher``, ``uncategorized``,
    ``limit`` and ``write``. ``test`` only affects the ``theme`` argument
    passed to ``get_packages`` (True when ``test`` is truthy, else False).
    """
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(
            publisher=options.publisher,
            theme=True if test else False,
            uncategorized=options.uncategorized,
            limit=options.limit)
    else:
        single = model.Package.get(options.dataset)
        assert single
        packages = [single]

    # Maps package name to the themes that should be written for it.
    themes_by_name = {}

    for package in packages:
        print('Dataset: %s' % package.name)
        found = categorize_package(package, stats)
        # Only packages lacking a primary theme are candidates for writing.
        should_store = (options.write
                        and not package.extras.get(PRIMARY_THEME)
                        and found)
        if should_store:
            themes_by_name[package.name] = found

    print('Categorize summary:')
    print(stats.report())

    if options.write:
        write_themes(themes_by_name)
def recategorize(pkg):
    """Recompute the themes of ``pkg`` and store them in its extras.

    The first suggested theme becomes the primary theme and the second, if
    any, the secondary themes value. When nothing is suggested for a slot,
    an existing extra is blanked ('' / '[]') but a missing one is not
    created.
    """
    suggested = categorize_package(pkg, stats_recategorize)
    print('Recategorize: %s' % suggested)

    # Primary theme
    if suggested:
        pkg.extras[PRIMARY_THEME] = suggested[0]
    else:
        if PRIMARY_THEME in pkg.extras:
            pkg.extras[PRIMARY_THEME] = ''

    # Secondary themes (stored as a JSON-style list string)
    has_secondary = len(suggested) > 1
    if has_secondary:
        pkg.extras[SECONDARY_THEMES] = '["%s"]' % suggested[1]
    else:
        if SECONDARY_THEMES in pkg.extras:
            pkg.extras[SECONDARY_THEMES] = '[]'
def recategorize(pkg):
    """Re-run theme categorization for ``pkg`` and update its theme extras.

    Sets the primary theme to the first result and the secondary themes to a
    one-element list string holding the second result. Extras that already
    exist but have no new value are reset to '' and '[]' respectively.
    """
    themes = categorize_package(pkg, stats_recategorize)
    print('Recategorize: %s' % themes)

    if not themes:
        # Nothing suggested: only blank out a primary theme that exists.
        if PRIMARY_THEME in pkg.extras:
            pkg.extras[PRIMARY_THEME] = ''
    else:
        pkg.extras[PRIMARY_THEME] = themes[0]

    if len(themes) <= 1:
        # No second suggestion: only reset secondary themes that exist.
        if SECONDARY_THEMES in pkg.extras:
            pkg.extras[SECONDARY_THEMES] = '[]'
    else:
        pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1]
def recategorize(options):
    """Recompute themes for datasets and report/write the ones that change.

    :param options: parsed command options; this function reads ``dataset``,
        ``publisher``, ``uncategorized``, ``limit``, ``theme`` (optional
        comma-separated list of themes of interest) and ``write``.

    Each dataset is counted under exactly one outcome in the stats: theme
    cannot be decided, theme not in the filter, theme unchanged, or
    recategorized. Only recategorized datasets are written, and only when
    ``options.write`` is set.
    """
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        dataset = model.Package.get(options.dataset)
        assert dataset
        datasets = [dataset]
    else:
        datasets = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    known_themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for wanted in theme_filter:
            assert wanted in known_themes.data, '"%s" not in %r' % (
                wanted, known_themes.data.keys())
    else:
        # No filter given: every known theme is of interest.
        theme_filter = known_themes.data

    pending = {}  # dataset name -> new themes

    for dataset in datasets:
        print('Dataset: %s' % dataset.name)
        suggested = categorize_package(dataset)
        current = dataset.extras.get(PRIMARY_THEME)
        identity = '%s (%s)' % (dataset.name, current)

        changed = False
        if not suggested:
            outcome = 'Cannot decide theme'
        elif suggested[0] not in theme_filter:
            outcome = 'Not interested in theme'
        elif current == suggested[0]:
            outcome = 'Theme unchanged %s' % suggested[0]
        else:
            outcome = 'Recategorized to %s' % suggested[0]
            changed = True
        print(stats.add(outcome, identity))

        if changed and options.write:
            pending[dataset.name] = suggested

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(pending)
def recategorize(options):
    """Re-categorize datasets, restricted to an optional set of themes.

    Reads ``dataset``, ``publisher``, ``uncategorized``, ``limit``,
    ``theme`` and ``write`` from ``options``. A dataset's new themes are
    queued for writing only when its top suggested theme is in the filter
    and differs from its current primary theme.
    """
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(
            publisher=options.publisher,
            theme=None,
            uncategorized=options.uncategorized,
            limit=options.limit)
    else:
        only = model.Package.get(options.dataset)
        assert only
        packages = [only]

    # Work out which themes we are interested in setting on packages.
    registry = Themes.instance()
    if not options.theme:
        theme_filter = registry.data
    else:
        theme_filter = set(options.theme.split(','))
        for name in theme_filter:
            assert name in registry.data, '"%s" not in %r' % (
                name, registry.data.keys())

    to_write = {}  # package name -> themes

    for package in packages:
        print('Dataset: %s' % package.name)
        new_themes = categorize_package(package)
        old_primary = package.extras.get(PRIMARY_THEME)
        label = '%s (%s)' % (package.name, old_primary)

        if not new_themes:
            print(stats.add('Cannot decide theme', label))
            continue

        new_primary = new_themes[0]
        if new_primary not in theme_filter:
            print(stats.add('Not interested in theme', label))
            continue
        if old_primary == new_primary:
            print(stats.add('Theme unchanged %s' % new_primary, label))
            continue

        print(stats.add('Recategorized to %s' % new_primary, label))
        if options.write:
            to_write[package.name] = new_themes

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(to_write)
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)

    :param harvest_object: its ``content`` is parsed as inventory XML; its
        ``guid`` and ``job.source.publisher_id`` are also read.
    :param package_dict_defaults: object whose ``merge()`` combines the
        defaults with the harvested values.
    :param source_config: not used by this method.
    :param existing_dataset: falsy for a new dataset; otherwise its
        ``resources`` (``url``/``id``) are used to keep resource ids stable.
    :returns: the package dict, with ``extras`` converted by
        ``self.extras_from_dict``.
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))

    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={
            self.IDENTIFIER_KEY: inv_dataset['identifier'],
            'harvest_source_reference': harvest_object.guid,
        })

    # License
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for licence in register.values():
                if licence.url == rights:
                    pkg['license_id'] = licence.id
                    break
            else:
                # No registered licence has this URL: just save the rights
                # value as it is. (Previously the whole license register
                # object was stored and logged here by mistake.)
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # Map of url -> id so a re-harvested resource keeps its existing id.
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Unknown mime type: fall back to the raw value.
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {
            'url': inv_resource['url'],
            'format': format_,
            'description': description,
            'resource_type': resource_type,
            'schema-url': schema_url,
            'schema-type': schema_type,
        }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(
            self.munge_title_to_name(
                '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        # '> 1' rather than '== 2' so the second theme is still recorded
        # should the categorizer ever return more than two themes.
        if len(themes) > 1:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def get_package_dict(self, harvest_object, package_dict_defaults, source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)

    In this implementation harvest_object.content is parsed as inventory
    XML, and harvest_object.guid and harvest_object.job.source.publisher_id
    are also read. existing_dataset may be falsy; otherwise its resources
    are used to reuse resource ids by URL. source_config is not used in the
    visible code.
    '''
    # Function-scope imports. HOExtra and HarvestGatherError are not used in
    # the visible part of this function.
    import ckanext.dgu.lib.theme as dgutheme
    from ckan.lib.helpers import resource_formats
    from ckan import model
    from ckanext.harvest.model import (HarvestObjectExtra as HOExtra, HarvestGatherError)
    res_formats = resource_formats()

    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))

    pkg = dict(title=inv_dataset['title'], notes=inv_dataset['description'], state='active' if inv_dataset['active'] else 'deleted', resources=[], extras={ self.IDENTIFIER_KEY: inv_dataset['identifier'], 'harvest_source_reference': harvest_object.guid })
    # License
    rights = inv_dataset.get('rights')
    if rights:
        # The helper returns a pair: the license id plus a second value that
        # is stored in the 'licence' extra when it is truthy.
        license_id, licence = \
            dgu_helpers.get_licence_fields_from_free_text(rights)
        pkg['license_id'] = license_id
        if licence:
            pkg['extras']['licence'] = licence
            # NOTE(review): the nesting of this log call under
            # 'if licence' is assumed - confirm against the original file.
            log.info('Custom licence %r', rights)
    else:
        pkg['license_id'] = ''
    # Resources
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # url -> id of the resources already on the dataset, so a re-harvested
    # resource with the same URL keeps its id.
    existing_resource_urls = dict((r.url, r.id) for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        # Look the mime type up in lower case with surrounding whitespace
        # removed.
        format_ = res_formats.get(inv_resource['mimetype'].lower().strip())
        if format_:
            # NOTE(review): element 1 of a resource_formats() entry is
            # presumably the display name - confirm.
            format_ = format_[1]
        else:
            # Unknown mime type: keep the raw value.
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = { 'url': inv_resource['url'], 'format': format_, 'description': description, 'resource_type': resource_type, 'schema-url': schema_url, 'schema-type': schema_type, }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)
    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''
    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self._gen_new_name('%s %s' % (pkg['title'], publisher_abbrev))
    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    try:
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
    except ImportError, e:
        # Categorization is best-effort here: an ImportError is logged and
        # the package is left without themes.
        log.debug('Theme cannot be given: %s', e)
        themes = []
    # NOTE(review): the visible definition ends here - 'themes' is computed
    # but never applied to pkg and nothing is returned. The remainder of the
    # function appears to be missing; confirm against the original file.
def test_with_secondary_theme(self):
    """fish_and_spend_pkg must categorize to a primary and a secondary theme."""
    expected = ["Environment", "Government Spending"]
    assert_equal(categorize_package(fish_and_spend_pkg), expected)
def test_basic(self):
    """fish_pkg must categorize to the single theme 'Environment'."""
    expected = ["Environment"]
    assert_equal(categorize_package(fish_pkg), expected)
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)

    :param harvest_object: its ``content`` is parsed as inventory XML; its
        ``guid`` and ``job.source.publisher_id`` are also read.
    :param package_dict_defaults: object whose ``merge()`` combines the
        defaults with the harvested values.
    :param source_config: not used by this method.
    :param existing_dataset: falsy for a new dataset; otherwise its
        ``resources`` (``url``/``id``) are used to keep resource ids stable.
    :returns: the package dict, with ``extras`` converted by
        ``self.extras_from_dict``.
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content)
    )

    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid
                }
    )

    # License
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for licence in register.values():
                if licence.url == rights:
                    pkg['license_id'] = licence.id
                    break
            else:
                # The loop found no licence with this URL, so just save the
                # rights value as it is. The register object itself used to
                # be assigned and logged here, which was a bug.
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # url -> id lookup so that resources seen before keep their id.
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Mime type not known to Formats: keep the raw value.
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(self.munge_title_to_name(
            '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        # Use '> 1' (not '== 2') so a second theme is never silently dropped
        # if more than two themes are returned.
        if len(themes) > 1:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def test_with_secondary_theme(self):
    # Categorizing fish_and_spend_pkg should give both themes, in this order.
    assert_equal(
        categorize_package(fish_and_spend_pkg),
        ['Environment', 'Government Spending'],
    )
def test_basic(self):
    # Categorizing fish_pkg should give exactly one theme.
    assert_equal(
        categorize_package(fish_pkg),
        ['Environment'],
    )
def record_2_package(self, item):
    """Convert one ONS hub record into a package dict.

    :param item: dict for a single record. Keys read here: 'title',
        'description', 'guid', 'pubDate', 'hub:source-agency',
        'hub:designation', 'hub:language', 'hub:altTitle', 'hub:theme',
        'hub:coverage', 'hub:geographic-breakdown', 'hub:ipsv',
        'hub:keywords', 'hub:nscl' and optionally 'link'.
    :returns: package dict with name, title, notes, tags, resources, extras
        and related fields filled in.
    :raises RowParseError: if the GUID lacks the expected prefix, or still
        contains 'http' after the prefix has been removed.
    """
    assert isinstance(item, dict)

    # process item
    title, release = self._split_title(item['title'])
    munged_title = schema.name_munge(title)
    publisher_id = self._source_to_publisher(item['hub:source-agency'])
    if not publisher_id:
        log.warn('Did not find publisher for source-agency: %s',
                 item['hub:source-agency'])

    # Resources
    guid = item['guid'] or None
    if guid:
        if not guid.startswith(guid_prefix):
            raise RowParseError('GUID did not start with prefix %r: %r' % (guid_prefix, guid))
        guid = guid[len(guid_prefix):]
        if 'http' in guid:
            raise RowParseError('GUID de-prefixed should not have \'http\' in it still: %r' % (guid))
    download_url = item.get('link', None)

    notes_list = []
    if item['description']:
        notes_list.append(item['description'])
    for column, name in [('hub:source-agency', 'Source agency'),
                         ('hub:designation', 'Designation'),
                         ('hub:language', 'Language'),
                         ('hub:altTitle', 'Alternative title'),
                         ]:
        if item[column]:
            notes_list.append('%s: %s' % (name, item[column]))
    notes = '\n\n'.join(notes_list)

    extras = {
        'geographic_coverage': u'',
        'external_reference': u'',
        'temporal_granularity': u'',
        'date_updated': u'',
        'precision': u'',
        'geographic_granularity': u'',
        'temporal_coverage-from': u'',
        'temporal_coverage-to': u'',
        'national_statistic': u'',
        'update_frequency': u'',
        'date_released': u'',
        'categories': u'',
        'series': u'',
    }

    # A record without a pubDate keeps empty date values. Previously the
    # empty-string default was used as if it were a parsed date, which
    # raised AttributeError for such records.
    publish_date = u''
    if item['pubDate']:
        date_released = date.parse(item["pubDate"])
        if date_released.qualifier:
            log.warn('Could not read format of publication (release) date: %r',
                     item["pubDate"])
        extras['date_released'] = date_released.isoformat()
        publish_date = date_released.as_datetime().strftime('%Y-%m-%d')

    extras['categories'] = item['hub:theme']
    extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
    # The original compared against 'National Statistics' twice; a single
    # comparison is equivalent.
    extras['national_statistic'] = \
        'yes' if item['hub:designation'] == 'National Statistics' else 'no'
    extras['geographic_granularity'] = item['hub:geographic-breakdown']
    extras['external_reference'] = u'ONSHUB'
    extras['series'] = title if release else u''

    # Guess the update frequency from words in the title/description. The
    # searched text does not change per option, so build it once.
    item_info = ('%s %s' % (item['title'], item['description'])).lower()
    for update_frequency_suggestion in schema.update_frequency_options:
        if update_frequency_suggestion in item_info:
            extras['update_frequency'] = update_frequency_suggestion
        elif update_frequency_suggestion.endswith('ly'):
            # Also match the stem without the 'ly' suffix (e.g. 'month' for
            # 'monthly'). Slice the suffix off: rstrip('ly') would strip
            # every trailing 'l'/'y' character ('annually' -> 'annua').
            if update_frequency_suggestion[:-2] in item_info:
                extras['update_frequency'] = update_frequency_suggestion
    extras['import_source'] = 'ONS-%s' % self._current_filename

    resources = [{
        'url': download_url,
        'description': release,
        'hub-id': guid,
        'publish-date': publish_date,
    }]

    # update package
    pkg_dict = {
        'name': munged_title,
        'title': title,
        'version': None,
        'url': None,
        'maintainer': None,
        'maintainer_email': None,
        'notes': notes,
        'license_id': self._crown_license_id,
        'tags': [],  # post-filled
        'owner_org': publisher_id,
        'resources': resources,
        'extras': extras,
    }

    # Tags: suggested ones plus the munged keywords from the record.
    tags = schema.TagSuggester.suggest_tags(pkg_dict)
    for keyword in item['hub:ipsv'].split(';') + \
            self._split_keywords(item['hub:keywords']) + \
            item['hub:nscl'].split(';'):
        tag = schema.tag_munge(keyword)
        if tag and len(tag) > 1:
            tags.add(tag)
    pkg_dict['tags'] = sorted(tags)

    themes = categorize_package(pkg_dict)
    log.debug('%s given themes: %r', munged_title, themes)
    if themes:
        pkg_dict['extras'][PRIMARY_THEME] = themes[0]
        # '> 1' rather than '== 2' so the second theme is kept even if more
        # than two themes are returned.
        if len(themes) > 1:
            pkg_dict['extras'][SECONDARY_THEMES] = '["%s"]' % themes[1]

    return pkg_dict