def categorize(options, test=False):
    '''Run the theme categoriser over packages and optionally save results.

    If options.dataset is set, only that package is processed; otherwise
    the packages come from get_packages(), filtered by options.publisher,
    options.uncategorized and options.limit (with theme=True when `test`
    is truthy, else theme=False).

    When options.write is set, themes are written only for packages that
    have no primary theme extra yet and for which themes were suggested.
    A summary report is printed in all cases.
    '''
    from ckanext.dgu.lib.theme import categorize_package2, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(publisher=options.publisher,
                                theme=bool(test),
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]

    # Maps package name -> suggested themes that should be saved
    themes_to_write = {}

    for pkg in packages:
        print('Dataset: %s' % pkg.name)
        themes = categorize_package2(pkg, stats)
        if not options.write:
            continue
        if pkg.extras.get(PRIMARY_THEME):
            # Already has a primary theme - leave it alone
            continue
        if themes:
            themes_to_write[pkg.name] = themes

    print('Categorize summary:')
    print(stats.report())

    if options.write:
        write_themes(themes_to_write)
def recategorize(options):
    '''Re-run the theme categoriser and report packages whose theme changes.

    If options.dataset is set, only that package is processed; otherwise
    the packages come from get_packages(). options.theme may be a
    comma-separated list of theme names; when given, only packages whose
    newly suggested top theme is in that list are considered.

    Each package is recorded in the stats under one of: 'Cannot decide
    theme', 'Not interested in theme', 'Theme unchanged <name>' or
    'Recategorized to <name>'. When options.write is set, the
    recategorized packages are passed to write_themes().
    '''
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]

    # Work out which themes we are interested in setting on packages
    known_themes = Themes.instance()
    if not options.theme:
        theme_filter = known_themes.data
    else:
        theme_filter = set(options.theme.split(','))
        for theme_name in theme_filter:
            assert theme_name in known_themes.data, '"%s" not in %r' % (
                theme_name, known_themes.data.keys())

    # Maps package name -> suggested themes that should be saved
    themes_to_write = {}

    for pkg in packages:
        print('Dataset: %s' % pkg.name)
        suggested = categorize_package2(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)

        if not suggested:
            print(stats.add('Cannot decide theme', pkg_identity))
            continue

        top_theme = suggested[0]['name']
        if top_theme not in theme_filter:
            print(stats.add('Not interested in theme', pkg_identity))
        elif existing_theme == top_theme:
            print(stats.add('Theme unchanged %s' % top_theme, pkg_identity))
        else:
            print(stats.add('Recategorized to %s' % top_theme, pkg_identity))
            if options.write:
                themes_to_write[pkg.name] = suggested

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(themes_to_write)
def suggest_themes(context, data_dict):
    '''Suggest a primary theme and secondary themes for a dataset.

    The dataset can be given in one of two ways:

    * data_dict['id'] - the package is fetched with
      context['model'].Package.get() and passed to the categoriser.
    * otherwise, the 'name', 'title', 'notes' and comma-separated 'tags'
      values of data_dict are assembled into a package-like dictionary
      which is passed to the categoriser instead.

    Returns a dict with keys 'primary-theme' (the first suggested theme,
    or {} when there is none) and 'secondary-theme' (the remaining
    suggestions, or []).
    '''
    from ckanext.dgu.lib.theme import categorize_package2

    # TODO: Make this only available to logged in publishers

    model = context['model']
    pkg_id = data_dict.get('id')

    if pkg_id:
        themes = categorize_package2(model.Package.get(pkg_id))
    else:
        pkg_dict = {}
        pkg_dict['name'] = data_dict.get('name')
        pkg_dict['title'] = data_dict.get('title')
        pkg_dict['notes'] = data_dict.get('notes')
        pkg_dict['tags'] = list(data_dict.get('tags', '').split(','))
        # A single blank extra is supplied alongside the other fields
        pkg_dict['extras'] = [{'key': '', 'value': ''}]
        themes = categorize_package2(pkg_dict)

    if len(themes) > 0:
        return {'primary-theme': themes[0],
                'secondary-theme': themes[1:]}
    return {'primary-theme': {}, 'secondary-theme': []}
def test_basic(self):
    # The fish package should be categorised with Environment as its
    # top theme, with the listed title/description matches as reasons.
    expected_reasons = [
        u'"fish" matched title',
        u'"river" matched title',
        u'"fish" matched description',
    ]

    themes = categorize_package2(fish_pkg)
    assert_equal(type(themes), list)

    top = themes[0]
    assert_equal(type(top), dict)
    assert_equal(top["name"], "Environment")
    # be lenient as the algorithm may change
    assert top["score"] > 3, top.get("score")
    assert top["reasons"], top.get("reasons")
    assert_equal(expected_reasons, top["reasons"])
def test_basic(self):
    # Categorising the fish package must yield a list of theme dicts
    # whose first entry is Environment, backed by the expected reasons.
    result = categorize_package2(fish_pkg)
    assert_equal(type(result), list)

    first_theme = result[0]
    assert_equal(type(first_theme), dict)
    assert_equal(first_theme['name'], 'Environment')

    # be lenient as the algorithm may change
    assert first_theme['score'] > 3, first_theme.get('score')
    assert first_theme['reasons'], first_theme.get('reasons')

    wanted = [u'"fish" matched title',
              u'"river" matched title',
              u'"fish" matched description']
    assert_equal(wanted, first_theme['reasons'])
def suggest_themes(context, data_dict):
    '''Return suggested themes for a dataset, or for its component parts.

    If data_dict contains an 'id', that package is loaded through
    context['model'] and categorised directly. Otherwise a package-like
    dictionary is built from the 'name', 'title', 'notes' and
    comma-separated 'tags' entries of data_dict and categorised instead.

    The result is a dict: 'primary-theme' holds the first suggestion
    ({} if nothing was suggested) and 'secondary-theme' holds the rest
    ([] if nothing was suggested).
    '''
    from ckanext.dgu.lib.theme import categorize_package2

    # TODO: Make this only available to logged in publishers

    model = context['model']
    package_ref = data_dict.get('id')

    if not package_ref:
        # No id supplied: describe the dataset with a plain dictionary
        pkg_dict = dict(
            name=data_dict.get('name'),
            title=data_dict.get('title'),
            notes=data_dict.get('notes'),
            tags=list(data_dict.get('tags', '').split(',')),
            extras=[dict(key='', value='')],
        )
        themes = categorize_package2(pkg_dict)
    else:
        pkg = model.Package.get(package_ref)
        themes = categorize_package2(pkg)

    primary = {}
    secondary = []
    if len(themes) >= 1:
        primary = themes[0]
        secondary = themes[1:]
    return {'primary-theme': primary, 'secondary-theme': secondary}
def test_with_secondary_theme(self):
    # A package about both fish and spending should get Environment as
    # the first theme and Government Spending as the second.
    environment_reasons = [
        u'"fish" matched title',
        u'"river" matched title',
        u'"fish" matched description',
    ]
    spending_reasons = [
        u'"spend" matched title',
        u'"transact" matched description',
        u'"spend" matched description',
    ]

    themes = categorize_package2(fish_and_spend_pkg)
    assert_equal(type(themes), list)

    first = themes[0]
    assert_equal(type(first), dict)
    assert_equal(first["name"], "Environment")
    # be lenient as the algorithm may change
    assert first["score"] > 3, first.get("score")
    assert first["reasons"], first.get("reasons")
    assert_equal(environment_reasons, first["reasons"])

    second = themes[1]
    assert_equal(second["name"], "Government Spending")
    assert_equal(spending_reasons, second["reasons"])
def test_with_secondary_theme(self):
    # Check both the first and the second suggested theme for a package
    # that mentions fish as well as spending.
    suggestions = categorize_package2(fish_and_spend_pkg)
    assert_equal(type(suggestions), list)

    # First theme: Environment
    theme = suggestions[0]
    assert_equal(type(theme), dict)
    assert_equal(theme['name'], 'Environment')
    # be lenient as the algorithm may change
    assert theme['score'] > 3, theme.get('score')
    assert theme['reasons'], theme.get('reasons')
    assert_equal(
        [u'"fish" matched title',
         u'"river" matched title',
         u'"fish" matched description'],
        theme['reasons'])

    # Second theme: Government Spending
    theme = suggestions[1]
    assert_equal(theme['name'], 'Government Spending')
    assert_equal(
        [u'"spend" matched title',
         u'"transact" matched description',
         u'"spend" matched description'],
        theme['reasons'])
def test_stem_exception(self):
    # The employer package is expected to be given exactly one theme.
    expected_names = set(["Business & Economy"])
    themes = categorize_package2(employer_pkg)
    theme_names = []
    for entry in themes:
        theme_names.append(entry["name"])
    assert_equal(expected_names, set(theme_names))
def test_topic_in_two_categories(self):
    # The death package is expected to be given both of these themes
    # (compared as sets, so their order does not matter).
    expected_names = set(["Society", "Health"])
    themes = categorize_package2(death_pkg)
    theme_names = []
    for entry in themes:
        theme_names.append(entry["name"])
    assert_equal(expected_names, set(theme_names))
def test_topic_in_two_categories(self):
    # Order of the suggestions is not checked: only the set of theme
    # names returned for the death package.
    suggestions = categorize_package2(death_pkg)
    found = set(item['name'] for item in suggestions)
    wanted = set(['Society', 'Health'])
    assert_equal(wanted, found)
def test_stem_exception(self):
    # Only the set of theme names returned for the employer package is
    # checked; it must contain the single expected theme.
    suggestions = categorize_package2(employer_pkg)
    found = set(item['name'] for item in suggestions)
    wanted = set(['Business & Economy'])
    assert_equal(wanted, found)