Пример #1
0
def categorize(options, test=False):
    '''Run the theme categorizer over a selection of datasets.

    The datasets are either the single one named by options.dataset, or
    whatever get_packages() returns for the publisher / uncategorized /
    limit options. Each dataset name is printed as it is processed and a
    summary from the stats object is printed at the end.

    When options.write is set, the suggested themes of every dataset that
    got a suggestion and has no existing primary theme extra are handed to
    write_themes().
    '''
    from ckanext.dgu.lib.theme import categorize_package2, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        # The "theme" argument to get_packages is just the truthiness of
        # "test".
        theme_flag = bool(test)
        packages = get_packages(publisher=options.publisher,
                                theme=theme_flag,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        single_pkg = model.Package.get(options.dataset)
        assert single_pkg
        packages = [single_pkg]

    pending = {}  # pkg_name:themes

    for package in packages:
        print('Dataset: %s' % package.name)
        suggested = categorize_package2(package, stats)
        if not options.write:
            continue
        if package.extras.get(PRIMARY_THEME):
            # already has a primary theme - leave it alone
            continue
        if suggested:
            pending[package.name] = suggested

    print('Categorize summary:')
    print(stats.report())

    if options.write:
        write_themes(pending)
Пример #2
0
def recategorize(options):
    '''Re-run the theme categorizer and report datasets whose theme changes.

    The datasets are either the single one named by options.dataset, or
    whatever get_packages() returns for the publisher / uncategorized /
    limit options. options.theme may hold a comma-separated list of theme
    names; only datasets whose top suggested theme is in that list are
    considered (all known themes when it is not given).

    Each outcome is recorded in the stats object and printed. When
    options.write is set, the suggestions for the recategorized datasets
    are handed to write_themes().
    '''
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if not options.dataset:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        single_pkg = model.Package.get(options.dataset)
        assert single_pkg
        packages = [single_pkg]

    # process the list of themes we are interested in setting on packages
    known_themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for wanted in theme_filter:
            assert wanted in known_themes.data, '"%s" not in %r' % (
                wanted, known_themes.data.keys())
    else:
        theme_filter = known_themes.data

    pending = {}  # pkg_name:themes

    for package in packages:
        print('Dataset: %s' % package.name)
        suggestions = categorize_package2(package)
        existing_theme = package.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (package.name, existing_theme)
        # Name of the best suggestion, if there is any suggestion at all
        top_theme = suggestions[0]['name'] if suggestions else None

        if not suggestions:
            print(stats.add('Cannot decide theme', pkg_identity))
        elif top_theme not in theme_filter:
            print(stats.add('Not interested in theme', pkg_identity))
        elif existing_theme == top_theme:
            print(stats.add('Theme unchanged %s' % top_theme,
                            pkg_identity))
        else:
            print(stats.add('Recategorized to %s' % top_theme,
                            pkg_identity))
            if options.write:
                pending[package.name] = suggestions

    print('Recategorize summary:')
    print(stats.report())

    if options.write:
        write_themes(pending)
Пример #3
0
def suggest_themes(context, data_dict):
    '''Suggest themes for a dataset, or for the component parts of one.

    If data_dict contains an 'id', the Package with that id is fetched via
    context['model'] and categorized. Otherwise a package-like dictionary
    is assembled from the 'name', 'title', 'notes' and comma-separated
    'tags' values in data_dict and that is categorized instead.

    Returns a dict with two keys: 'primary-theme' is the first suggestion
    (or an empty dict if there were none) and 'secondary-theme' is the list
    of the remaining suggestions.
    '''
    from ckanext.dgu.lib.theme import categorize_package2

    # TODO: Make this only available to logged in publishers

    model = context['model']

    pkg_id = data_dict.get('id')
    if pkg_id:
        subject = model.Package.get(pkg_id)
    else:
        # No id, so describe the dataset with a dictionary built from the
        # supplied parts
        subject = {
            'name': data_dict.get('name'),
            'title': data_dict.get('title'),
            'notes': data_dict.get('notes'),
            'tags': data_dict.get('tags', '').split(','),
            'extras': [{
                'key': '',
                'value': ''
            }]
        }
    themes = categorize_package2(subject)

    return {
        'primary-theme': themes[0] if themes else {},
        'secondary-theme': themes[1:],
    }
Пример #4
0
    def test_basic(self):
        # Categorizing the fish package should give a list of theme dicts,
        # led by Environment with the reasons listed here.
        expected_reasons = [
            u'"fish" matched title',
            u'"river" matched title',
            u'"fish" matched description',
        ]

        suggestions = categorize_package2(fish_pkg)
        assert_equal(type(suggestions), list)

        primary = suggestions[0]
        assert_equal(type(primary), dict)
        assert_equal(primary["name"], "Environment")
        # be lenient as the algorithm may change
        assert primary["score"] > 3, primary.get("score")
        assert primary["reasons"], primary.get("reasons")
        assert_equal(expected_reasons, primary["reasons"])
Пример #5
0
    def test_basic(self):
        # The first suggestion for the fish package is checked for its
        # type, name, score and the reasons behind it.
        result = categorize_package2(fish_pkg)
        assert_equal(type(result), list)

        top = result[0]
        assert_equal(type(top), dict)
        assert_equal(top['name'], 'Environment')

        # be lenient as the algorithm may change
        assert top['score'] > 3, top.get('score')
        assert top['reasons'], top.get('reasons')

        wanted_reasons = [u'"fish" matched title',
                          u'"river" matched title',
                          u'"fish" matched description']
        assert_equal(wanted_reasons, top['reasons'])
Пример #6
0
def suggest_themes(context, data_dict):
    '''Suggest a primary theme and secondary themes for a dataset.

    The dataset can be given in two ways. With an 'id' in data_dict, the
    Package is looked up through context['model'] and passed to the
    categorizer. Without one, the 'name', 'title', 'notes' and
    comma-separated 'tags' values from data_dict are packed into a
    dictionary which is passed to the categorizer instead.

    The result is a dict: 'primary-theme' holds the first suggestion (an
    empty dict when there is none) and 'secondary-theme' holds a list of
    all the suggestions after the first.
    '''
    from ckanext.dgu.lib.theme import categorize_package2

    # TODO: Make this only available to logged in publishers

    model = context['model']

    dataset_id = data_dict.get('id')
    if not dataset_id:
        tag_names = data_dict.get('tags', '').split(',')
        to_categorize = {'name': data_dict.get('name'),
                         'title': data_dict.get('title'),
                         'notes': data_dict.get('notes'),
                         'tags': tag_names,
                         'extras': [{'key': '', 'value': ''}]
                         }
    else:
        to_categorize = model.Package.get(dataset_id)
    themes = categorize_package2(to_categorize)

    primary = {}
    if themes:
        primary = themes[0]

    return {'primary-theme': primary, 'secondary-theme': themes[1:]}
Пример #7
0
    def test_with_secondary_theme(self):
        # A package about both fish and spending should give Environment
        # first and Government Spending second.
        environment_reasons = [
            u'"fish" matched title',
            u'"river" matched title',
            u'"fish" matched description',
        ]
        spending_reasons = [
            u'"spend" matched title',
            u'"transact" matched description',
            u'"spend" matched description',
        ]

        suggestions = categorize_package2(fish_and_spend_pkg)
        assert_equal(type(suggestions), list)

        primary = suggestions[0]
        assert_equal(type(primary), dict)
        assert_equal(primary["name"], "Environment")
        # be lenient as the algorithm may change
        assert primary["score"] > 3, primary.get("score")
        assert primary["reasons"], primary.get("reasons")
        assert_equal(environment_reasons, primary["reasons"])

        secondary = suggestions[1]
        assert_equal(secondary["name"], "Government Spending")
        assert_equal(spending_reasons, secondary["reasons"])
Пример #8
0
    def test_with_secondary_theme(self):
        # Checks the first two suggestions for the fish-and-spend package.
        result = categorize_package2(fish_and_spend_pkg)
        assert_equal(type(result), list)

        first = result[0]
        assert_equal(type(first), dict)
        assert_equal(first['name'], 'Environment')
        # be lenient as the algorithm may change
        assert first['score'] > 3, first.get('score')
        assert first['reasons'], first.get('reasons')
        first_reasons = [u'"fish" matched title',
                         u'"river" matched title',
                         u'"fish" matched description']
        assert_equal(first_reasons, first['reasons'])

        second = result[1]
        assert_equal(second['name'], 'Government Spending')
        second_reasons = [u'"spend" matched title',
                          u'"transact" matched description',
                          u'"spend" matched description']
        assert_equal(second_reasons, second['reasons'])
Пример #9
0
 def test_stem_exception(self):
     # The employer package must be given exactly one theme name.
     suggestions = categorize_package2(employer_pkg)
     found_names = set([entry["name"] for entry in suggestions])
     expected_names = set(("Business & Economy",))
     assert_equal(expected_names, found_names)
Пример #10
0
 def test_topic_in_two_categories(self):
     # The death package must be given both of these theme names and no
     # others (order is not checked).
     suggestions = categorize_package2(death_pkg)
     found_names = set([entry["name"] for entry in suggestions])
     expected_names = set(("Society", "Health"))
     assert_equal(expected_names, found_names)
Пример #11
0
 def test_topic_in_two_categories(self):
     # Compare as sets, so the order of the suggestions does not matter.
     names_seen = set()
     for suggestion in categorize_package2(death_pkg):
         names_seen.add(suggestion['name'])
     assert_equal(set(('Society', 'Health')), names_seen)
Пример #12
0
 def test_stem_exception(self):
     # Compare as sets: the only theme name suggested for the employer
     # package should be the one below.
     names_seen = set()
     for suggestion in categorize_package2(employer_pkg):
         names_seen.add(suggestion['name'])
     assert_equal(set(('Business & Economy',)), names_seen)