Example No. 1
def _create_category_graph() -> nx.DiGraph:
    """Build the directed graph of Wikipedia categories from the SKOS data and the extracted parent categories."""
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_TYPE))
    skos_edges = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_BROADER)
    skos_edges = [(p, c) for c, parents in skos_edges.items() for p in parents if p != c]
    wiki_category_edges = [(p, c) for c, ps in wikipedia.extract_parent_categories().items() for p in ps if p != c]
    graph = nx.DiGraph(incoming_graph_data=skos_edges + wiki_category_edges)
    graph.add_nodes_from(skos_nodes)

    # identify maintenance categories
    invalid_parent_categories = [
        'Hidden categories', 'Tracking categories', 'Disambiguation categories', 'Non-empty disambiguation categories',
        'All redirect categories', 'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories'
    ]
    invalid_categories = {c for ipc in invalid_parent_categories for c in graph.successors(cat_util.name2category(ipc))}
    # identify any remaining invalid categories (maintenance categories, etc.) using indicator tokens
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational'
    }
    for cat in graph:
        cat_tokens = {t.lower() for t in cat_util.remove_category_prefix(cat).split('_')}
        if cat.lower().endswith(ignored_category_endings) or cat_tokens.intersection(maintenance_category_indicators):
            invalid_categories.add(cat)
    invalid_categories.update(set(graph.nodes).difference(skos_nodes))  # only keep categories mentioned in skos
    invalid_categories.discard(utils.get_config('category.root_category'))  # make sure to keep root node
    graph.remove_nodes_from(invalid_categories)
    return graph
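
The edges above are added as (parent, child) pairs, so a category's parents are its predecessors in the resulting graph and its subcategories are its successors. A minimal usage sketch, assuming the module is set up as in this example ('Science' is only an illustrative category name):

graph = _create_category_graph()
science = cat_util.name2category('Science')  # illustrative category name
subcategories = set(graph.successors(science))      # edges point from parent to child
parent_categories = set(graph.predecessors(science))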
Example No. 2
def get_topics(category: str) -> set:
    """Return the topics for the given category."""
    global __TOPICS__
    if '__TOPICS__' not in globals():
        __TOPICS__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.topical_concepts')], rdf_util.PREDICATE_SUBJECT)

    return __TOPICS__[category]
Example No. 3
def get_disjoint_types(dbp_type: str) -> set:
    """Return all types that are disjoint with `dbp_type` (excluding the wrong disjointness Agent<->Place)."""
    global __DISJOINT_TYPE_MAPPING__
    if '__DISJOINT_TYPE_MAPPING__' not in globals():
        __DISJOINT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')],
            rdf_util.PREDICATE_DISJOINT_WITH,
            reflexive=True)
        # add/remove custom axioms
        __DISJOINT_TYPE_MAPPING__ = defaultdict(
            set, {
                k: {
                    v
                    for v in values
                    if {k, v} not in REMOVED_DISJOINTNESS_AXIOMS
                }
                for k, values in __DISJOINT_TYPE_MAPPING__.items()
            })
        for a, b in ADDED_DISJOINTNESS_AXIOMS:
            __DISJOINT_TYPE_MAPPING__[a].add(b)
            __DISJOINT_TYPE_MAPPING__[b].add(a)

        # propagate disjointness to subtypes: map each type to the transitive subtypes of its disjoint types
        __DISJOINT_TYPE_MAPPING__ = defaultdict(
            set, {
                t: {
                    st
                    for dt in disjoint_types
                    for st in get_transitive_subtypes(dt)
                }
                for t, disjoint_types in __DISJOINT_TYPE_MAPPING__.items()
            })

    return __DISJOINT_TYPE_MAPPING__[dbp_type]
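
A brief usage sketch, assuming the mapping has been loaded as above; dbo:Person and dbo:Place are real DBpedia ontology classes used here only for illustration:

person = 'http://dbpedia.org/ontology/Person'
place = 'http://dbpedia.org/ontology/Place'
# after the propagation step, disjointness also covers transitive subtypes
if place in get_disjoint_types(person):
    print('A resource typed as Person can never also be typed as Place')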
Example No. 4
def get_equivalent_types(dbp_type: str) -> set:
    """Return the set of equivalent types to the given type (including itself)."""
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
        # remove external types from equivalent mappings as they are prone to errors
        __EQUIVALENT_TYPE_MAPPING__ = defaultdict(set, {t: {et for et in __EQUIVALENT_TYPE_MAPPING__[t] if dbp_util.is_dbp_type(et) or et == rdf_util.CLASS_OWL_THING} for t in __EQUIVALENT_TYPE_MAPPING__ if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING})
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
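
Because the result is built as {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type], the returned set always contains the queried type itself, even when the taxonomy lists no equivalents for it:

t = 'http://dbpedia.org/ontology/Person'  # example DBpedia type
assert t in get_equivalent_types(t)  # a type is always equivalent to itself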
Example No. 5
def get_resources(category: str) -> set:
    """Return all resources of the given category."""
    global __CATEGORY_RESOURCES__
    if '__CATEGORY_RESOURCES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT, reverse_key=True)
        __CATEGORY_RESOURCES__ = utils.load_or_create_cache('dbpedia_category_resources', initializer)

    return __CATEGORY_RESOURCES__[category]
Example No. 6
def get_resource_categories(dbp_resource: str) -> set:
    """Return all categories the given resource is contained in."""
    global __RESOURCE_CATEGORIES__
    if '__RESOURCE_CATEGORIES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT)
        __RESOURCE_CATEGORIES__ = utils.load_or_create_cache('dbpedia_resource_categories', initializer)

    return __RESOURCE_CATEGORIES__[dbp_resource]
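
This mapping and the one in Example No. 5 are built from the same category-articles file, once keyed by resource and once keyed by category (via reverse_key=True), so the two lookups are inverse views of each other. A small consistency sketch with dbr:Berlin as an example resource:

res = 'http://dbpedia.org/resource/Berlin'  # example resource URI
for cat in get_resource_categories(res):
    assert res in get_resources(cat)  # membership is consistent in both directions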
Example No. 7
def get_equivalent_types(dbp_type: str) -> set:
    """Return all types equivalent to the given type (including the type itself)."""
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')],
            rdf_util.PREDICATE_EQUIVALENT_CLASS,
            reflexive=True)

    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
Example No. 8
def get_children(category: str) -> set:
    """Return all direct child categories of the given category."""
    global __CHILDREN__
    if '__CHILDREN__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.categories')],
            rdf_util.PREDICATE_BROADER,
            reverse_key=True)
        __CHILDREN__ = util.load_or_create_cache('dbpedia_category_children',
                                                 initializer)

    return __CHILDREN__[category].difference({category})
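
Reading the broader-relation with reverse_key=True makes the parent category the key and its children the values; the final difference({category}) guards against self-loops in the source data. Usage could look like this (the category URI is only an example):

cat = 'http://dbpedia.org/resource/Category:Physics'  # example category URI
subcategories = get_children(cat)  # direct children only; `cat` itself is filtered out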
Example No. 9
def _get_resource_type_mapping() -> dict:
    """Return the mapping from DBpedia resources to their (transitive) types."""
    global __RESOURCE_TYPE_MAPPING__
    if '__RESOURCE_TYPE_MAPPING__' not in globals():
        type_files = [
            util.get_data_file('files.dbpedia.instance_types'),
            util.get_data_file('files.dbpedia.transitive_instance_types'),
        ]
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(
            type_files, rdf_util.PREDICATE_TYPE)
        __RESOURCE_TYPE_MAPPING__ = util.load_or_create_cache(
            'dbpedia_resource_type_mapping', initializer)

    return __RESOURCE_TYPE_MAPPING__
Example No. 10
def _get_type_graph() -> nx.DiGraph:
    """Return the initialised graph of DBpedia types."""
    global __TYPE_GRAPH__
    if '__TYPE_GRAPH__' not in globals():
        subtype_mapping = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, reverse_key=True)
        # add missing types (i.e. those that do not have any subclasses)
        all_types = rdf_util.create_set_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_CLASS)
        subtype_mapping.update({et: set() for t in all_types for et in get_equivalent_types(t) if et not in subtype_mapping})
        # complete the subtypes of each type with the subtypes of its equivalent types
        subtype_mapping = {t: {est for et in get_equivalent_types(t) for st in subtype_mapping[et] for est in get_equivalent_types(st)} for t in set(subtype_mapping)}
        # remove non-dbpedia types from ontology
        subtype_mapping = {t: {st for st in sts if dbp_util.is_dbp_type(st) or st == rdf_util.CLASS_OWL_THING} for t, sts in subtype_mapping.items() if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING}
        __TYPE_GRAPH__ = nx.DiGraph(incoming_graph_data=[(t, st) for t, sts in subtype_mapping.items() for st in sts])
    return __TYPE_GRAPH__
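
The graph is built from (type, subtype) edges, so a type's direct subtypes are its successors and its supertypes are its predecessors. A short sketch of how it might be traversed:

graph = _get_type_graph()
root = rdf_util.CLASS_OWL_THING
direct_subtypes = set(graph.successors(root))  # top-level DBpedia types
all_subtypes = nx.descendants(graph, root)     # transitive subtypes via networkx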
Example No. 11
def get_equivalent_predicates(dbp_predicate: str) -> set:
    """Return all equivalent predicates of a given predicate."""
    global __EQUIVALENT_PREDICATE__
    if '__EQUIVALENT_PREDICATE__' not in globals():
        __EQUIVALENT_PREDICATE__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_PROPERTY)
    return __EQUIVALENT_PREDICATE__[dbp_predicate]
Example No. 12
def get_disambiguation_mapping() -> dict:
    """Return the mapping from disambiguation pages to the resources they disambiguate."""
    global __DISAMBIGUATIONS__
    if '__DISAMBIGUATIONS__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.disambiguations')], rdf_util.PREDICATE_DISAMBIGUATES)
        __DISAMBIGUATIONS__ = defaultdict(set, utils.load_or_create_cache('dbpedia_resource_disambiguations', initializer))
    return __DISAMBIGUATIONS__
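
Wrapping the cached result in defaultdict(set) means that looking up a page without any disambiguation triples yields an empty set instead of raising a KeyError. For instance (the page URI is only an example):

disambiguations = get_disambiguation_mapping()
page = 'http://dbpedia.org/resource/Mercury'  # example disambiguation page
targets = disambiguations[page]  # empty set if no disambiguation entries exist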