def _create_category_graph() -> nx.DiGraph:
    """Build the directed category graph (parent -> child edges), pruned to valid skos categories.

    Edges come from both the DBpedia skos dump and the Wikipedia parent-category extraction;
    maintenance/administrative categories and categories not present in skos are removed.
    """
    skos_file = utils.get_data_file('files.dbpedia.category_skos')
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_TYPE))
    broader_mapping = rdf_util.create_multi_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_BROADER)
    # edges point from parent to child; self-loops are discarded
    skos_parent_edges = [(parent, child) for child, parents in broader_mapping.items() for parent in parents if parent != child]
    wiki_parent_edges = [(parent, child) for child, parents in wikipedia.extract_parent_categories().items() for parent in parents if parent != child]

    graph = nx.DiGraph(incoming_graph_data=skos_parent_edges + wiki_parent_edges)
    graph.add_nodes_from(skos_nodes)

    # identify maintenance categories
    invalid_parent_categories = [
        'Hidden categories', 'Tracking categories', 'Disambiguation categories',
        'Non-empty disambiguation categories', 'All redirect categories',
        'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories'
    ]
    invalid_categories = set()
    for parent_name in invalid_parent_categories:
        invalid_categories.update(graph.successors(cat_util.name2category(parent_name)))

    # identify any remaining invalid categories (maintenance categories etc) using indicator tokens
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational'
    }
    for category in graph:
        category_tokens = {token.lower() for token in cat_util.remove_category_prefix(category).split('_')}
        if category.lower().endswith(ignored_category_endings) or category_tokens.intersection(maintenance_category_indicators):
            invalid_categories.add(category)

    # only keep categories mentioned in skos
    invalid_categories.update(set(graph.nodes).difference(skos_nodes))
    # make sure to keep root node
    invalid_categories.discard(utils.get_config('category.root_category'))
    graph.remove_nodes_from(invalid_categories)
    return graph
def get_topics(category: str) -> set:
    """Return the topics for the given category."""
    global __TOPICS__
    if '__TOPICS__' not in globals():
        # lazily build the category -> topics mapping on first use
        source_file = utils.get_data_file('files.dbpedia.topical_concepts')
        __TOPICS__ = rdf_util.create_multi_val_dict_from_rdf([source_file], rdf_util.PREDICATE_SUBJECT)
    return __TOPICS__[category]
def get_disjoint_types(dbp_type: str) -> set:
    """Return all types that are disjoint with `dbp_type` (excluding the wrong disjointness Agent<->Place).

    The mapping is built once and cached in a module-level global:
    1) raw disjointness axioms are read from the taxonomy dump (reflexively),
    2) hand-curated axioms are removed (REMOVED_DISJOINTNESS_AXIOMS) and added
       (ADDED_DISJOINTNESS_AXIOMS, symmetrically),
    3) each disjointness set is closed over transitive subtypes, so a type is
       also disjoint with every subtype of its disjoint types.
    """
    global __DISJOINT_TYPE_MAPPING__
    if '__DISJOINT_TYPE_MAPPING__' not in globals():
        # fixed alias: use `utils` like the rest of this module (was `util`)
        __DISJOINT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_DISJOINT_WITH, reflexive=True)
        # add/remove custom axioms
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {
            k: {v for v in values if {k, v} not in REMOVED_DISJOINTNESS_AXIOMS}
            for k, values in __DISJOINT_TYPE_MAPPING__.items()
        })
        for a, b in ADDED_DISJOINTNESS_AXIOMS:
            # disjointness is symmetric, so add the axiom in both directions
            __DISJOINT_TYPE_MAPPING__[a].add(b)
            __DISJOINT_TYPE_MAPPING__[b].add(a)
        # completing the subtype of each type with the subtypes of its disjoint types
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {
            t: {st for dt in disjoint_types for st in get_transitive_subtypes(dt)}
            for t, disjoint_types in __DISJOINT_TYPE_MAPPING__.items()
        })
    return __DISJOINT_TYPE_MAPPING__[dbp_type]
def get_equivalent_types(dbp_type: str) -> set:
    """Return the set of equivalent types to the given type (including itself)."""
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
        # remove external types from equivalent mappings as they are prone to errors
        def _is_valid(t):
            return dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING
        filtered_mapping = {}
        for t in __EQUIVALENT_TYPE_MAPPING__:
            if _is_valid(t):
                filtered_mapping[t] = {et for et in __EQUIVALENT_TYPE_MAPPING__[t] if _is_valid(et)}
        __EQUIVALENT_TYPE_MAPPING__ = defaultdict(set, filtered_mapping)
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
def get_resources(category: str) -> set:
    """Return all resources of the given category."""
    global __CATEGORY_RESOURCES__
    if '__CATEGORY_RESOURCES__' not in globals():
        # build (or load from cache) the category -> resources mapping on first access
        def _init():
            articles_file = utils.get_data_file('files.dbpedia.category_articles')
            return rdf_util.create_multi_val_dict_from_rdf([articles_file], rdf_util.PREDICATE_SUBJECT, reverse_key=True)
        __CATEGORY_RESOURCES__ = utils.load_or_create_cache('dbpedia_category_resources', _init)
    return __CATEGORY_RESOURCES__[category]
def get_resource_categories(dbp_resource: str) -> set:
    """Return all categories the given resource is contained in."""
    global __RESOURCE_CATEGORIES__
    if '__RESOURCE_CATEGORIES__' not in globals():
        # build (or load from cache) the resource -> categories mapping on first access
        def _init():
            articles_file = utils.get_data_file('files.dbpedia.category_articles')
            return rdf_util.create_multi_val_dict_from_rdf([articles_file], rdf_util.PREDICATE_SUBJECT)
        __RESOURCE_CATEGORIES__ = utils.load_or_create_cache('dbpedia_resource_categories', _init)
    return __RESOURCE_CATEGORIES__[dbp_resource]
def get_equivalent_types(dbp_type: str) -> set:
    """Return the set of equivalent types to the given type (including itself).

    NOTE(review): this is a duplicate definition of `get_equivalent_types` in this file and,
    being defined later, it shadows the earlier one. The earlier version additionally filters
    out external (non-DBpedia) types, which the shadowing copy silently dropped; that
    filtering is restored here so behavior matches the documented version. Consider deleting
    one of the two copies.
    """
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        # fixed alias: use `utils` like the rest of this module (was `util`)
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
        # remove external types from equivalent mappings as they are prone to errors
        __EQUIVALENT_TYPE_MAPPING__ = defaultdict(set, {
            t: {et for et in __EQUIVALENT_TYPE_MAPPING__[t] if dbp_util.is_dbp_type(et) or et == rdf_util.CLASS_OWL_THING}
            for t in __EQUIVALENT_TYPE_MAPPING__ if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING
        })
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
def get_children(category: str) -> set:
    """Return all direct child categories of the given category (excluding the category itself)."""
    global __CHILDREN__
    if '__CHILDREN__' not in globals():
        # fixed alias: use `utils` like the rest of this module (was `util`)
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(
            [utils.get_data_file('files.dbpedia.categories')], rdf_util.PREDICATE_BROADER, reverse_key=True)
        __CHILDREN__ = utils.load_or_create_cache('dbpedia_category_children', initializer)
    # a category is never its own child, even if the data contains a self-reference
    return __CHILDREN__[category].difference({category})
def _get_resource_type_mapping() -> dict:
    """Return the (cached) mapping from resources to their (transitive) DBpedia types."""
    global __RESOURCE_TYPE_MAPPING__
    if '__RESOURCE_TYPE_MAPPING__' not in globals():
        # fixed alias: use `utils` like the rest of this module (was `util`)
        type_files = [
            utils.get_data_file('files.dbpedia.instance_types'),
            utils.get_data_file('files.dbpedia.transitive_instance_types'),
        ]
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(type_files, rdf_util.PREDICATE_TYPE)
        __RESOURCE_TYPE_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_type_mapping', initializer)
    return __RESOURCE_TYPE_MAPPING__
def _get_type_graph() -> nx.DiGraph:
    """Return the initialised graph of DBpedia types."""
    global __TYPE_GRAPH__
    if '__TYPE_GRAPH__' not in globals():
        taxonomy_file = utils.get_data_file('files.dbpedia.taxonomy')
        subtype_mapping = rdf_util.create_multi_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_SUBCLASS_OF, reverse_key=True)
        # add missing types (i.e. those, that do not have subclasses at all)
        all_types = rdf_util.create_set_from_rdf([taxonomy_file], rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_CLASS)
        for t in all_types:
            for et in get_equivalent_types(t):
                if et not in subtype_mapping:
                    subtype_mapping[et] = set()
        # completing subtypes with subtypes of equivalent types
        subtype_mapping = {
            t: {est for et in get_equivalent_types(t) for st in subtype_mapping[et] for est in get_equivalent_types(st)}
            for t in set(subtype_mapping)
        }
        # remove non-dbpedia types from ontology
        subtype_mapping = {
            t: {st for st in sts if dbp_util.is_dbp_type(st) or st == rdf_util.CLASS_OWL_THING}
            for t, sts in subtype_mapping.items() if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING
        }
        # one directed edge per (type, subtype) pair
        edges = [(t, st) for t, sts in subtype_mapping.items() for st in sts]
        __TYPE_GRAPH__ = nx.DiGraph(incoming_graph_data=edges)
    return __TYPE_GRAPH__
def get_equivalent_predicates(dbp_predicate: str) -> set:
    """Return all equivalent predicates of a given predicate."""
    global __EQUIVALENT_PREDICATE__
    if '__EQUIVALENT_PREDICATE__' not in globals():
        # lazily build the predicate -> equivalent-predicates mapping from the taxonomy dump
        taxonomy_file = utils.get_data_file('files.dbpedia.taxonomy')
        __EQUIVALENT_PREDICATE__ = rdf_util.create_multi_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_EQUIVALENT_PROPERTY)
    return __EQUIVALENT_PREDICATE__[dbp_predicate]
def get_disambiguation_mapping() -> dict:
    """Return the (cached) mapping from disambiguation pages to the resources they disambiguate."""
    global __DISAMBIGUATIONS__
    if '__DISAMBIGUATIONS__' not in globals():
        # build (or load from cache) the mapping; wrap in a defaultdict so unknown keys yield an empty set
        def _init():
            disambiguations_file = utils.get_data_file('files.dbpedia.disambiguations')
            return rdf_util.create_multi_val_dict_from_rdf([disambiguations_file], rdf_util.PREDICATE_DISAMBIGUATES)
        __DISAMBIGUATIONS__ = defaultdict(set, utils.load_or_create_cache('dbpedia_resource_disambiguations', _init))
    return __DISAMBIGUATIONS__