Example #1
def _create_category_graph() -> nx.DiGraph:
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_TYPE))
    skos_edges = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_BROADER)
    skos_edges = [(p, c) for c, parents in skos_edges.items() for p in parents if p != c]
    wiki_category_edges = [(p, c) for c, ps in wikipedia.extract_parent_categories().items() for p in ps if p != c]
    graph = nx.DiGraph(incoming_graph_data=skos_edges + wiki_category_edges)
    graph.add_nodes_from(skos_nodes)

    # identify maintenance categories
    invalid_parent_categories = [
        'Hidden categories', 'Tracking categories', 'Disambiguation categories', 'Non-empty disambiguation categories',
        'All redirect categories', 'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories'
    ]
    invalid_categories = {c for ipc in invalid_parent_categories for c in graph.successors(cat_util.name2category(ipc))}
    # identify any remaining invalid categories (maintenance categories etc) using indicator tokens
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational'
    }
    for cat in graph:
        cat_tokens = {t.lower() for t in cat_util.remove_category_prefix(cat).split('_')}
        if cat.lower().endswith(ignored_category_endings) or cat_tokens.intersection(maintenance_category_indicators):
            invalid_categories.add(cat)
    invalid_categories.update(set(graph.nodes).difference(skos_nodes))  # only keep categories mentioned in skos
    invalid_categories.discard(utils.get_config('category.root_category'))  # make sure to keep root node
    graph.remove_nodes_from(invalid_categories)
    return graph
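A hedged usage sketch for this builder (assuming the SKOS and Wikipedia data files are configured; 'Biology' is a stand-in category name, and edges run from parent to child):

graph = _create_category_graph()
biology = cat_util.name2category('Biology')  # build the full category URI
if biology in graph:
    print(list(graph.successors(biology))[:5])  # a few child categories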
Example #2
def _get_type_graph() -> nx.DiGraph:
    """Return the initialised graph of DBpedia types."""
    global __TYPE_GRAPH__
    if '__TYPE_GRAPH__' not in globals():
        subtype_mapping = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, reverse_key=True)
        # add missing types (i.e. those that have no subclasses at all)
        all_types = rdf_util.create_set_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_CLASS)
        subtype_mapping.update({et: set() for t in all_types for et in get_equivalent_types(t) if et not in subtype_mapping})
        # complete the subtypes of each type with the subtypes of its equivalent types
        subtype_mapping = {t: {est for et in get_equivalent_types(t) for st in subtype_mapping[et] for est in get_equivalent_types(st)} for t in set(subtype_mapping)}
        # remove non-dbpedia types from ontology
        subtype_mapping = {t: {st for st in sts if dbp_util.is_dbp_type(st) or st == rdf_util.CLASS_OWL_THING} for t, sts in subtype_mapping.items() if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING}
        __TYPE_GRAPH__ = nx.DiGraph(incoming_graph_data=[(t, st) for t, sts in subtype_mapping.items() for st in sts])
    return __TYPE_GRAPH__
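The `'__TYPE_GRAPH__' not in globals()` test implements lazy module-level caching: the graph is built on the first call and reused by all later calls. A minimal, self-contained sketch of the same pattern (names are illustrative):

def _build_expensive_resource():
    return {'built': True}  # stand-in for an expensive computation

def get_expensive_resource():
    global __EXPENSIVE_RESOURCE__
    if '__EXPENSIVE_RESOURCE__' not in globals():
        __EXPENSIVE_RESOURCE__ = _build_expensive_resource()  # runs only on the first call
    return __EXPENSIVE_RESOURCE__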
Example #3
def get_inverse_resource_property_mapping() -> dict:
    """Return a mapping from DBpedia resources to a dict containing property-value assignments (containing inverted facts of DBpedia)."""
    global __INVERSE_RESOURCE_PROPERTY_MAPPING__
    if '__INVERSE_RESOURCE_PROPERTY_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_dict_from_rdf([utils.get_data_file('files.dbpedia.mappingbased_objects')], reverse_key=True)
        __INVERSE_RESOURCE_PROPERTY_MAPPING__ = utils.load_or_create_cache('dbpedia_inverse_resource_properties', initializer)
    return __INVERSE_RESOURCE_PROPERTY_MAPPING__
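`utils.load_or_create_cache` apparently adds a persistence layer on top of the same idea: return the stored cache if it exists, otherwise run the zero-argument initializer and store the result. A hedged sketch of such a helper using pickle (the real implementation may differ):

import os
import pickle

def load_or_create_cache(cache_id: str, initializer, cache_dir: str = 'cache'):
    # illustrative stand-in for utils.load_or_create_cache
    path = os.path.join(cache_dir, f'{cache_id}.p')
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = initializer()
    os.makedirs(cache_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result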
Example #4
def get_label(dbp_object: str) -> str:
    """Return the label of a DBpedia resource or type."""
    global __RESOURCE_LABELS__
    if '__RESOURCE_LABELS__' not in globals():
        __RESOURCE_LABELS__ = dict(_get_label_mapping())
        __RESOURCE_LABELS__.update(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL))
    return __RESOURCE_LABELS__[dbp_object] if dbp_object in __RESOURCE_LABELS__ else dbp_util.object2name(dbp_object)
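Hedged usage (the URIs are illustrative; objects without a stored label fall back to a name derived from the URI):

print(get_label('http://dbpedia.org/resource/Berlin'))        # stored rdfs:label if present
print(get_label('http://dbpedia.org/resource/Some_Unknown'))  # fallback via dbp_util.object2name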
Example #5
def get_label_category(label: str) -> str:
    """Return the category that fits the given label best."""
    global __INVERSE_CATEGORY_LABELS__
    if '__INVERSE_CATEGORY_LABELS__' not in globals():
        labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_PREFLABEL)
        __INVERSE_CATEGORY_LABELS__ = {v: k for k, v in labels.items()}
    return __INVERSE_CATEGORY_LABELS__[label] if label in __INVERSE_CATEGORY_LABELS__ else cat_util.name2category(label)
Example #6
def get_label(category: str) -> str:
    """Return the label for the given category."""
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_PREFLABEL)

    return __CATEGORY_LABELS__[category] if category in __CATEGORY_LABELS__ else cat_util.category2name(category)
Example #7
def get_topics(category: str) -> set:
    """Return the topics for the given category."""
    global __TOPICS__
    if '__TOPICS__' not in globals():
        __TOPICS__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.topical_concepts')], rdf_util.PREDICATE_SUBJECT)

    return __TOPICS__[category]
Example #8
def get_resource_property_mapping() -> dict:
    """Return a mapping from DBpedia resources to a dict containing property-value assignments (containing facts of DBpedia)."""
    global __RESOURCE_PROPERTY_MAPPING__
    if '__RESOURCE_PROPERTY_MAPPING__' not in globals():
        property_files = [utils.get_data_file('files.dbpedia.mappingbased_literals'), utils.get_data_file('files.dbpedia.mappingbased_objects')]
        initializer = lambda: rdf_util.create_dict_from_rdf(property_files)
        __RESOURCE_PROPERTY_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_properties', initializer)
    return __RESOURCE_PROPERTY_MAPPING__
Example #9
def get_equivalent_types(dbp_type: str) -> set:
    """Return the set of equivalent types to the given type (including itself)."""
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
        # remove external types from equivalent mappings as they are prone to errors
        __EQUIVALENT_TYPE_MAPPING__ = defaultdict(set, {t: {et for et in __EQUIVALENT_TYPE_MAPPING__[t] if dbp_util.is_dbp_type(et) or et == rdf_util.CLASS_OWL_THING} for t in __EQUIVALENT_TYPE_MAPPING__ if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING})
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
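Illustrative call (the type URI is an example); note that the result always contains the input type itself because of the final union:

person = 'http://dbpedia.org/ontology/Person'
print(get_equivalent_types(person))  # {person} plus any owl:equivalentClass matches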
Example #10
def get_resources(category: str) -> set:
    """Return all resources of the given category."""
    global __CATEGORY_RESOURCES__
    if '__CATEGORY_RESOURCES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT, reverse_key=True)
        __CATEGORY_RESOURCES__ = utils.load_or_create_cache('dbpedia_category_resources', initializer)

    return __CATEGORY_RESOURCES__[category]
Example #11
def get_resource_categories(dbp_resource: str) -> set:
    """Return all categories the given resource is contained in."""
    global __RESOURCE_CATEGORIES__
    if '__RESOURCE_CATEGORIES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT)
        __RESOURCE_CATEGORIES__ = utils.load_or_create_cache('dbpedia_resource_categories', initializer)

    return __RESOURCE_CATEGORIES__[dbp_resource]
Example #12
    def load_info(self, body):
        # map each concrete celestial body to its data file; abstract body
        # types (CELESTIAL_BODY, STAR, PLANET, DWARF_PLANET,
        # NATURAL_SATELLITE, SUN, MOON) have no data file yet and are ignored
        body_data_files = {
            CelestialBodyType.MERCURY: "mercury",
            CelestialBodyType.VENUS: "venus",
            CelestialBodyType.EARTH: "earth",
            CelestialBodyType.MARS: "mars",
            CelestialBodyType.JUPITER: "jupiter",
            CelestialBodyType.SATURN: "saturn",
            CelestialBodyType.URANUS: "uranus",
            CelestialBodyType.NEPTUNE: "neptune",
        }
        if body in body_data_files:
            self.load_file(get_data_file(body_data_files[body]))
Example #13
def resolve_redirect(dbp_resource: str, visited=None) -> str:
    """Return the resource to which `dbp_resource` redirects (if any) or `dbp_resource` itself."""
    global __REDIRECTS__
    if '__REDIRECTS__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.redirects')], rdf_util.PREDICATE_REDIRECTS)
        __REDIRECTS__ = utils.load_or_create_cache('dbpedia_resource_redirects', initializer)

    if dbp_resource in __REDIRECTS__:
        visited = visited or set()
        if dbp_resource not in visited:
            return resolve_redirect(__REDIRECTS__[dbp_resource], visited | {dbp_resource})
    return dbp_resource
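The `visited` set guards against redirect cycles. A toy illustration with hypothetical redirect data:

__REDIRECTS__ = {'A': 'B', 'B': 'A'}  # hypothetical loop: A -> B -> A
print(resolve_redirect('A'))  # -> 'A'; recursion stops once 'A' is in `visited`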
Example #14
def _compute_inverse_lexicalisations():
    # count how often a lexicalisation points to a given resource
    inv_lex_counts = rdf_util.create_multi_val_count_dict_from_rdf([utils.get_data_file('files.dbpedia.anchor_texts')], rdf_util.PREDICATE_ANCHOR_TEXT, reverse_key=True)
    # make sure that redirects are taken into account
    for lex, resources in inv_lex_counts.items():
        for res in set(resources):
            redirect_res = resolve_redirect(res)
            if res != redirect_res:
                inv_lex_counts[lex][redirect_res] += inv_lex_counts[lex][res]
                del inv_lex_counts[lex][res]
    # convert to frequencies before returning
    return defaultdict(dict, {sub: {obj: count / sum(inv_lex_counts[sub].values()) for obj, count in obj_counts.items()} for sub, obj_counts in inv_lex_counts.items()})
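A toy illustration of the final count-to-frequency conversion (the counts are hypothetical, e.g. for the anchor text 'NYC'):

counts = {'New_York_City': 3, 'NYC_(disambiguation)': 1}
total = sum(counts.values())
print({res: cnt / total for res, cnt in counts.items()})  # {'New_York_City': 0.75, 'NYC_(disambiguation)': 0.25}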
Example #15
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label."""
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    if '__RESOURCE_INVERSE_LABELS__' not in globals():
        __RESOURCE_INVERSE_LABELS__ = {v: k for k, v in _get_label_mapping().items()}
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL)
        __ONTOLOGY_INVERSE_LABELS__ = {v: k for k, v in ontology_labels.items()}
    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
Example #16
def get_disjoint_types(dbp_type: str) -> set:
    """Return all types that are disjoint with `dbp_type` (excluding the wrong disjointness Agent<->Place)."""
    global __DISJOINT_TYPE_MAPPING__
    if '__DISJOINT_TYPE_MAPPING__' not in globals():
        __DISJOINT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_DISJOINT_WITH, reflexive=True)
        # add/remove custom axioms
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {k: {v for v in values if {k, v} not in REMOVED_DISJOINTNESS_AXIOMS} for k, values in __DISJOINT_TYPE_MAPPING__.items()})
        for a, b in ADDED_DISJOINTNESS_AXIOMS:
            __DISJOINT_TYPE_MAPPING__[a].add(b)
            __DISJOINT_TYPE_MAPPING__[b].add(a)

        # complete the disjoint types of each type with the transitive subtypes of those disjoint types (disjointness is inherited by subtypes)
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {t: {st for dt in disjoint_types for st in get_transitive_subtype_closure(dt)} for t, disjoint_types in __DISJOINT_TYPE_MAPPING__.items()})
    return __DISJOINT_TYPE_MAPPING__[dbp_type]
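The closure step encodes that disjointness is inherited downwards: if, say, Agent is declared disjoint with Place and City is a subtype of Place, then Agent is also treated as disjoint with City.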
Example #17
def _retrieve_training_data_gs(nlp: Language):
    training_data = []
    with open(utils.get_data_file(
            'files.listpages.goldstandard_named-entity-tagging'),
              mode='r') as f:
        for line in f:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                entities.append(
                    (point['start'], point['end'] + 1, annotation['label'][0]))
            training_data.append(
                Example.from_dict(nlp.make_doc(text), {'entities': entities}))
    return training_data
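Note that `Example.from_dict(nlp.make_doc(text), {'entities': entities})` is the spaCy v3 way of constructing training examples; the entity tuples use character offsets with an exclusive end, which is why 1 is added to the annotation's inclusive end offset.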
Example #18
def _retrieve_plaintexts() -> Iterator[Tuple[str, str]]:
    """Return an iterator over DBpedia resources and their Wikipedia plaintexts."""
    with bz2.open(utils.get_data_file('files.dbpedia.nif_context'),
                  mode='rb') as nif_file:
        nif_collection = pynif.NIFCollection.loads(nif_file.read(),
                                                   format='turtle')
        for nif_context in nif_collection.contexts:
            resource_uri = nif_context.original_uri[:nif_context.original_uri.rfind('?')]
            # remove parentheses and line breaks from text for easier parsing
            resource_plaintext = nif_context.mention.replace('\n', ' ')
            resource_plaintext = nlp_util.remove_bracket_content(
                resource_plaintext, substitute='')
            resource_plaintext = nlp_util.remove_bracket_content(
                resource_plaintext, bracket_type='[', substitute='')
            yield resource_uri, resource_plaintext
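Because the function yields, callers consume it lazily, for example:

for resource_uri, plaintext in _retrieve_plaintexts():
    print(resource_uri, plaintext[:80])
    break  # stop after the first NIF context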
Example #19
def compute_hypernyms(category_graph) -> dict:
    """Retrieves all hypernym relationships from the three sources (Wiki corpus, WebIsALOD, Category axioms)."""
    hypernyms = defaultdict(set)

    # collect hypernyms from axiom matches between Wikipedia categories
    cat_headlemmas = category_graph.get_node_LHS()
    axiom_hypernyms = defaultdict(lambda: defaultdict(int))
    for parent, child in category_graph.get_axiom_edges():
        for cl in cat_headlemmas[child]:
            for pl in cat_headlemmas[parent]:
                axiom_hypernyms[cl.lower()][pl.lower()] += 1

    # load remaining hypernyms
    wiki_hypernyms = utils.load_cache('wikipedia_hypernyms')
    webisalod_data = pickle.load(
        bz2.open(utils.get_data_file('files.dbpedia.webisalod_hypernyms'),
                 mode='rb'))
    webisalod_hypernyms = defaultdict(dict)
    for parent, child, conf in webisalod_data:
        webisalod_hypernyms[child][parent] = conf

    # merge hypernyms
    candidates = set(axiom_hypernyms) | set(wiki_hypernyms) | set(
        webisalod_hypernyms)
    for candidate in candidates:
        hyper_count = defaultdict(int)
        if candidate in axiom_hypernyms:
            for word, count in axiom_hypernyms[candidate].items():
                if count >= THRESHOLD_AXIOM:
                    hyper_count[word] += 2
        if candidate in wiki_hypernyms:
            for word, count in wiki_hypernyms[candidate].items():
                if count >= THRESHOLD_WIKI:
                    hyper_count[word] += 1
        if candidate in webisalod_hypernyms:
            for word, conf in webisalod_hypernyms[candidate].items():
                if conf >= THRESHOLD_WEBISALOD:
                    hyper_count[word] += 1
        hypernyms[candidate] = {
            word
            for word, count in hyper_count.items() if count > 1
        }

    return hypernyms
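The merge is a weighted vote: axiom matches above THRESHOLD_AXIOM contribute 2, while Wiki-corpus and WebIsALOD matches above their thresholds contribute 1 each, and only words with a total above 1 are kept. A hypernym is therefore accepted if it is backed by the category axioms alone or by both of the other two sources.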
Example #20
def get_disambiguation_mapping() -> dict:
    """Return a mapping from disambiguation pages to the resources they disambiguate."""
    global __DISAMBIGUATIONS__
    if '__DISAMBIGUATIONS__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.disambiguations')], rdf_util.PREDICATE_DISAMBIGUATES)
        __DISAMBIGUATIONS__ = defaultdict(set, utils.load_or_create_cache('dbpedia_resource_disambiguations', initializer))
    return __DISAMBIGUATIONS__
Example #21
def _get_label_mapping() -> dict:
    """Return a mapping from DBpedia resources to their labels."""
    global __RESOURCE_LABEL_MAPPING__
    if '__RESOURCE_LABEL_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.labels')], rdf_util.PREDICATE_LABEL)
        __RESOURCE_LABEL_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_labels', initializer)
    return __RESOURCE_LABEL_MAPPING__
Example #22


class Planet:
    def __init__(self, planet_data):
        self.name, self.resource, self.influence, self.planet_type, self.planet_tech = parse_planet_data(
            planet_data)


class System:
    def __init__(self, planet_names):
        self.planets = [
            get_planet(planet_name) for planet_name in planet_names
        ]


PLANETS = [
    Planet(planet_data) for planet_data in utils.get_data_file(PLANETS_PATH)
]
SYSTEMS = [
    System(planet_names) for planet_names in utils.get_data_file(SYSTEMS_PATH)
]
print(len(PLANETS))

names1 = [planet.name for planet in PLANETS]

names2 = [[planet.name for planet in system.planets] for system in SYSTEMS]

import more_itertools

print(sorted(names1) == sorted(more_itertools.flatten(names2)))
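The two prints serve as a sanity check on the data files: the second compares the flattened per-system planet names against the full planet list as multisets, so it prints True only if every planet occurs in the systems exactly as often as it occurs in PLANETS.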
Example #23
def get_main_equivalence_types() -> set:
    """Return the types that occur as subjects of subClassOf statements in the taxonomy."""
    global __MAIN_EQUIVALENCE_TYPES__
    if '__MAIN_EQUIVALENCE_TYPES__' not in globals():
        __MAIN_EQUIVALENCE_TYPES__ = rdf_util.create_set_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, None)
    return __MAIN_EQUIVALENCE_TYPES__
Example #24
def get_equivalent_predicates(dbp_predicate: str) -> set:
    """Return all equivalent predicates of a given predicate."""
    global __EQUIVALENT_PREDICATE__
    if '__EQUIVALENT_PREDICATE__' not in globals():
        __EQUIVALENT_PREDICATE__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_PROPERTY)
    return __EQUIVALENT_PREDICATE__[dbp_predicate]
Example #25
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range of a given predicate."""
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        __PREDICATE_RANGE__ = defaultdict(lambda: None, rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_RANGE))
    return __PREDICATE_RANGE__[dbp_predicate]
Example #26
def _get_resource_type_mapping() -> dict:
    """Return a mapping from DBpedia resources to their types."""
    global __RESOURCE_TYPE_MAPPING__
    if '__RESOURCE_TYPE_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.instance_types')], rdf_util.PREDICATE_TYPE)
        __RESOURCE_TYPE_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_type_mapping', initializer)
    return __RESOURCE_TYPE_MAPPING__
Example #27
def parse_race_data(race_data):
    # race_data is a (name, starting_values) pair, as produced by the .items() call on the race-config dict below
    name, starting_values = race_data
    technologies = starting_values['technologies']
    units = starting_values['units']
    home_planets = starting_values['home planets']
    commodities = starting_values['commodities']

    return name, technologies, units, home_planets, commodities


def print_race_data(race):
    print('Race name:\n\t', race.name)
    print('Techs:\n\t', ', '.join(race.technologies))
    print('Starting planets:\n\t', '\n\t '.join(race.home_planets))
    print('Units:')
    for unit in race.units:
        print('\t', unit, ':', race.units[unit])
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')


class Race:
    def __init__(self, race_data):
        self.name, self.technologies, self.units, planet_names, self.commodities = parse_race_data(
            race_data)
        self.home_planets = [
            planet.get_planet(planet_name) for planet_name in planet_names
        ]


RACES = [
    Race(race_info) for race_info in utils.get_data_file(RACES_PATH).items()
]
random.shuffle(RACES)
Example #28
    def _show_info_cb(self, widget):
        if self.screen != Screen.INFO:
            self.set_screen(Screen.INFO)
            self.info_view.load_file(get_data_file("index"))
Example #29
def _parse_raw_markup_from_xml() -> dict:
    utils.get_logger().info('WIKIPEDIA/XML: Parsing raw markup from XML dump..')
    parser = etree.XMLParser(target=WikiPageParser())
    with bz2.open(utils.get_data_file('files.wikipedia.pages')) as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        return {dbp_util.name2resource(p): markup for p, markup in page_markup.items()}
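When an lxml `XMLParser` is given a `target`, `etree.parse` returns whatever the target's `close()` method returns, which is why `page_markup` can be treated as a dict here. A hedged sketch of the relevant target interface (`WikiPageParser` itself is not shown in the example):

class MinimalPageParser:
    # illustrative parser target: collects data during parsing,
    # then hands it back as the result of etree.parse()
    def __init__(self):
        self.pages = {}
    def start(self, tag, attrib):
        pass  # called for each opening tag
    def end(self, tag):
        pass  # called for each closing tag
    def data(self, text):
        pass  # called for text content
    def close(self):
        return self.pages  # becomes the return value of etree.parse()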