Example #1
def _get_lines_dbpedia_instance_transitive_caligraph_types(graph) -> list:
    """Serialize transitive CaLiGraph types for DBpedia resources."""
    instance_transitive_clg_types = []

    caligraph_ancestors = defaultdict(set)
    for n in graph.traverse_nodes_topdown():
        parents = graph.parents(n)
        caligraph_ancestors[n] = parents | {
            a
            for p in parents for a in caligraph_ancestors[p]
        }

    for res in graph.get_all_resources():
        dbp_res = clg_util.clg_resource2dbp_resource(res)
        if dbp_res not in dbp_store.get_resources():
            continue

        types = graph.get_nodes_for_resource(res)
        direct_types = types.difference(
            {a
             for t in types for a in caligraph_ancestors[t]})
        transitive_types = {
            tt
            for t in direct_types for tt in graph.ancestors(t)
        }.difference(direct_types | {rdf_util.CLASS_OWL_THING})
        instance_transitive_clg_types.extend([
            serialize_util.as_object_triple(dbp_res, rdf_util.PREDICATE_TYPE,
                                            tt) for tt in transitive_types
        ])
    return instance_transitive_clg_types
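
The ancestor sets above are built in a single pass because `traverse_nodes_topdown` is assumed to visit parents before their children, so every node can reuse the already-completed ancestor sets of its parents. A minimal, self-contained sketch of the same accumulation pattern on a toy hierarchy (the `toy_parents` dict and `topdown_order` list are illustrative, not part of the original code):

from collections import defaultdict

# toy hierarchy: A -> B -> C (A is the root); the order lists parents before children
toy_parents = {'A': set(), 'B': {'A'}, 'C': {'B'}}
topdown_order = ['A', 'B', 'C']

ancestors = defaultdict(set)
for node in topdown_order:
    parents = toy_parents[node]
    # a node's ancestors are its parents plus all ancestors of those parents
    ancestors[node] = parents | {a for p in parents for a in ancestors[p]}

assert ancestors['C'] == {'A', 'B'}  # transitive closure after one pass
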
Example #2
def _compute_inverse_type_frequencies() -> dict:
    """Return an IDF-like weight for every predicate based on how many types it co-occurs with."""
    predicate_types = defaultdict(set)
    for r in dbp_store.get_resources():
        for pred in dbp_store.get_properties(r):
            predicate_types[pred].update(dbp_store.get_transitive_types(r))

    overall_type_count = len(dbp_store.get_all_types())
    return {
        pred: math.log(overall_type_count / (len(predicate_types[pred]) + 1))
        for pred in dbp_store.get_all_predicates()
    }
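
The returned weights behave like an inverse document frequency over types: a predicate that co-occurs with almost every type gets a weight near zero, while a type-specific predicate gets a large weight. A small worked example with made-up counts (the numbers are purely illustrative):

import math

overall_type_count = 1000
# a generic predicate seen with 999 types vs. a specific one seen with 9 types
print(math.log(overall_type_count / (999 + 1)))  # 0.0  -> carries no type signal
print(math.log(overall_type_count / (9 + 1)))    # ~4.6 -> highly type-specific
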
Example #3
def _get_resource_surface_scores(text):
    """Return resource lexicalisation scores for the given text."""
    resource_surface_scores = {}
    if not text:
        return resource_surface_scores
    resource_surface_scores[text] = 1
    direct_match = dbp_store.resolve_redirect(dbp_util.name2resource(text))
    if direct_match in dbp_store.get_resources():
        resource_surface_scores[direct_match] = 1
    for surface_match, frequency in sorted(
            dbp_store.get_inverse_lexicalisations(text.lower()).items(),
            key=operator.itemgetter(1)):
        resource_surface_scores[surface_match] = frequency
    return resource_surface_scores
Example #4
def _compute_property_frequencies() -> dict:
    property_frequencies = defaultdict(lambda: defaultdict(int))
    for r in dbp_store.get_resources():
        types = dbp_store.get_transitive_types(r)
        for pred, values in dbp_store.get_properties(r).items():
            for t in types:
                property_frequencies[t][pred] += len(values)
    return defaultdict(
        lambda: defaultdict(float), {
            t: defaultdict(
                float, {
                    pred: (1 + math.log(count) if count > 0 else 0)
                    for pred, count in property_frequencies[t].items()
                })
            for t in property_frequencies
        })
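
Two details of the returned structure are worth noting: counts are damped sublinearly with `1 + log(count)` (as in TF weighting), and the nested `defaultdict` makes lookups of unseen type/predicate combinations yield `0.0` instead of raising a `KeyError`. A hedged illustration with fabricated identifiers and counts:

import math
from collections import defaultdict

frequencies = defaultdict(lambda: defaultdict(float))
frequencies['dbo:Person']['dbo:birthPlace'] = 1 + math.log(150)  # ~6.0

print(frequencies['dbo:Person']['dbo:birthPlace'])        # damped count
print(frequencies['dbo:Person']['dbo:unknownPredicate'])  # 0.0, no KeyError
print(frequencies['dbo:UnknownType']['dbo:birthPlace'])   # 0.0 as well
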
Example #5
def _get_lines_instances_dbpedia_mapping(graph) -> list:
    """Serialize DBpedia mapping for resources."""
    lines_instances_dbpedia_mapping = []
    axiom_resources = {
        ax[1]
        for n in graph.nodes for ax in graph.get_axioms(n, transitive=False)
        if clg_util.is_clg_resource(ax[1])
    }
    for res in graph.get_all_resources() | axiom_resources:
        equivalent_res = clg_util.clg_resource2dbp_resource(res)
        if equivalent_res in dbp_store.get_resources():
            lines_instances_dbpedia_mapping.append(
                serialize_util.as_object_triple(res,
                                                rdf_util.PREDICATE_SAME_AS,
                                                equivalent_res))
    return lines_instances_dbpedia_mapping
Example #6
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories."""
    axioms = {}

    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # Voting required to discover Z (case 2)
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {
                            normalize_val(val)
                            for val in values
                        }
                        if value in normalized_values:
                            predicate_counts[pred] += 1
            if predicate_counts:
                pred = max(predicate_counts.items(),
                           key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)

    # map values to dbpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}

    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(cat, pred, val)
            for pred, cat_vals in valid_axioms.values()
            for cat, val in cat_vals.items()}
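
In case 2 the predicate is picked by a simple majority vote over all matches between sub-category values and resource property values. A minimal illustration of just that voting step, using made-up predicate names:

import operator
from collections import defaultdict

predicate_counts = defaultdict(int)
for matched_pred in ['dbo:birthPlace', 'dbo:birthPlace', 'dbo:deathPlace']:
    predicate_counts[matched_pred] += 1

winning_pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
print(winning_pred)  # 'dbo:birthPlace'
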
Example #7
def _get_lines_dbpedia_instances(graph) -> list:
    """Serialize new DBpedia resources in DBpedia namespace."""
    lines_dbpedia_instances = []
    new_instances = {
        clg_util.clg_resource2dbp_resource(res)
        for res in graph.get_all_resources()
    }.difference(dbp_store.get_resources())
    for inst in new_instances:
        lines_dbpedia_instances.append(
            serialize_util.as_object_triple(
                inst, rdf_util.PREDICATE_TYPE,
                rdf_util.CLASS_OWL_NAMED_INDIVIDUAL))
        label = graph.get_label(clg_util.dbp_resource2clg_resource(inst))
        if label:
            lines_dbpedia_instances.append(
                serialize_util.as_literal_triple(inst,
                                                 rdf_util.PREDICATE_LABEL,
                                                 label))
    return lines_dbpedia_instances
Example #8
def _compute_type_resource_scores(graph, node: str,
                                  direct_resources_only: bool) -> dict:
    """Return the relative frequency of each DBpedia type among the resources of a node."""
    node_resources = graph.get_resources_from_categories(node)
    if not direct_resources_only or len(
        [r for r in node_resources if dbp_store.get_types(r)]) < 5:
        node_resources.update({
            r
            for sn in graph.descendants(node)
            for r in graph.get_resources_from_categories(sn)
        })
    node_resources = node_resources.intersection(dbp_store.get_resources())
    if len(node_resources) < 5:
        # better not to return anything if the number of resources is too small
        return {}
    type_counts = defaultdict(int)
    for res in node_resources:
        for t in dbp_store.get_transitive_types(res):
            type_counts[t] += 1
    return {t: count / len(node_resources) for t, count in type_counts.items()}
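
The result maps every transitive DBpedia type to the fraction of the node's resources that carry it; nodes with fewer than 5 mapped resources yield an empty dict. A tiny example of the final normalisation step with hypothetical counts:

# hypothetical: 10 resources on a node, 8 typed as Person, 3 as Athlete
type_counts = {'dbo:Person': 8, 'dbo:Athlete': 3}
node_resource_count = 10

type_scores = {t: count / node_resource_count for t, count in type_counts.items()}
print(type_scores)  # {'dbo:Person': 0.8, 'dbo:Athlete': 0.3}
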
Example #9
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules form `pattern_dict` and return the implied axioms."""
    cat_words = cat_store.get_label(cat).split(' ')

    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, cat_words)
    if not axiom_patterns:
        return set()

    (pred, pred_type), additional_axioms = axiom_patterns
    front_pattern_idx = pattern_lengths[0] or None
    back_pattern_idx = -1 * pattern_lengths[1] or None
    resource = ' '.join(cat_words[front_pattern_idx:back_pattern_idx])

    if pred_type:
        resource = dbp_util.name2resource(resource)
        if resource not in dbp_store.get_resources(
        ) or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    return {(cat, pred, resource)} | {(cat, pred, val)
                                      for pred, val in additional_axioms}
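
The `or None` in the index computation is deliberate: if a pattern has length 0, slicing with `0` at the back (`cat_words[...:-0]`) would yield an empty list, whereas `None` means "up to the end of the list". A quick demonstration of the difference:

words = ['list', 'of', 'german', 'composers']

front_len, back_len = 2, 0           # pattern only covers the first two words
front_idx = front_len or None        # 2
back_idx = -1 * back_len or None     # -0 is falsy, so this becomes None
print(words[front_idx:back_idx])     # ['german', 'composers']
print(words[2:-0])                   # [] -- the naive slice would drop everything
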
Example #10
    def get_resource_stats(self, node: str) -> dict:
        """Return resource stats of a node (i.e. resource count and property count)."""
        if node not in self._node_resource_stats:
            resource_count = 0
            new_resource_count = 0
            property_counts = defaultdict(int)

            transitive_resource_count = 0
            transitive_new_resource_count = 0
            transitive_property_counts = defaultdict(int)

            for res in self.get_resources_from_categories(node):
                if res in dbp_store.get_resources():
                    resource_count += 1
                    transitive_resource_count += 1
                    for pred, values in dbp_store.get_properties(res).items():
                        for val in values:
                            property_counts[(pred, val)] += 1
                            transitive_property_counts[(pred, val)] += 1
                else:
                    new_resource_count += 1
                    transitive_new_resource_count += 1
            for child in self.children(node):
                child_stats = self.get_resource_stats(child)
                transitive_resource_count += child_stats[
                    'transitive_resource_count']
                transitive_new_resource_count += child_stats[
                    'transitive_new_resource_count']
                for prop, count in child_stats[
                        'transitive_property_counts'].items():
                    transitive_property_counts[prop] += count
            self._node_resource_stats[node] = {
                'resource_count': resource_count,
                'new_resource_count': new_resource_count,
                'property_counts': property_counts,
                'transitive_resource_count': transitive_resource_count,
                'transitive_new_resource_count': transitive_new_resource_count,
                'transitive_property_counts': transitive_property_counts
            }
        return self._node_resource_stats[node]
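
The stats are memoised in `self._node_resource_stats` and the transitive counts are aggregated recursively over the children, so repeated calls while scoring many nodes stay cheap. A hedged usage sketch; the node URI is an assumption for illustration only:

# hypothetical usage of the method above on some graph instance
stats = graph.get_resource_stats('http://caligraph.org/ontology/Scientist')
print(stats['resource_count'])                 # resources of the node itself
print(stats['transitive_new_resource_count'])  # includes all descendant nodes
top_property = max(stats['transitive_property_counts'].items(),
                   key=lambda item: item[1],
                   default=None)
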
Example #11
def _get_lines_dbpedia_instance_types(graph) -> list:
    """Serialize new types for DBpedia resources in DBpedia namespace."""
    new_dbpedia_types = defaultdict(set)
    for node in graph.nodes:
        node_types = graph.get_transitive_dbpedia_types(node,
                                                        force_recompute=True)
        transitive_node_types = {
            tt
            for t in node_types
            for tt in dbp_store.get_transitive_supertype_closure(t)
        }.difference({rdf_util.CLASS_OWL_THING})
        for res in graph.get_resources(node):
            dbp_res = clg_util.clg_resource2dbp_resource(res)
            if dbp_res in dbp_store.get_resources():
                new_dbpedia_types[dbp_res].update(
                    transitive_node_types.difference(
                        dbp_store.get_transitive_types(dbp_res)))
            else:
                new_dbpedia_types[dbp_res].update(transitive_node_types)
    return [
        serialize_util.as_object_triple(res, rdf_util.PREDICATE_TYPE, t)
        for res, types in new_dbpedia_types.items() for t in types
    ]
Example #12
def _get_lines_dbpedia_instance_relations(graph) -> list:
    """Serialize new facts for DBpedia resources in DBpedia namespace."""
    new_instance_relations = set()
    for node in graph.nodes:
        for prop, val in graph.get_axioms(node):
            dbp_prop = clg_util.clg_type2dbp_type(prop)
            dbp_val = clg_util.clg_resource2dbp_resource(
                val) if clg_util.is_clg_resource(val) else val
            for res in graph.get_resources(node):
                dbp_res = clg_util.clg_resource2dbp_resource(res)
                if dbp_res not in dbp_store.get_resources(
                ) or dbp_prop not in dbp_store.get_properties(
                        dbp_res) or dbp_val not in dbp_store.get_properties(
                            dbp_res)[dbp_prop]:
                    new_instance_relations.add((dbp_res, dbp_prop, dbp_val))
    lines_dbpedia_instance_relations = []
    for s, p, o in new_instance_relations:
        if dbp_util.is_dbp_resource(o):
            lines_dbpedia_instance_relations.append(
                serialize_util.as_object_triple(s, p, o))
        else:
            lines_dbpedia_instance_relations.append(
                serialize_util.as_literal_triple(s, p, o))
    return lines_dbpedia_instance_relations
Example #13
def _generate_dbpedia_coverage_graph():
    """Create graph of Figure 4a"""
    # retrieve data from extracted axioms and assertions
    cat2ax_relation_axioms = pd.read_csv(
        util.get_results_file('results.cat2ax.relation_axioms'), sep=';')
    cat2ax_type_axioms = pd.read_csv(
        util.get_results_file('results.cat2ax.type_axioms'), sep=';')
    cat2ax_relation_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.relation_assertions'), sep=';')
    cat2ax_type_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.type_assertions'), sep=';')

    catriple_relation_axioms = pd.read_csv(
        util.get_results_file('results.catriple.relation_axioms'), sep=';')
    catriple_relation_triples = pd.read_csv(
        util.get_results_file('results.catriple.relation_assertions'), sep=';')

    cdf_relation_axioms = pd.read_csv(
        util.get_results_file('results.cdf.relation_axioms'), sep=';')
    cdf_type_axioms = pd.read_csv(
        util.get_results_file('results.cdf.type_axioms'), sep=';')
    cdf_relation_triples = pd.read_csv(
        util.get_results_file('results.cdf.relation_assertions'), sep=';')
    cdf_type_triples = pd.read_csv(
        util.get_results_file('results.cdf.type_assertions'), sep=';')

    # retrieve unique entity counts
    cat2ax_cat_count = len(
        set(cat2ax_relation_axioms['cat'].unique())
        | set(cat2ax_type_axioms['cat'].unique()))
    catriple_cat_count = len(set(catriple_relation_axioms['cat'].unique()))
    cdf_cat_count = len(
        set(cdf_relation_axioms['cat'].unique())
        | set(cdf_type_axioms['cat'].unique()))
    total_cat_count = len(cat_store.get_usable_cats())

    cat2ax_preds = cat2ax_relation_triples.groupby(by='pred').count()
    cat2ax_pred_count = len(cat2ax_preds[cat2ax_preds['sub'] >= 100].index)
    catriple_preds = catriple_relation_triples.groupby(by='pred').count()
    catriple_pred_count = len(
        catriple_preds[catriple_preds['sub'] >= 100].index)
    cdf_preds = cdf_relation_triples.groupby(by='pred').count()
    cdf_pred_count = len(cdf_preds[cdf_preds['sub'] >= 100].index)
    total_pred_count = len(dbp_store.get_all_predicates())

    cat2ax_res_count = len(
        set(cat2ax_relation_triples['sub'].unique())
        | set(cat2ax_type_triples['sub'].unique()))
    catriple_res_count = len(set(catriple_relation_triples['sub'].unique()))
    cdf_res_count = len(
        set(cdf_relation_triples['sub'].unique())
        | set(cdf_type_triples['sub'].unique()))
    total_res_count = len(dbp_store.get_resources())

    # initialise bars
    bars_ca = [
        cat2ax_cat_count / total_cat_count, cat2ax_res_count / total_res_count,
        cat2ax_pred_count / total_pred_count
    ]
    bars_ct = [
        catriple_cat_count / total_cat_count,
        catriple_res_count / total_res_count,
        catriple_pred_count / total_pred_count
    ]
    bars_cdf = [
        cdf_cat_count / total_cat_count, cdf_res_count / total_res_count,
        cdf_pred_count / total_pred_count
    ]

    # arrange bars
    bar_width = 0.25
    r1 = np.arange(len(bars_ca))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]

    # make plot
    plt.figure(figsize=(8, 5))
    plt.bar(r1,
            bars_ca,
            color='#2d7f5e',
            width=bar_width,
            edgecolor='white',
            label='Cat2Ax')
    plt.bar(r2,
            bars_ct,
            color='darkgrey',
            width=bar_width,
            edgecolor='white',
            label='Catriple')
    plt.bar(r3,
            bars_cdf,
            color='black',
            width=bar_width,
            edgecolor='white',
            label='C-DF')
    plt.ylabel('Fraction of items covered', fontsize=16)
    plt.xticks([r + bar_width for r in range(len(bars_ca))],
               ['(1) Categories', '(2) Resources', '(3) Properties'],
               fontsize=16)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=15)
    ax = plt.gca()
    ax.yaxis.grid()

    plt.savefig(util.get_results_file('results.graphs.dbpedia_coverage'),
                bbox_inches='tight')