Example #1
class SubM:

    # Initializer
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        # must be initialized (e.g. equiClassManager(path_eq)) before
        # obtain_unnecessary_relations is called
        self.equi_graph_manager = None
        self.diagnosed_relations = []  # the result
        # triples from the manual decisions and Joe's sameAs data
        self.suggestion_on_relations = []
        self.leaf_classes = set()

    # the graph includes all the triples with subClassOf as predicate
    def setup_graph(self):
        (subclass_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subClassOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))
        self.graph.add_edges_from(collect_pairs)

    # for the sake of efficiency, we remove all the leaf nodes of the graph
    # (classes that do not have subclasses; by definition they cannot
    # participate in any cycle)
    def filter_leaf_classes(self):
        for c in self.graph.nodes:
            #test if this node is a leaf
            (_, cardi) = self.enquiry(query=(0, self.id_subClassOf, c),
                                      mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        print('there are a total of', len(self.leaf_classes),
              'leaf nodes removed')
        for c in self.leaf_classes:
            self.remove_class(c)

    # a similar function to that of networkx
    def remove_class(self, c, comment='remove'):
        if self.graph.has_node(c):
            # remove_node also removes all edges incident to c
            self.graph.remove_node(c)
            # self.diagnosed_classes[c] = comment

    # a similar function to that of networkx
    def remove_class_from(self, cs, comment='remove'):
        for c in cs:
            self.remove_class(c, comment)

    # This is for future use of the SUBMASSIVE system. A user may ignore this for now.
    def enquiry(self, query, mode="subm"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # "subm" mode (examine the filtered part first) is not yet
            # implemented; it currently returns None
            pass

    # Similar to that of networkx
    def remove_relation(self, sub, sup, comment='remove'):
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
            self.diagnose_relations(sub, sup, comment)

    # Similar to that of networkx
    def remove_relation_from(self, relation_list, comment='remove'):
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    # there is only one term that has a different id when retrieved as Subject or Object
    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    # there is only one term that has a different id when retrieved as Subject or Object
    def convert_to_term(self, id):
        if id == 2601100675:
            return "akt742:Intangible-Thing"
            # this is the only one that has two different ids (as subject and object)
        else:
            return self.hdt.convert_id(id, IdentifierPosition.Subject)

    # remove the reflexive edges (c subClassOf c)
    def filter_reflexsive(self):
        to_remove = set()
        with open('reflexive.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            for e in self.graph.edges():
                (l, r) = e
                if l == r:
                    to_remove.add(e)
            print('Number of removed reflexive relations', len(to_remove))
            for (l, r) in to_remove:
                l_term = self.convert_to_term(l)
                r_term = self.convert_to_term(r)
                writer.writerow([l, l_term, r, r_term, 'remove', 'o'])

        self.graph.remove_edges_from(list(to_remove))

    def print_graph_info(self):
        print('there are ', len(self.graph.nodes()), ' nodes')
        print('there are ', len(self.graph.edges()), ' edges')

    # compare against the owl:sameAs and owl:equivalentClass relations;
    # if an edge coincides with such a relation, remove it
    def obtain_unnecessary_relations(self):
        to_remove = set()
        with open('equivalent-unnecessary-relations.csv', 'w',
                  newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            count_i = 0
            count_s = 0
            for e in self.graph.edges():
                label = ''
                (l, r) = e
                # convert to terms
                l_term = self.convert_to_term(l)
                r_term = self.convert_to_term(r)

                # Step 1: owl:equivalentClass (in either direction)
                (_, cardinality1) = self.enquiry(
                    query=(l, self.id_equivalentClass, r), mode="default")
                (_, cardinality2) = self.enquiry(
                    query=(r, self.id_equivalentClass, l), mode="default")

                if cardinality1 == 1 or cardinality2 == 1:
                    label = 'i'
                    count_i += 1
                # Step 2: owl:sameAs (requires an initialized manager)
                if self.equi_graph_manager.test_equivalent(l_term, r_term):
                    label += 's'
                    count_s += 1
                if label != '':
                    to_remove.add(e)
                    writer.writerow([l, l_term, r, r_term, 'remove', label])
        print('count_s = ', count_s)
        print('count_i = ', count_i)
        print('Number of removed unnecessary relations', len(to_remove))
        self.graph.remove_edges_from(list(to_remove))

    # for the sake of memory efficiency, we can load these unnecessary
    # relations directly, because the sameAs data is very large
    def load_unnecessary_relations(self):  # to self.suggestion_on_relations
        with open('equivalent-unnecessary-relations.csv', 'r') as eq_file:
            reader = csv.DictReader(eq_file)
            for row in reader:
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]  # should be 'remove'
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')

    # load the manual decisions on size-two cycles
    def load_manually_decided_relations(
            self):  # to self.suggestion_on_relations
        with open('lod-two-cycle.csv', 'r') as man_file:
            reader = csv.DictReader(man_file)
            coll_nodes = []
            for row in reader:
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]
                coll_nodes.append(s_id)
                coll_nodes.append(o_id)
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')
        return coll_nodes

    def find_nodes_in_cycles(self, hint_nodes, max_nodes, found_min):
        # work on a copy of the graph
        tmp_graph = self.graph.copy()
        # collect every node that participates in at least one cycle
        nodes = set()

        flag = True
        count_found_cycles = 0
        while flag:
            try:
                try:
                    # try the hint nodes first
                    c = nx.find_cycle(tmp_graph,
                                      hint_nodes)  # change to simple_cycles ??
                except nx.NetworkXNoCycle:
                    # no cycle reachable from the hints; search the whole graph
                    c = nx.find_cycle(tmp_graph)
                count_found_cycles += 1
                print('Found cycle ', count_found_cycles, ' is: ', c)
                c_nodes = [x for (x, y) in c]

                # break this cycle by removing one random edge
                (l_tmp, r_tmp) = random.choice(c)
                tmp_graph.remove_edge(l_tmp, r_tmp)
                nodes.update(c_nodes)
                if len(nodes) >= max_nodes and count_found_cycles >= found_min:
                    print('total nodes = ', len(nodes))
                    flag = False
                else:
                    hint_nodes = c_nodes + hint_nodes
            except nx.NetworkXNoCycle:
                # no cycle remains in the graph
                flag = False

        nodes = list(nodes)
        print('there are in total ', len(nodes),
              ' nodes that participate in cycles')
        print(nodes)
        return nodes

    def get_cycles_from_nodes(self, nodes):
        coll_cycles = []  # a list, not a set
        # obtain the subgraph induced by the nodes
        subg = self.graph.subgraph(nodes)

        simp_c = list(nx.simple_cycles(subg))
        print(' and these nodes have ', len(simp_c),
              ' simple cycles among them')
        # next, process these cycles and get ready to encode: turn each node
        # list into its list of consecutive edges (with wraparound)
        for c in simp_c:
            if len(c) == 2:
                (l, r) = c
                coll_cycles.append([(l, r), (r, l)])
            else:
                cycle = []
                for i in range(len(c)):
                    cycle.append((c[i], c[(i + 1) % len(c)]))
                coll_cycles.append(cycle)
        return coll_cycles  # ready for encoding
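
# A minimal usage sketch of the SubM pipeline above (illustrative only; it
# assumes PATH_LOD points at a valid HDT file, that lod-two-cycle.csv is
# available, and the thresholds 500 and 10 are made-up example values).
subm = SubM()
subm.setup_graph()
subm.print_graph_info()
subm.filter_reflexsive()  # drop c subClassOf c edges
subm.filter_leaf_classes()  # drop classes without subclasses
hint = subm.load_manually_decided_relations()
cycle_nodes = subm.find_nodes_in_cycles(hint, 500, 10)
cycles = subm.get_cycles_from_nodes(cycle_nodes)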
Example #2
id_type = hdt_lod.convert_term(
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    IdentifierPosition.Predicate)
id_subClassOf = hdt_lod.convert_term(
    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    IdentifierPosition.Predicate)
id_equivalentClass = hdt_lod.convert_term(
    "http://www.w3.org/2002/07/owl#equivalentClass",
    IdentifierPosition.Predicate)

# output some stats of LOD-a-lot
# we can query the HDT file using the term IDs (e.g. rdf:type and equivalentClass) or the URIs (e.g. subClassOf and sameAs)
print("# subjects:", "{:,}".format(hdt_lod.nb_subjects))
print("# predicates:", "{:,}".format(hdt_lod.nb_predicates))
print("# objects:", "{:,}".format(hdt_lod.nb_objects))
(triples, cardinality) = hdt_lod.search_triples("", "", "")
print("# triples:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_type, 0)
print("# rdf:type statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples(
    "", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "")
print("# rdfs:subClassOf statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_equivalentClass, 0)
print("# owl:equivalentClass statements:", "{:,}".format(cardinality))
(triples,
 cardinality) = hdt_lod.search_triples("",
                                       "http://www.w3.org/2002/07/owl#sameAs",
                                       "")
print("# owl:sameAs statements:", "{:,}".format(cardinality))


def serializeObject(obj):
    ser_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return ser_obj
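
# A hedged companion sketch (not in the original source): the inverse of
# serializeObject, restoring an object from the pickled bytes.
def deserializeObject(ser_obj):
    return pickle.loads(ser_obj)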
Example #3
class SubP:

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)

        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)

        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)

        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            IdentifierPosition.Predicate)

        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentProperty",
            IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        # must be initialized (e.g. equiClassManager(path_eq)) before use
        self.equi_graph_manager = None
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        # triples from the manual decisions and Joe's sameAs data
        self.suggestion_on_relations = []
        self.leaf_classes = set()

        print('finished initialization')

    def setup_graph(self):
        print('set up the graph')
        (subproperty_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subPropertyOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subproperty_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))

        print('there are ', len(collect_pairs), 'edges')
        self.graph.add_edges_from(collect_pairs)

    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        if id == 2601100675:
            return "akt742:Intangible-Thing"
            # this is the only one that has two different ids (as subject and object)
        else:
            return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def enquiry(self, query, mode="subp"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # "subp" mode (examine the filtered part first) is not yet
            # implemented; it currently returns None
            pass

    def print_info(self, sbj, obj):
        predicate_names = [
            "http://sw.cyc.com/CycAnnotations_v1#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#label"
        ]

        s_domain = tldextract.extract(sbj).domain
        o_domain = tldextract.extract(obj).domain
        # filter that domain
        # if (s_domain != DOMAIN and o_domain != DOMAIN):
        #     # print (DOMAIN)
        print('SUBJECT: ', sbj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(sbj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')
        print('OBJECT: ', obj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(obj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')

        print('\n\n========================\n\n')

    def export_cycle(self):
        simp_c = list(nx.simple_cycles(self.graph))
        print('searching for simple cycles in the graph')
        print('there are ', len(simp_c), ' simple cycles')

        count1 = 0
        count_others = 0
        count_sameas = 0
        count_eqProp = 0
        count_bigger = 0

        collect_self_loop = []
        collect_eq = []
        collect_others = []
        collect_bigger = []
        for c in simp_c:
            if len(c) == 1:
                count1 += 1
                collect_self_loop.append(c)
            elif len(c) == 2:
                # print (c)
                # for n in c:
                #     t = self.convert_to_term(n)
                #     print ('\t', t)
                # print ('\n')

                l_term = self.convert_to_term(c[0])
                r_term = self.convert_to_term(c[1])

                # query for owl:equivalentProperty between the two
                (eq_triple_ids, cardinality) = self.enquiry(
                    query=(c[0], self.id_equivalentProperty, c[1]),
                    mode="default")

                # if (self.equi_graph_manager.test_equivalent(l_term, r_term)):
                #     print ('There is a owl:sameAs relation in between')
                #     count_sameas += 1
                #     collect_eq.append(c)

                if cardinality > 0:
                    print('There is an owl:equivalentProperty in between')
                    count_eqProp += 1
                    collect_eq.append(c)
                else:
                    # self.print_info(c[0], l_term, c[1], r_term)
                    # print ('a longer one for manual decision:', c)
                    collect_others.append(c)
                count_others += 1
            else:
                count_bigger += 1
                # encode the cycle as its consecutive edges (with wraparound)
                for i in range(len(c)):
                    collect_bigger.append((c[i], c[(i + 1) % len(c)]))

        print('there are ', count1, ' reflexive cycles')

        print('there are ', count_sameas, ' sameAs relations')
        print('there are ', count_eqProp, ' eqProp relations')
        print('there are ', count_others, ' size-two cycles')
        print('there are ', count_bigger, ' bigger cycles')
        # export the collected cycles to CSV; assumption: the output file is
        # the pre-subP.csv that load_removed() reads back after manual review
        file_name = 'pre-subP.csv'
        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            # write to file
            # print ('collect self loop: ',collect_self_loop)
            for [s_id] in collect_self_loop:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = s_term
                writer.writerow([s_id, s_term, s_id, o_term, 'remove',
                                 'o'])  # removed from automatic method
            for (s_id, o_id) in collect_eq:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'e'])  # removed from automatic method

            for (s_id, o_id) in collect_others:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                self.print_info(s_term, o_term)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 '2'])  # removed from manual step
                writer.writerow([o_id, o_term, s_id, s_term, 'remove',
                                 '2'])  # removed from manual step

            for (s_id, o_id) in collect_bigger:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # print ('===a longer cycle ===', c)

                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'x'])  # removed from manual step

    def load_removed(self):
        with open('pre-subP.csv', 'r') as subp_file:
            reader = csv.DictReader(subp_file)
            coll_removed = []
            for row in reader:
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]  # should be 'remove'

                if sug == 'remove':
                    coll_removed.append((s_id, o_id))
        print('number of removed edges:', len(coll_removed))
        self.graph.remove_edges_from(coll_removed)

    def test_cycle(self):
        try:
            c = nx.find_cycle(self.graph)  # change to simple_cycles ??
            print('cycle = ', c)
        except nx.NetworkXNoCycle:
            print('no cycle')

    def export_graph_nt(self, name):
        g = Graph()
        sub_property_of = URIRef(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf")
        for (s_id, o_id) in self.graph.edges:
            s_term = self.convert_to_term(s_id)
            o_term = self.convert_to_term(o_id)
            g.add((URIRef(s_term), sub_property_of, URIRef(o_term)))

        # print("--- printing raw triples ---")
        # for s, p, o in g:
        #     print((s, p, o))

        g.serialize(destination=name, format='nt')
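
# A minimal usage sketch of the SubP pipeline above (illustrative only; the
# .nt file name is a made-up example, and pre-subP.csv is assumed to have been
# reviewed manually between export_cycle and load_removed).
subp = SubP()
subp.setup_graph()
subp.export_cycle()  # classify the cycles and write removal suggestions
subp.load_removed()  # apply the 'remove' suggestions from pre-subP.csv
subp.test_cycle()  # verify that the property hierarchy is now acyclic
subp.export_graph_nt('reduced_subPropertyOf.nt')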
Example #4
def generate_reduced():
    # Q1 : retrieve the subClassOf relations
    # hdt_file = None
    # output_filename = None
    # output_selfloopClass_filename = None
    # output_leafClass_filename = None
    # output_intermediateClass_filename = None

    # if sys.argv[1] == 'lod':
    hdt_file = HDTDocument(PATH_LOD)
    output_filename = 'reduced_lod_subClassOf.csv'
    output_selfloopClass_filename = 'lod_reflexive_classes.csv'
    output_leafClass_filename = 'lod_leaf_classes.csv'
    output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    # else:
    #     hdt_file = HDTDocument(PATH_DBpedia)
    #     output_filename = 'dbpedia_subClassOf.csv'
    #     output_selfloopClass_filename = 'dbpedia_selfloop_classes.csv'
    #     output_leafClass_filename = 'dbpedia_leaf_classes.csv'
    #     output_intermediateClass_filename = 'further_reduced_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    count_selfloop = 0
    count_leaf = 0
    count_left = 0
    count_output_after_further_reduced = 0  # statements left after the further reduction
    # removed_leaf_classes = []
    (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")

    to_explore_ids = set() # to iterate through
    leaf_ids = set()
    removed_intermediate_ids = set() # removed intermediate nodes
    all_ids = set()
    with open(output_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        with open(output_intermediateClass_filename, 'w', newline='') as inter_file:
            writer_inter = csv.writer(inter_file)
            writer_inter.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])

            # Step 1: remove selfloops and leaf nodes
            with open(output_selfloopClass_filename, 'w', newline='') as selfloop_file:
                writer_selfloop = csv.writer(selfloop_file)
                writer_selfloop.writerow([ "ID", "URI"])

                with open(output_leafClass_filename, 'w', newline='') as leaf_file:
                    writer_leaf = csv.writer(leaf_file)
                    writer_leaf.writerow([ "ID", "URI"])


                    for (s, p, o) in subclass_triples:
                        s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                        o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                        all_ids.add(s_id)
                        all_ids.add(o_id)
                        count += 1
                        # store it in a csv file
                        if s == o: # self loop
                            count_selfloop += 1
                            writer_selfloop.writerow([s_id, s])
                        else:
                            (_, leaf_cardinality) = hdt_file.search_triples("", subClassOf, s)
                            # test if it is a leaf node
                            if leaf_cardinality == 0:
                                # there is no subclass, this is a leaf node/class
                                # write it to a file and store it
                                writer_leaf.writerow([s_id, s])
                                leaf_ids.add(s_id)
                                count_leaf += 1
                                # removed_leaf_classes.append(s)
                            # else:
                            #     # write what's left to the file
                            #     # SKIP: find intermediate for now
                            #     count_left += 1
                            #     writer.writerow([s_id, s, o_id, o])
            print('count leaf statements = ', count_leaf)
            print('count leaf (as set) = ', len(leaf_ids))
            print('count total statements = ', count)
            print('count total nodes (as set) = ', len(all_ids))
            print('NOW Part 2: Further Reduce ')  # further reduce it

            visited_sup = set()
            # near_leaf_sup = set()
            count_one = 0
            count_loop = 0
            for l_id in leaf_ids:
                count_loop += 1
                (leaf_triples, cardinality) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
                # get its superclass id: sup_id
                finished_this_leaf = False
                if cardinality == 1:
                    (l_id, lp_id, sup_id) = next(leaf_triples)
                    (_, sub_cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                    if sub_cardinality == 1:
                        # remove this superclass
                        count_one += 1
                        removed_intermediate_ids.add(sup_id)
                        visited_sup.add(sup_id)
                        (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                        for (sup_id, lp_id, supsup_id) in supsup_triples:
                            to_explore_ids.add(supsup_id)
                        finished_this_leaf = True

                # normal process
                if not finished_this_leaf:
                    # re-run the query; the iterator may have been consumed above
                    (leaf_triples, _) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
                    for (l_id, lp_id, sup_id) in leaf_triples:
                        if (sup_id not in visited_sup):
                            # lo_id = hdt_file.convert_term(lo, IdentifierPosition.Object)
                            (sup_triples, cardinality_back) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                            supflag = True  # whether this superclass only has leaf nodes as subclasses
                            if cardinality_back != 1:
                                for (child_id, lp_id, sup_id) in sup_triples:
                                    if child_id not in leaf_ids:
                                        supflag = False
                                        break

                            if supflag:
                                # near_leaf_sup.add(sup_id)
                                removed_intermediate_ids.add(sup_id)
                                (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                                for (sup_id, lp_id, supsup_id) in supsup_triples:
                                    to_explore_ids.add(supsup_id)
                            else:
                                to_explore_ids.add(sup_id)
                            visited_sup.add(sup_id)

                if count_loop % 100000 == 0:
                    print('leaf nodes processed:', count_loop)
                    print('count one = ', count_one)
                    print('near-leaf nodes = ', len(removed_intermediate_ids))
                    print('total visited nodes = ', len(visited_sup))
                    print('non-near-leaf nodes = ', len(visited_sup) - len(removed_intermediate_ids))
                    print('to explore = ', len(to_explore_ids))
            print('*********** after this data processing, we have only ', len(to_explore_ids), ' to explore for the next step')
            # finished data pre-processing

            record_to_explore_size = len(to_explore_ids)
            record_iteration = 0
            continue_flag = True
            while len(to_explore_ids) != 0 and continue_flag:
                # print ('still to explore: ', len(to_explore_ids))
                record_iteration += 1
                # iterate through the remaining nodes
                n_id = to_explore_ids.pop()
                (triples_id, cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, n_id)
                flag = True
                for (ns_id, np_id, no_id) in triples_id:
                    # if every subclass ns is either a leaf or an already-removed
                    # intermediate node, then n can be removed as well
                    if ns_id not in leaf_ids and ns_id not in removed_intermediate_ids:
                        # keep n for now
                        flag = False
                        break
                if flag:  # we are sure to remove it
                    removed_intermediate_ids.add(n_id)
                else:
                    to_explore_ids.add(n_id)  # add it back :(

                if record_iteration == 10000:
                    if record_to_explore_size != len(to_explore_ids):
                        print('total leaf nodes = ', len(leaf_ids))
                        print('accumulated removed intermediate = ', len(removed_intermediate_ids))
                        print('still to explore = ', len(to_explore_ids))
                        print('record to explore = ', record_to_explore_size)
                        print('changed = ', record_to_explore_size - len(to_explore_ids))
                        record_iteration = 0
                        record_to_explore_size = len(to_explore_ids)
                    else:
                        continue_flag = False

            # write down the statements that survive the reduction

            print('*****size of leaf:', len(leaf_ids))
            print('*****size of removed intermediate nodes:', len(removed_intermediate_ids))
            (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
            for (s, p, o) in subclass_triples:
                s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                # store it in a csv file
                if s != o:
                    # if s is not a leaf node and not a removed intermediate node
                    if (s_id not in leaf_ids) and (s_id not in removed_intermediate_ids):
                        # write what's left to the file
                        count_output_after_further_reduced += 1
                        writer_inter.writerow([s_id, s, o_id, o])

            print('total entries = ', count)
            print('total self-loops = ', count_selfloop)
            print('total leaf nodes/classes = ', count_leaf)
            print('total left = ', count_left)
            print('percentage of reduction: ', count_left / count)
            print('=====AFTER FURTHER REDUCTION ======')
            print('There are only ', count_output_after_further_reduced)
            print('percentage of reduction: ', count_output_after_further_reduced / count)
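
# A toy sketch (illustrative only, separate from generate_reduced) of the
# fixed-point rule used above: a node can be removed once every one of its
# direct subclasses is a leaf or has itself already been removed.
def reduce_to_fixed_point(children_of, leaf_ids):
    # children_of maps a node id to the set of its direct subclasses
    removed = set()
    changed = True
    while changed:
        changed = False
        for node, children in children_of.items():
            if node in removed or node in leaf_ids:
                continue
            if children and all(c in leaf_ids or c in removed
                                for c in children):
                removed.add(node)
                changed = True
    return removed

# example: 'b' sits directly above the leaves, so it is removed first; after
# that, 'a' only has removed children and goes too:
# reduce_to_fixed_point({'a': {'b'}, 'b': {'x', 'y'}}, {'x', 'y'}) == {'a', 'b'}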
Example #5
class SubM:

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD):
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph = nx.Graph()
        self.diagnosed_relations = {}
        self.diagnosed_classes = {}
        self.leaf_classes = set()

    def setup_graph(self):
        (subclass_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subClassOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))
        self.graph.add_edges_from(collect_pairs)

    def remove_unnecessary_relations(self):
        for n in self.graph.nodes():
            # test if there is an edge between this node and another node which is also in the Graph
            (eq_triple_ids,
             cardinality) = self.enquiry(query=(n, self.id_equivalentClass, 0),
                                         mode="default")
            for (_, _, m) in eq_triple_ids:
                # test if it is in the Graph
                if m in self.graph.nodes():
                    self.remove_relation(n, m, 'equivalence')
            (eq_triple_ids,
             cardinality) = self.enquiry(query=(0, self.id_equivalentClass, n),
                                         mode="default")
            for (m, _, _) in eq_triple_ids:
                # test if it is in the Graph
                if m in self.graph.nodes():
                    self.remove_relation(m, n, 'equivalence')
        print('total relations diagnosed:', len(self.diagnosed_relations))

    def export_graph(self, export_file=None):
        collect_pairs = self.graph.edges
        if export_file is not None:
            with open(export_file, 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
                for (s_id, o_id) in collect_pairs:
                    s_term = self.convert_to_term(s_id)
                    o_term = self.convert_to_term(o_id)
                    # keep the column order consistent with the header
                    writer.writerow([s_id, s_term, o_id, o_term])

    # define a function subgraph, edges
    # G.edges
    # https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.edges.html#networkx.DiGraph.edges

    def remove_all_two_cycles(self):
        to_remove = set()
        for (l, r) in self.graph.edges():
            if (r, l) in self.graph.edges():
                to_remove.add((l, r))
                to_remove.add((r, l))
        print('there are in total', len(to_remove), ' two-cycle edges removed')
        for (l, r) in to_remove:
            self.graph.remove_edge(l, r)

    def load_manual_decisions(self, file, mode="remove"):
        # if mode == "ignore", we leave relations marked as unknown untouched
        # if mode == "remove", we remove all relations marked as unknown
        l_two = []
        with open(file, newline='') as two:
            reader_two = csv.DictReader(two)
            for row in reader_two:
                s_id = int(row['SUBJECT_ID'])
                o_id = int(row['OBJECT_ID'])
                if (s_id, o_id) not in self.diagnosed_relations.keys():
                    if row['SUGGESTION'] == 'remove':
                        self.remove_relation(s_id, o_id, comment='remove')
                        l_two.append((s_id, o_id))
                    else:
                        self.diagnosed_relations[(s_id,
                                                  o_id)] = row['SUGGESTION']
        print('there are in total ', len(l_two),
              ' relations removed from manual decisions')

    def enquiry(self, query, mode="subm"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # "subm" mode (examine the filtered part first) is not yet
            # implemented; it currently returns None
            pass

    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        if id == 2601100675:
            return "akt742:Intangible-Thing"
            # this is the only one that has two different ids (as subject and object)
        else:
            return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def remove_relation(self, sub, sup, comment='remove'):
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
            self.diagnose_relations(sub, sup, comment)

    def remove_relation_from(self, relation_list, comment='remove'):
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    def diagnose_relations(self, sub, sup, comment='default'):
        self.diagnosed_relations[(sub, sup)] = comment

    def diagnose_class(self, c, comment='default'):
        self.diagnosed_classes[c] = comment

    # TODO: split the cases of removal and comment

    def remove_class(self, c, comment='remove'):
        if self.graph.has_node(c):
            # remove_node also removes all edges incident to c
            self.graph.remove_node(c)
            self.diagnosed_classes[c] = comment

    def remove_class_from(self, cs, comment='remove'):
        for c in cs:
            self.remove_class(c, comment)

    def filter_leaf_classes(self):
        count = len(self.diagnosed_classes)

        for c in self.graph.nodes:
            #test if this node is a leaf
            (_, cardi) = self.enquiry(query=(0, self.id_subClassOf, c),
                                      mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        for c in self.leaf_classes:
            self.remove_class(c)
        print('there are a total of',
              len(self.diagnosed_classes) - count, 'leaf nodes removed')

    def get_domain_from_id(self, id):
        t = self.convert_to_term(id)
        return tldextract.extract(t).domain

    def filter_domain_classes(self, domain):
        filtered = set()
        for c in self.graph.nodes:
            t = self.convert_to_term(c)
            if (domain == tldextract.extract(t).domain):
                filtered.add(c)
        print('a total of ', len(filtered), ' removed w.r.t. domain ', domain)
        self.remove_class_from(list(filtered))

    def filter_reflexsive(self):
        to_remove = set()
        for e in self.graph.edges():
            (l, r) = e
            if l == r:
                to_remove.add(e)
        print('removed reflexive relations', len(to_remove))
        self.graph.remove_edges_from(list(to_remove))

    def print_cycles(self):
        flag = True
        while flag:
            try:
                cycle = nx.find_cycle(self.graph)
                print('found cycle', cycle)
                (l, r) = cycle[0]
                print(self.get_domain_from_id(l))
                # note: this destructively removes the printed cycle's edges
                self.graph.remove_edges_from(cycle)
            except nx.NetworkXNoCycle:
                flag = False
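
# A minimal usage sketch of this SubM variant (illustrative only; the CSV file
# names are assumptions, with the same column layout that load_manual_decisions
# expects, and print_cycles destructively removes the cycles it prints).
subm = SubM()
subm.setup_graph()
subm.filter_reflexsive()
subm.filter_leaf_classes()
subm.remove_unnecessary_relations()
subm.load_manual_decisions('lod-two-cycle.csv')
subm.print_cycles()
subm.export_graph('reduced_subClassOf.csv')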