Example #1: dump all rdfs:subClassOf triples from an HDT file into a CSV file
def generate():
    hdt_file = None
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'

    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, p, o) in subclass_triples:
            # store it in a csv file
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            # print ([s_id, s, o_id, o])
            count += 1
    print('total entries = ', count)
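
A sketch of the module context this function assumes: the pyHDT imports match the calls above, PATH_LOD reuses the value shown in Example #4, and PATH_DBpedia is a placeholder that must point at a local HDT copy of DBpedia.

# assumed module header for generate(); PATH_DBpedia is a placeholder
import sys
import csv
from hdt import HDTDocument, IdentifierPosition

PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"  # value taken from Example #4
PATH_DBpedia = "/path/to/dbpedia.hdt"                # placeholder, adjust locally

if __name__ == '__main__':
    # e.g. `python generate.py lod` writes all_lod_subClassOf.csv
    generate()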
Example #2: SubM, a class that loads the subClassOf relations into a networkx DiGraph and prunes reflexive, redundant, and cyclic edges
class SubM:

    # Initializer
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            self.subClassOf, IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            self.equivalent, IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        # must be set to equiClassManager(path_eq) before calling
        # obtain_unnecessary_relations(); left as None here
        self.equi_graph_manager = None
        self.diagnosed_relations = []  # the result
        # from the manual decisions and Joe's sameAs data; a list of triples
        self.suggestion_on_relations = []
        self.leaf_classes = set()

    # the graph includes all the triples with subClassOf as predicate
    def setup_graph(self):
        (subclass_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subClassOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))
        self.graph.add_edges_from(collect_pairs)

    # for the sake of efficiency, we remove all the leaf nodes of the graph (classes
    # that do not have subclasses; by definition they cannot participate in any cycle)
    def filter_leaf_classes(self):
        for c in self.graph.nodes:
            #test if this node is a leaf
            (_, cardi) = self.enquiry(query=(0, self.id_subClassOf, c),
                                      mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        print('there are a total of', len(self.leaf_classes),
              'leaf nodes removed')
        for c in self.leaf_classes:
            self.remove_class(c)

    # a thin wrapper around the corresponding networkx operation
    def remove_class(self, c, comment='remove'):
        if self.graph.has_node(c):
            self.graph.remove_node(
                c)  # this also removes all the edges related
            # self.diagnosed_classes[c]= comment
        # automatically,  remove the related edges connected

    # a thin wrapper around the corresponding networkx operation
    def remove_class_from(self, cs, comment='remove'):
        for c in cs:
            self.remove_class(c, comment)

    # This is for future use of the SUBMASSIVE system. A user may ignore this for now.
    def enquiry(self, query, mode="subm"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # examine the filtered part first
            pass

    # similar to the corresponding networkx operation
    def remove_relation(self, sub, sup, comment='remove'):
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
            self.diagnose_relations(sub, sup, comment)

    # similar to the corresponding networkx operation
    def remove_relation_from(self, relation_list, comment='remove'):
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    # only one term has different ids depending on whether it is retrieved
    # as Subject or as Object
    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    # only one term has different ids depending on whether it is retrieved
    # as Subject or as Object
    def convert_to_term(self, term_id):
        if term_id == 2601100675:
            # this is the only one that has two different ids (as subject and object)
            return "akt742:Intangible-Thing"
        else:
            return self.hdt.convert_id(term_id, IdentifierPosition.Subject)

    # remove the reflexive edges
    def filter_reflexsive(self):
        to_remove = set()
        file = open('reflexive.csv', 'w', newline='')
        writer = csv.writer(file)
        writer.writerow([
            "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
            "DECISION"
        ])
        for e in self.graph.edges():
            (l, r) = e
            if l == r:
                to_remove.add(e)
        print('Number of removed reflexive relations', len(to_remove))
        for (l, r) in to_remove:
            l_term = self.convert_to_term(l)
            r_term = self.convert_to_term(r)
            writer.writerow([l, l_term, r, r_term, 'remove', 'o'])

        self.graph.remove_edges_from(list(to_remove))
        file.close()

    def print_graph_info(self):
        print('there are ', len(self.graph.nodes()), ' nodes')
        print('there are ', len(self.graph.edges()), ' edges')

    # compare against the owl:sameAs relations and owl:equivalentClass relations;
    # at each iteration, if there is such an edge, then remove this one.
    def obtain_unnecessary_relations(self):
        to_remove = set()
        file = open('equivalent-unnecessary-relations.csv', 'w', newline='')
        writer = csv.writer(file)
        writer.writerow([
            "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
            "DECISION"
        ])
        count_i = 0
        count_s = 0
        for e in self.graph.edges():
            label = ''
            (l, r) = e
            # convert to terms
            l_term = self.convert_to_term(l)
            r_term = self.convert_to_term(r)

            # Step 1: Equivalence Class
            (_, cardinality1) = self.enquiry(
                query=(l, self.id_equivalentClass, r), mode="default")
            (_, cardinality2) = self.enquiry(
                query=(r, self.id_equivalentClass, l), mode="default")

            if (cardinality1 > 0 or cardinality2 > 0):
                label = 'i'
                count_i += 1
            # Step 2: owl:sameAs
            if (self.equi_graph_manager.test_equivalent(l_term, r_term)):
                label += 's'
                count_s += 1
            if label != '':
                to_remove.add(e)
                writer.writerow([l, l_term, r, r_term, 'remove', label])
        print('count_s = ', count_s)
        print('count_i = ', count_i)
        print('Number of removed unnecessary relations', len(to_remove))
        file.close()
        self.graph.remove_edges_from(list(to_remove))

    # for the sake of memory efficiency, we can load these unnecessary relations
    # directly; this is because the sameAs data is very big.
    def load_unnecessary_relations(self):  # to self.suggestion_on_relations
        with open('equivalent-unnecessary-relations.csv', 'r') as eq_file:
            reader = csv.DictReader(eq_file)
            for row in reader:
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]  # should be 'remove'
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')

    # load the manual decisions on size-two cycles
    def load_manually_decided_relations(self):  # to self.suggestion_on_relations
        coll_nodes = []
        with open('lod-two-cycle.csv', 'r') as man_file:
            reader = csv.DictReader(man_file)
            for row in reader:
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]
                coll_nodes.append(s_id)
                coll_nodes.append(o_id)
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')
        return coll_nodes

    def find_nodes_in_cycles(self, hint_nodes, max_nodes, found_min):
        # create a new graph
        tmp_graph = self.graph.copy()
        # find each node that participate in at least one cycle:
        nodes = set()

        flag = True  # flag for debugging
        count_found_cycles = 0
        while flag:
            try:
                c = []
                hint_not_working = False  #flag
                try:
                    c = nx.find_cycle(tmp_graph,
                                      hint_nodes)  # change to simple_cycles ??

                except Exception:
                    hint_not_working = True

                if hint_not_working:
                    c = nx.find_cycle(tmp_graph)
                count_found_cycles += 1
                print('Found Cycle ', count_found_cycles, ' is: ', c)
                c_nodes = [x for (x, y) in c]

                (l_tmp, r_tmp) = random.choice(c)
                tmp_graph.remove_edge(l_tmp, r_tmp)
                nodes.update(c_nodes)
                if len(nodes) >= max_nodes and count_found_cycles >= found_min:
                    print('total nodes = ', len(nodes))
                    flag = False
                else:
                    hint_nodes = c_nodes + hint_nodes
            except Exception as e:
                print(e)
                # print("There is no cycle anymore")
                flag = False

        nodes = list(nodes)
        print('there are in total ', len(nodes),
              '  nodes that participate in cycles')
        print(nodes)
        return nodes

    def get_cycles_from_nodes(self, nodes):
        coll_cycles = []  # a list, not a set
        # obtain a subgraph from the nodes
        subg = self.graph.subgraph(nodes)

        simp_c = list(nx.simple_cycles(subg))
        print(' and these nodes have ', len(simp_c),
              ' simple cycles among them')
        # next, process these cycles and get ready to encode
        for c in simp_c:
            if len(c) == 2:
                (l, r) = c
                coll_cycles.append([(l, r), (r, l)])
            else:
                # print ('original = ', c)
                cycle = []
                for i in range(len(c)):
                    j = i + 1
                    if i == len(c) - 1:
                        j = 0
                    cycle.append((c[i], c[j]))
                # print ('cycle = ', cycle)
                coll_cycles.append(cycle)
        return coll_cycles  # ready for encoding
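
A usage sketch for SubM, under stated assumptions: PATH_LOD and PATH_EQ are module constants (PATH_LOD appears in Example #4, PATH_EQ is assumed), equi_graph_manager must be initialized before obtain_unnecessary_relations() can run, and the thresholds passed to find_nodes_in_cycles are illustrative only.

# illustrative pipeline; thresholds and PATH_EQ are assumptions
subm = SubM(path_hdt=PATH_LOD, path_eq=PATH_EQ)
subm.setup_graph()
subm.print_graph_info()
subm.filter_reflexsive()                 # method name keeps the original spelling
subm.filter_leaf_classes()
subm.load_unnecessary_relations()        # reads equivalent-unnecessary-relations.csv
hint = subm.load_manually_decided_relations()   # reads lod-two-cycle.csv
nodes = subm.find_nodes_in_cycles(hint_nodes=hint, max_nodes=5000, found_min=100)
cycles = subm.get_cycles_from_nodes(nodes)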
Example #3: debugging helper that prints the subject- and object-position ids of hand-picked cycle members
def generate_reduced():
    # Q1 : retrieve the subClassOf relations
    # (only the id diagnostics below are exercised in this snippet; the
    # filenames are prepared for the later reduction steps)
    hdt_file = None
    output_filename = None
    output_selfloopClass_filename = None
    output_leafClass_filename = None
    output_intermediateClass_filename = None

    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'reduced_lod_subClassOf.csv'
        output_selfloopClass_filename = 'lod_selfloop_classes.csv'
        output_leafClass_filename = 'lod_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'dbpedia_subClassOf.csv'
        output_selfloopClass_filename = 'dbpedia_selfloop_classes.csv'
        output_leafClass_filename = 'dbpedia_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(subClassOf, IdentifierPosition.Predicate)

    (subclass_triples,
     cardinality) = hdt_file.search_triples("", subClassOf, "")  # unused in this debug snippet

    c1 = ['196338233', '196338418', '196338419']
    c2 = ['196338233', '196338325', '196338412']
    c3 = ['196337995', '196338014', '196338013']
    c4 = ['196338014', '196338063', '196338410']

    cs = [c1, c2, c3, c4]
    for c in cs:
        print('\n\n this cycle = ', c)
        for n in c:
            print('id =', n)
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when used as subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when used as object  = ', o_id)

    print('==================================')

    c1 = ['1193056652', '1193056593', '1193056657']
    c2 = ['1146303708', '1146299369', '1146331327']
    c3 = ['196338400', '196338312', '196338288']
    c4 = ['196338013', '196337995', '196338014']
    c5 = ['196338242', '196338410', '196337957']
    c6 = ['196338418', '196338419', '196338233']
    c7 = ['196338233', '196338325', '196338412']
    c8 = ['196338014', '196338063', '196338410']
    c9 = ['196338014', '196337975', '196338007']
    c10 = ['196338050', '196338049', '196337975']
    c11 = ['196338197', '196338462', '196338406']
    c12 = ['196338220', '196338217', '196338034']
    c13 = ['196338145', '196338152', '196338419']
    c14 = ['196338288', '196338116', '196337978']
    c15 = ['196338070', '196338360', '196338241']
    c16 = ['114657709', '114657713', '125181834']

    cs = [
        c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16
    ]
    for c in cs:
        print('\n\n that cycle = ', c)
        for n in c:
            print('id =', n)
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when used as subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when used as object  = ', o_id)
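
The two loops above repeat the same three lookups per node; a small reusable check (a sketch, not part of the original script) that flags ids differing between subject and object position:

def ids_differ(hdt_file, node_id):
    # True when a term resolves to different ids as subject vs. as object
    name = hdt_file.convert_id(int(node_id), IdentifierPosition.Subject)
    s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
    o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
    return s_id != o_id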
Example #4: module-level setup that loads LOD-a-lot and resolves the frequently used predicate ids
    return u"", exception.end


codecs.register_error("strict", strict_handler)

PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"
PATH_SAMEAS_NETWORK = "/home/jraad/ssd/data/identity-data/"
PATH_ID2TERMS_099 = "/home/jraad/ssd/data/identity-data-0_99/id2terms_0-99.csv"
PATH_TERM2ID_099 = "/home/jraad/ssd/data/identity-data-0_99/term2id_0-99.csv"

# load the LOD-a-lot HDT file
hdt_lod = HDTDocument(PATH_LOD)

# these identifiers will be used later to query the HDT file using their IDs
id_type = hdt_lod.convert_term(
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    IdentifierPosition.Predicate)
id_sameAs = hdt_lod.convert_term("http://www.w3.org/2002/07/owl#sameAs",
                                 IdentifierPosition.Predicate)
id_subClassOf = hdt_lod.convert_term(
    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    IdentifierPosition.Predicate)
id_equivalentClass = hdt_lod.convert_term(
    "http://www.w3.org/2002/07/owl#equivalentClass",
    IdentifierPosition.Predicate)

# output some stats of LOD-a-lot;
# the HDT file can be queried either by term IDs (e.g. rdf:type and owl:equivalentClass)
# or by URIs (e.g. rdfs:subClassOf and owl:sameAs)
print("# subjects:", "{:,}".format(hdt_lod.nb_subjects))
print("# predicates:", "{:,}".format(hdt_lod.nb_predicates))
print("# objects:", "{:,}".format(hdt_lod.nb_objects))
Example #5: reduce the subClassOf graph by removing self-loops, leaf classes, and near-leaf intermediate classes
def generate_reduced():
    # Q1 : retrieve the subClassOf relations
    # (this variant is hard-wired to LOD-a-lot; the DBpedia branch of the
    # earlier version has been removed)
    hdt_file = HDTDocument(PATH_LOD)
    output_filename = 'reduced_lod_subClassOf.csv'
    output_selfloopClass_filename = 'lod_reflexive_classes.csv'
    output_leafClass_filename = 'lod_leaf_classes.csv'
    output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    count_selfloop = 0
    count_leaf = 0
    count_left = 0
    count_output_after_further_reduced = 0 # count left of the further reduced
    # removed_leaf_classes = []
    (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")

    to_explore_ids = set() # to iterate through
    leaf_ids = set()
    removed_intermediate_ids = set() # removed intermediate nodes
    all_ids = set()
    with open(output_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        with open(output_intermediateClass_filename, 'w', newline='') as inter_file:
            writer_inter = csv.writer(inter_file)
            writer_inter.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])

            # Step 1: remove selfloops and leaf nodes
            with open(output_selfloopClass_filename, 'w', newline='') as selfloop_file:
                writer_selfloop = csv.writer(selfloop_file)
                writer_selfloop.writerow([ "ID", "URI"])

                with open(output_leafClass_filename, 'w', newline='') as leaf_file:
                    writer_leaf = csv.writer(leaf_file)
                    writer_leaf.writerow([ "ID", "URI"])


                    for (s, p, o) in subclass_triples:
                        s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                        o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                        all_ids.add(s_id)
                        all_ids.add(o_id)
                        count += 1
                        # store it in a csv file
                        if s == o: # self loop
                            count_selfloop += 1
                            writer_selfloop.writerow([s_id, s])
                        else:
                            (_, leaf_cardinality) = hdt_file.search_triples("", subClassOf, s)
                            # test if it is a leaf node
                            if leaf_cardinality == 0:
                                # there is no subclass, this is a leaf node/class
                                # write it to a file and store it
                                writer_leaf.writerow([s_id, s])
                                leaf_ids.add(s_id)
                                count_leaf += 1
                                # removed_leaf_classes.append(s)
                            # else:
                            #     # write what's left to the file
                            #     # SKIP: find intermediate for now
                            #     count_left += 1
                            #     writer.writerow([s_id, s, o_id, o])
            print('count leaf statements = ', count_leaf)
            print('count leaf (as set) = ', len(leaf_ids))
            print('count total statements = ', count)
            print('count total nodes (as set) = ', len(all_ids))
            print('NOW Part 2: Further Reduce')  # further reduce it

            visited_sup = set()
            # near_leaf_sup = set()
            count_one = 0
            count_loop = 0
            for l_id in leaf_ids:
                count_loop += 1
                (leaf_triples, cardinality) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
                # get its superclass id : sup_id
                finished_this_leaf = False
                if cardinality == 1:
                    (l_id, lp_id, sup_id) = next(leaf_triples)
                    (_, sub_cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                    if sub_cardinality == 1:
                        # remove this superclass
                        count_one += 1
                        removed_intermediate_ids.add(sup_id)
                        visited_sup.add(sup_id)
                        (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                        for (sup_id, lp_id, supsup_id) in supsup_triples:
                            to_explore_ids.add(supsup_id)
                        finished_this_leaf = True

                # normal process
                if not finished_this_leaf:
                    # re-run the query, since the cardinality-one branch above may
                    # already have consumed the iterator
                    (leaf_triples, _) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
                    for (l_id, lp_id, sup_id) in leaf_triples:
                        if (sup_id not in visited_sup):
                            # lo_id = hdt_file.convert_term(lo, IdentifierPosition.Object)
                            (sup_triples, cardinality_back) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                            supflag = True # if this superclass only has leaf nodes
                            if cardinality_back != 1:
                                for (child_id, lp_id, sup_id) in sup_triples:
                                    if child_id not in leaf_ids:
                                        supflag = False
                                        break

                            if supflag:
                                # near_leaf_sup.add(sup_id)
                                removed_intermediate_ids.add(sup_id)
                                (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                                for (sup_id, lp_id, supsup_id) in supsup_triples:
                                    to_explore_ids.add(supsup_id)
                            else:
                                to_explore_ids.add(sup_id)
                            visited_sup.add(sup_id)

                if count_loop % 100000 == 0:
                    print('leaf nodes processed:', count_loop)
                    print('count one = ', count_one)
                    print('near-leaf nodes = ', len(removed_intermediate_ids))
                    print('total visited nodes = ', len(visited_sup))
                    print('non-near-leaf nodes = ', len(visited_sup) - len(removed_intermediate_ids))
                    print('to explore = ', len(to_explore_ids))
            print('*********** after this data processing, we have only ', len(to_explore_ids), ' to explore for the next step')
            # finished data pre-processing

            record_to_explore_size = len(to_explore_ids)
            record_iteration = 0
            continue_flag = True
            while (len(to_explore_ids) != 0 and continue_flag):
                record_iteration += 1
                # iterate through the frontier
                n_id = to_explore_ids.pop()
                (triples_id, cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, n_id)
                flag = True
                for (ns_id, np_id, no_id) in triples_id:
                    # if each ns is either a leaf or intermediate but removed, then we remove it.
                    # ns_id = hdt_file.convert_term(ns, IdentifierPosition.Object)
                    if ns_id not in leaf_ids and ns_id not in removed_intermediate_ids:
                        # Keep it for now
                        flag = False
                        break
                if flag:  # we are sure to remove it
                    removed_intermediate_ids.add(n_id)
                else:
                    to_explore_ids.add(n_id)  # add it back

                if record_iteration == 10000:
                    if record_to_explore_size != len(to_explore_ids):
                        print('total leaf nodes = ', len(leaf_ids))
                        print('accumulated removed intermediate = ', len(removed_intermediate_ids))
                        print('still to explore = ', len(to_explore_ids))
                        print('record to explore = ', record_to_explore_size)
                        print('changed = ', record_to_explore_size - len(to_explore_ids))
                        record_iteration = 0
                        record_to_explore_size = len(to_explore_ids)
                    else:
                        continue_flag = False

            # write down what remains after removing leaves and intermediates

            print('*****size of leaf:', len(leaf_ids))
            print('*****size of removed intermediate nodes:', len(removed_intermediate_ids))
            (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
            for (s, p, o) in subclass_triples:
                s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                # store it in a csv file
                if s != o:
                    # if s is not a leaf node and not a removed intermediate node
                    if (s_id not in leaf_ids) and (s_id not in removed_intermediate_ids):
                        # write what's left to the file
                        count_output_after_further_reduced += 1
                        writer_inter.writerow([s_id, s, o_id, o])

            print('total entries = ', count)
            print('total self-loops = ', count_selfloop)
            print('total leaf nodes/classes = ', count_leaf)
            print('total left = ', count_left)  # stays 0 while the intermediate step above is skipped
            print('percentage of reduction: ', count_left / count)
            print('=====AFTER FURTHER REDUCTION ======')
            print('There are only ', count_output_after_further_reduced, ' statements left')
            print('percentage of reduction: ', count_output_after_further_reduced / count)
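
The fixed-point loop above implements one rule: a class may be dropped once every one of its direct subclasses is a leaf or has itself been dropped. A toy illustration of that rule on an in-memory edge list (a sketch with made-up class names, independent of the HDT file):

# toy version of the reduction rule, on plain Python sets
edges = [('A', 'B'), ('B', 'C'), ('D', 'C')]   # sub -> super, all made up
subs_of = {}
for sub, sup in edges:
    subs_of.setdefault(sup, set()).add(sub)
leaves = {s for s, _ in edges if s not in subs_of}   # classes with no subclasses
removed = set()
changed = True
while changed:
    changed = False
    for sup, subs in subs_of.items():
        if sup not in removed and subs <= (leaves | removed):
            removed.add(sup)        # all direct subclasses are leaves or removed
            changed = True
print(sorted(removed))              # ['B', 'C']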
Example #6: SubP, the rdfs:subPropertyOf analogue of SubM, with cycle export and graph serialization
class SubP:

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)

        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)

        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)

        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            IdentifierPosition.Predicate)

        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentProperty",
            IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        # must be set to equiClassManager(path_eq) before any sameAs checks;
        # left as None here
        self.equi_graph_manager = None
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        # from the manual decisions and Joe's sameAs data; a list of triples
        self.suggestion_on_relations = []
        self.leaf_classes = set()

        print('finished initialization')

    def setup_graph(self):
        print('set up the graph')
        (subproperty_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subPropertyOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subproperty_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))

        print('there are ', len(collect_pairs), 'edges')
        self.graph.add_edges_from(collect_pairs)

    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, term_id):
        if term_id == 2601100675:
            # this is the only one that has two different ids (as subject and object)
            return "akt742:Intangible-Thing"
        else:
            return self.hdt.convert_id(term_id, IdentifierPosition.Subject)

    def enquiry(self, query, mode="subp"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # examine the filtered part first
            pass

    def print_info(self, sbj, obj):
        predicate_names = [
            "http://sw.cyc.com/CycAnnotations_v1#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#label"
        ]

        s_domain = tldextract.extract(sbj).domain
        o_domain = tldextract.extract(obj).domain
        # filter that domain
        # if (s_domain != DOMAIN and o_domain != DOMAIN):
        #     # print (DOMAIN)
        print('SUBJECT: ', sbj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(sbj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')
        print('OBJECT: ', obj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(obj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')

        print('\n\n========================\n\n')

    def export_cycle(self):
        simp_c = list(nx.simple_cycles(self.graph))
        print('find simple cycle in graph')
        print('there are ', len(simp_c), ' simple cycles')

        count1 = 0
        count_others = 0
        count_sameas = 0
        count_eqProp = 0
        count_bigger = 0

        collect_self_loop = []
        collect_eq = []
        collect_others = []
        collect_bigger = []
        for c in simp_c:
            if len(c) == 1:
                count1 += 1
                collect_self_loop.append(c)
            elif len(c) == 2:
                l_term = self.convert_to_term(c[0])
                r_term = self.convert_to_term(c[1])

                # test for an owl:equivalentProperty link between the two
                (eq_triple_ids, cardinality) = self.enquiry(
                    query=(c[0], self.id_equivalentProperty, c[1]),
                    mode="default")

                # if (self.equi_graph_manager.test_equivalent(l_term, r_term)):
                #     print ('There is a owl:sameAs relation in between')
                #     count_sameas += 1
                #     collect_eq.append(c)

                if (cardinality > 0):
                    print('There is an owl:equivalentProperty in between')
                    count_eqProp += 1
                    collect_eq.append(c)

                else:
                    # size-two cycle without an equivalence link: manual decision
                    collect_others.append(c)
                count_others += 1
            else:
                count_bigger += 1
                # collect every consecutive edge of the cycle, wrapping around
                for i in range(len(c)):
                    collect_bigger.append((c[i], c[(i + 1) % len(c)]))

        print('there are ', count1, ' reflexive cycles')

        print('there are ', count_sameas, ' sameAs relations')
        print('there are ', count_eqProp, ' eqProp relations')
        print('there are ', count_others, ' size-two cycles')
        print('there are ', count_bigger, ' bigger cycles')
        # export the collected cycles; the filename matches what load_removed()
        # reads back later
        file_name = 'pre-subP.csv'
        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            # write to file
            # print ('collect self loop: ',collect_self_loop)
            for [s_id] in collect_self_loop:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = s_term
                writer.writerow([s_id, s_term, s_id, o_term, 'remove',
                                 'o'])  # removed from automatic method
            for (s_id, o_id) in collect_eq:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'e'])  # removed from automatic method

            for (s_id, o_id) in collect_others:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                self.print_info(s_term, o_term)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 '2'])  # removed from manual step
                writer.writerow([o_id, o_term, s_id, s_term, 'remove',
                                 '2'])  # removed from manual step

            for (s_id, o_id) in collect_bigger:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # print ('===a longer cycle ===', c)

                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'x'])  # removed from manual step

    def load_removed(self):
        # 'pre-subP.csv'
        subp_file = open('pre-subP.csv', 'r')
        reader = csv.DictReader(subp_file)
        coll_removed = []
        for row in reader:
            s_id = int(row["SUBJECT_ID"])
            # s = row["SUBJECT"]
            o_id = int(row["OBJECT_ID"])
            sug = row["SUGGESTION"]  # should be remove

            if (sug == 'remove'):
                coll_removed.append((s_id, o_id))
        print('number of removed edges:', len(coll_removed))
        self.graph.remove_edges_from(coll_removed)

    def test_cycle(self):
        try:
            c = nx.find_cycle(self.graph)  # change to simple_cycles ??
            print('cycle = ', c)
        except nx.NetworkXNoCycle:
            print('no cycle')

    def export_graph_nt(self, name):
        g = Graph()
        sub_property_of = URIRef("http://www.w3.org/2000/01/rdf-schema#subPropertyOf")
        for (s_id, o_id) in self.graph.edges:
            s_term = self.convert_to_term(s_id)
            o_term = self.convert_to_term(o_id)
            g.add((URIRef(s_term), sub_property_of, URIRef(o_term)))

        # print("--- printing raw triples ---")
        # for s, p, o in g:
        #     print((s, p, o))

        g.serialize(destination=name, format='nt')
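
A usage sketch for SubP, under the same assumptions as for SubM (PATH_LOD and PATH_EQ defined as module constants; the .nt filename below is illustrative):

subp = SubP(path_hdt=PATH_LOD, path_eq=PATH_EQ)
subp.setup_graph()
subp.export_cycle()        # writes the decisions to pre-subP.csv
subp.load_removed()        # replays every 'remove' decision onto the graph
subp.test_cycle()          # ideally reports 'no cycle' afterwards
subp.export_graph_nt('subPropertyOf-cleaned.nt')   # illustrative output name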
Example #7: interactive resolution of size-two subClassOf cycles, one decision per pair
def generate():
    # Q1 : retrieve the subClassOf relations
    visited_pairs = set()  # pairs already presented for a decision
    # hdt_file = None
    # output_filename = None
    # if sys.argv [1] == 'lod':
    hdt_file = HDTDocument(PATH_LOD)
    output_filename = 'lod-two-cycle.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf",
        IdentifierPosition.Predicate)
    count = 0
    count_removed = 0
    count_sameas = 0
    count_left = 0
    count_right = 0
    count_unknown = 0

    cnt_removed = Counter()
    cnt_sameas = Counter()
    cnt_left = Counter()
    cnt_right = Counter()
    cnt_both = Counter()
    cnt_unknown = Counter()

    eq_pair_ids = set()
    eq_pair_terms = []

    eq_file = open('equivalent-unnecessary-relations.csv', 'r')
    reader = csv.DictReader(eq_file)
    for row in reader:
        # cast the ids to int so the membership tests below match convert_term's output
        s_id = int(row["SUBJECT_ID"])
        s = row["SUBJECT"]
        o_id = int(row["OBJECT_ID"])
        o = row["OBJECT"]
        eq_pair_ids.add((s_id, o_id))
        eq_pair_terms.append((s, o))
    eq_file.close()

    with open(output_filename, 'w', newline='') as file:
        (subclass_triples,
         cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow([
            "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
            "DECISION"
        ])

        for (s, p, o) in subclass_triples:
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            if s != o:
                # otherwise, it is a self-loop

                # store it in a csv file
                (reverse_subclass_triples,
                 reverse_cardinality) = hdt_file.search_triples(
                     o, subClassOf, s)
                if reverse_cardinality == 1:  # there is a reverse link back

                    if (s_id, o_id) in eq_pair_ids or (o_id,
                                                       s_id) in eq_pair_ids:
                        print('this is in the equivalence pair, skip it')
                        print(s, '\n', o, '\n\n\n')
                        # Additional: we also make sure it does not appear in the equivalent set
                    else:

                        if (s, o) not in visited_pairs and (o, s) not in visited_pairs:
                            # ask the user to deal with it:
                            print('sbj=\t', s)
                            print('obj=\t', o)
                            s_domain = tldextract.extract(s).domain
                            o_domain = tldextract.extract(o).domain
                            print('s_domain = ', s_domain)
                            print('o_domain = ', o_domain)
                            print_ino(s, hdt_file)  # helper assumed to be defined elsewhere
                            print_ino(o, hdt_file)
                            decision = input()
                            count += 1

                            if decision == 'x':  # if the entry is meaningless, then remove:
                                writer.writerow(
                                    [s_id, s, o_id, o, 'remove', 'x'])
                                writer.writerow(
                                    [o_id, o, s_id, s, 'remove', 'x'])
                                count_removed += 1
                                cnt_removed[s_domain] += 1
                                cnt_removed[o_domain] += 1
                            elif decision == 'l':
                                writer.writerow(
                                    [s_id, s, o_id, o, 'remove', 'l'])
                                writer.writerow(
                                    [o_id, o, s_id, s, 'remain', 'l'])
                                count_left += 1
                                cnt_left[s_domain] += 1
                                cnt_left[o_domain] += 1
                                cnt_both[s_domain] += 1
                                cnt_both[o_domain] += 1
                            elif decision == 'r':
                                writer.writerow(
                                    [s_id, s, o_id, o, 'remain', 'r'])
                                writer.writerow(
                                    [o_id, o, s_id, s, 'remove',
                                     'r'])  # reverse the order
                                count_right += 1
                                cnt_right[s_domain] += 1
                                cnt_right[o_domain] += 1
                                cnt_both[s_domain] += 1
                                cnt_both[o_domain] += 1
                            elif decision == 'e' or decision == 's':  # equivalent class. remove both of them
                                writer.writerow(
                                    [s_id, s, o_id, o, 'remove', 'e'])
                                writer.writerow(
                                    [o_id, o, s_id, s, 'remove', 'e'])
                                count_sameas += 1
                                cnt_sameas[s_domain] += 1
                                cnt_sameas[o_domain] += 1
                            elif decision == 'u':  # unknown, remains to be dealt with automatic approach
                                # count_unknown
                                writer.writerow(
                                    [s_id, s, o_id, o, 'unknown', 'u'])
                                writer.writerow(
                                    [o_id, o, s_id, s, 'unknown', 'u'])
                                count_unknown += 1
                                cnt_unknown[s_domain] += 1
                                cnt_unknown[o_domain] += 1
                            else:
                                print('user input error')

                        visited_pairs.add((s, o))
                        visited_pairs.add((o, s))

                elif reverse_cardinality > 1:
                    print('ERROR: there are multiple rdfs:subClassOf edges: ',
                          reverse_cardinality)
                    print(s, '\t and \t', o)
                    for (s_tmp, p_tmp, o_tmp) in reverse_subclass_triples:
                        print('s = ', s_tmp)
                        print('p = ', p_tmp)
                        print('o = ', o_tmp)

        print('count total pairs = ', count)
        print('count removed = ', count_removed)
        print(cnt_removed)
        print('count left = ', count_left)
        print(cnt_left)
        print('count right = ', count_right)
        print(cnt_right)
        print('===both====')
        print(cnt_both)
        print('===both====')
        print('count equivalent class = ', count_sameas)
        print(cnt_sameas)
        print('count undecided/unknown', count_unknown)
        print(cnt_unknown)
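
The interactive loop accepts single-letter decisions and only logs 'user input error' otherwise. A small validation wrapper (a sketch, not in the original) that re-prompts until a known code is entered:

# decision codes as used by the loop above
VALID_DECISIONS = {
    'x': 'meaningless entry: remove both directions',
    'l': 'remove s -> o, keep o -> s',
    'r': 'keep s -> o, remove o -> s',
    'e': 'equivalent classes: remove both directions',
    's': 'treated the same as e',
    'u': 'unknown: defer to the automatic approach',
}

def ask_decision():
    # re-prompt until one of the known codes is entered
    while True:
        decision = input('decision [x/l/r/e/s/u]: ')
        if decision in VALID_DECISIONS:
            return decision
        print('user input error, expected one of', sorted(VALID_DECISIONS))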
Example #8: HdtExecutor, a cached triple-pattern executor over an HDT document
class HdtExecutor(Executor):
    def __init__(self, hdt_path: Optional[str] = None,
                 graph: Optional[HDTDocument] = None,
                 redis_client: Optional[redis.Redis] = None):
        self.cache = redis_client
        if graph:
            self.graph = graph
        else:
            self.graph = HDTDocument(hdt_path, map=False, progress=True)

    @cached
    def triples(self, subject: Optional[str]='',
                predicate: Optional[str]='',
                object: Optional[str]='')\
            -> Iterable:
        """
        Generator over the triple store
        Returns triples that match the given triple pattern and the count.
        """
        result_iter, count = self.graph.search_triples(subject, predicate, object)
        return list(result_iter), count

    @cached
    def join(self, patterns: List[Tuple[str, str, str]],
             outvar: Optional[str] = None) -> Iterable:
        """
        Joins a list of basic graph patterns and
        returns triples that match multiple triple patterns.
        """
        patterns = self._verify_uris(patterns)
        result_iter = self.graph.search_join(patterns)
        if outvar:
            return [uri for join_set in result_iter for var, uri in join_set if var == outvar]
        else:
            return list(result_iter)

    @cached
    def subjects(self, predicate='', object='') -> Iterable[str]:
        """
        Subjects that occur with the given predicate and object.
        """
        return [s for s, p, o in self.triples(predicate=predicate, object=object)[0]]

    @cached
    def predicates(self, subject='', object='') -> Iterable[str]:
        """
        Predicates that occur with the given subject and object.
        """
        return [p for s, p, o in self.triples(subject=subject, object=object)[0]]

    @cached
    def objects(self, subject='', predicate='') -> Iterable[str]:
        """
        Objects that occur with the given subject and predicate.
        """
        return [o for s, p, o in self.triples(subject=subject, predicate=predicate)[0]]

    @cached
    def subject_predicates(self, object='') -> Iterable[Tuple[str, str]]:
        """
        (subject, predicate) tuples for the given object.
        """
        return [(s, p) for s, p, o in self.triples(object=object)[0]]

    @cached
    def subject_objects(self, predicate='') -> Iterable[Tuple[str, str]]:
        """
        (subject, object) tuples for the given predicate.
        """
        return [(s, o) for s, p, o in self.triples(predicate=predicate)[0]]

    @cached
    def predicate_objects(self, subject='') -> Iterable[Tuple[str, str]]:
        """
        (predicate, object) tuples for the given subject.
        """
        return [(p, o) for s, p, o in self.triples(subject=subject)[0]]

    def _verify_uris(self, pattern: List[Tuple[str, str, str]]) -> List[Tuple[str, str, str]]:
        return [(self._verify_uri(s, IdentifierPosition.Subject), p,
                 self._verify_uri(o, IdentifierPosition.Object)) for s, p, o in pattern]

    def _verify_uri(self, uri: str, position: IdentifierPosition) -> Optional[str]:
        if uri.startswith('?'):
            return uri

        uri = uri.replace("'", "")
        sub_id = self.graph.convert_term(uri, position)
        if not sub_id:
            uri = ascii(uri.encode())[2:-1].replace("\\x", "x")
            sub_id = self.graph.convert_term(uri, position)

        return uri if sub_id else None
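
HdtExecutor depends on a cached decorator that is not part of this excerpt. One plausible shape, purely a sketch and not the original implementation, memoizing method results in the redis client stored on self.cache (and skipping caching when no client is configured):

import pickle
from functools import wraps

def cached(method):
    # memoize a method's result in self.cache (a redis.Redis), keyed by
    # method name and arguments; fall back to a plain call without a client
    @wraps(method)
    def wrapper(self, *args, **kwargs):
        if self.cache is None:
            return method(self, *args, **kwargs)
        key = repr((method.__name__, args, sorted(kwargs.items())))
        hit = self.cache.get(key)
        if hit is not None:
            return pickle.loads(hit)
        result = method(self, *args, **kwargs)
        self.cache.set(key, pickle.dumps(result))
        return result
    return wrapper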
Example #9: an earlier SubM variant that records diagnosed classes and relations in dictionaries
class SubM:

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD):
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            self.subClassOf, IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            self.equivalent, IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph = nx.Graph()
        self.diagnosed_relations = {}
        self.diagnosed_classes = {}
        self.leaf_classes = set()

    def setup_graph(self):
        (subclass_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subClassOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))
        self.graph.add_edges_from(collect_pairs)

    def remove_unnecessary_relations(self):
        for n in self.graph.nodes():
            # test if there is an edge between this node and another node which is also in the Graph
            (eq_triple_ids,
             cardinality) = self.enquiry(query=(n, self.id_equivalentClass, 0),
                                         mode="default")
            for (_, _, m) in eq_triple_ids:
                # test if it is in the Graph
                if m in self.graph.nodes():
                    self.remove_relation(n, m, 'equivalence')
            (eq_triple_ids,
             cardinality) = self.enquiry(query=(0, self.id_equivalentClass, n),
                                         mode="default")
            for (m, _, _) in eq_triple_ids:
                # test if it is in the Graph
                if m in self.graph.nodes():
                    self.remove_relation(m, n, 'equivalence')
        print('total relations diagnosed:', len(self.diagnosed_relations))

    def export_graph(self, export_file=None):
        collect_pairs = self.graph.edges
        if export_file is not None:
            with open(export_file, 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
                for (s_id, o_id) in collect_pairs:
                    s_term = self.convert_to_term(s_id)
                    o_term = self.convert_to_term(o_id)
                    writer.writerow([s_id, s_term, o_id, o_term])

    # see networkx DiGraph.edges:
    # https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.edges.html#networkx.DiGraph.edges

    def remove_all_two_cycles(self):
        to_remove = set()
        for (l, r) in self.graph.edges():
            if (r, l) in self.graph.edges():
                to_remove.add((l, r))
                to_remove.add((r, l))
        print('there are in total', len(to_remove), ' two-cycle edges removed')
        for (l, r) in to_remove:
            self.graph.remove_edge(l, r)

    def load_manual_decisions(self, file, mode="remove"):
        # if mode == 'ignore', we do nothing to those marked as unknown;
        # if mode == 'remove', we remove all those that are unknown
        # (note: mode is not consulted yet in this version)
        count_removed = 0
        two = open(file, newline='')
        reader_two = csv.DictReader(two)
        for row in reader_two:
            s_id = int(row['SUBJECT_ID'])
            o_id = int(row['OBJECT_ID'])
            if (s_id, o_id) not in self.diagnosed_relations.keys():
                if row['SUGGESTION'] == 'remove':
                    self.remove_relation(s_id, o_id, comment='remove')
                    count_removed += 1
                else:
                    self.diagnosed_relations[(s_id, o_id)] = row['SUGGESTION']
        two.close()
        print('there are in total ', count_removed,
              ' relations removed from manual decisions')

    def enquiry(self, query, mode="subm"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # examine the filtered part first
            pass

    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, term_id):
        if term_id == 2601100675:
            # this is the only one that has two different ids (as subject and object)
            return "akt742:Intangible-Thing"
        else:
            return self.hdt.convert_id(term_id, IdentifierPosition.Subject)

    def remove_relation(self, sub, sup, comment='remove'):
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
            self.diagnose_relations(sub, sup, comment)

    def remove_relation_from(self, relation_list, comment='remove'):
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    def diagnose_relations(self, sub, sup, comment='default'):
        self.diagnosed_relations[(sub, sup)] = comment
        # change it to a dictionary?

    def diagnose_class(self, c, comment='default'):
        self.diagnosed_classes[c] = comment

        # TODO: split the cases of removal and comment

    def remove_class(self, c, comment='remove'):
        if self.graph.has_node(c):
            self.graph.remove_node(
                c)  # this also removes all the edges related
            self.diagnosed_classes[c] = comment
        # TODO, also remove the related edges connected

    def remove_class_from(self, cs, comment='remove'):
        for c in cs:
            self.remove_class(c, comment)

    def filter_leaf_classes(self):
        count = len(self.diagnosed_classes)

        for c in self.graph.nodes:
            #test if this node is a leaf
            (_, cardi) = self.enquiry(query=(0, self.id_subClassOf, c),
                                      mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        for c in self.leaf_classes:
            self.remove_class(c)
        print('there are a total of',
              len(self.diagnosed_classes) - count, 'leaf nodes removed')

    def get_domain_from_id(self, term_id):
        t = self.convert_to_term(term_id)
        return tldextract.extract(t).domain

    def filter_domain_classes(self, domain):
        filtered = set()
        for c in self.graph.nodes:
            t = self.convert_to_term(c)
            if (domain == tldextract.extract(t).domain):
                filtered.add(c)
        print('a total of ', len(filtered), ' removed w.r.t. domain ', domain)
        self.remove_class_from(list(filtered))

    def filter_reflexsive(self):
        to_remove = set()
        for e in self.graph.edges():
            (l, r) = e
            if l == r:
                to_remove.add(e)
        print('removed reflexive relations', len(to_remove))
        self.graph.remove_edges_from(list(to_remove))

    def print_cycles(self):
        flag = True
        while flag:
            try:
                cycle = nx.find_cycle(self.graph)
                print('find cycle', cycle)
                (l, r) = cycle[0]
                print(self.get_domain_from_id(l))
                self.graph.remove_edges_from(cycle)
            except nx.NetworkXNoCycle:
                print('no more cycles')
                flag = False
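
A usage sketch tying this SubM variant together; PATH_LOD is assumed as before, the manual-decision file name is taken from Example #2, and the export file name is illustrative:

subm = SubM(PATH_LOD)
subm.setup_graph()
subm.filter_reflexsive()
subm.filter_leaf_classes()
subm.remove_all_two_cycles()
subm.load_manual_decisions('lod-two-cycle.csv')   # file name as in Example #2
subm.print_cycles()          # drains the remaining cycles one at a time
subm.export_graph('reduced_subClassOf.csv')       # illustrative output name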