예제 #1
0
    def __init__(self,
                 kg1: KG,
                 kg2: KG,
                 train_links,
                 test_links,
                 valid_links=None,
                 mode='mapping',
                 ordered=True,
                 extra_entities_percentage_valid=0.0):
        """Index two URI-based KGs with integer ids and split their entities.

        The original graphs and links are kept under the ``uri_*`` attributes.
        Both graphs are then rebuilt on ids, the alignment links are converted
        to id pairs, and the entities that appear in no link (the "extra"
        entities) are partitioned between validation and test.

        Args:
            kg1: First knowledge graph, expressed with URIs.
            kg2: Second knowledge graph, expressed with URIs.
            train_links: Training alignment links; element 0 of each link is
                looked up in the ids of ``kg1`` and element 1 in those of
                ``kg2``.
            test_links: Test alignment links, same layout as ``train_links``.
            valid_links: Optional validation links, same layout. When omitted
                all validation attributes are empty lists.
            mode: ``"sharing"`` builds the ids with ``generate_sharing_id``
                (entities are seeded with ``train_links``); every other value
                uses ``generate_mapping_id``. ``"swapping"`` additionally adds
                supplementary relation and attribute triples derived from the
                training links.
            ordered: Forwarded unchanged to the id generators.
            extra_entities_percentage_valid: Fraction of the extra entities of
                each graph that is sampled into the validation split; the
                remaining extra entities form the test split.

        Raises:
            ValueError: Propagated from ``random.sample`` when the requested
                sample size is negative or larger than the number of extra
                entities.
        """
        # Original author's note: BootEA uses "swapping" (swap entities to
        # generate extra triples), RDGCN uses "mapping".
        if mode == "sharing":
            ent_ids1, ent_ids2 = generate_sharing_id(
                train_links,
                kg1.relation_triples_set, kg1.entities_set,
                kg2.relation_triples_set, kg2.entities_set,
                ordered=ordered)
            # Relations and attributes have no known links, hence the empty
            # link list.
            rel_ids1, rel_ids2 = generate_sharing_id(
                [],
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=ordered)
            attr_ids1, attr_ids2 = generate_sharing_id(
                [],
                kg1.attribute_triples_set, kg1.attributes_set,
                kg2.attribute_triples_set, kg2.attributes_set,
                ordered=ordered)
        else:
            # Generate an id for each entity, relation and attribute (per the
            # original note, the same element gets different ids in the two
            # KGs).
            ent_ids1, ent_ids2 = generate_mapping_id(
                kg1.relation_triples_set, kg1.entities_set,
                kg2.relation_triples_set, kg2.entities_set,
                ordered=ordered)
            rel_ids1, rel_ids2 = generate_mapping_id(
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=ordered)
            attr_ids1, attr_ids2 = generate_mapping_id(
                kg1.attribute_triples_set, kg1.attributes_set,
                kg2.attribute_triples_set, kg2.attributes_set,
                ordered=ordered)

        # Relation triples become (id_ent, id_rel, id_ent).
        id_relation_triples1 = uris_relation_triple_2ids(
            kg1.relation_triples_set, ent_ids1, rel_ids1)
        id_relation_triples2 = uris_relation_triple_2ids(
            kg2.relation_triples_set, ent_ids2, rel_ids2)

        # Attribute triples become (id_ent, id_prop, literal).
        id_attribute_triples1 = uris_attribute_triple_2ids(
            kg1.attribute_triples_set, ent_ids1, attr_ids1)
        id_attribute_triples2 = uris_attribute_triple_2ids(
            kg2.attribute_triples_set, ent_ids2, attr_ids2)

        # Keep the URI graphs before the local names are rebound below.
        self.uri_kg1 = kg1
        self.uri_kg2 = kg2

        # Rebuild the graphs on ids and attach the dictionaries built above.
        kg1 = KG(id_relation_triples1, id_attribute_triples1)
        kg2 = KG(id_relation_triples2, id_attribute_triples2)
        kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
        kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

        self.uri_train_links = train_links
        self.uri_test_links = test_links
        # Convert every link to (id_ent1, id_ent2).
        self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1,
                                          ent_ids2)
        self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1,
                                         ent_ids2)
        # TODO: the entities used for testing will always come from the
        # ground truth only.
        self.train_entities1 = [pair[0] for pair in self.train_links]
        self.train_entities2 = [pair[1] for pair in self.train_links]
        self.test_entities1 = [pair[0] for pair in self.test_links]
        self.test_entities2 = [pair[1] for pair in self.test_links]

        if mode == 'swapping':
            # Supplementary relation triples generated from the training
            # links (see generate_sup_relation_triples for details).
            sup_triples1, sup_triples2 = generate_sup_relation_triples(
                self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict,
                kg2.hr_dict)
            kg1.add_sup_relation_triples(sup_triples1)
            kg2.add_sup_relation_triples(sup_triples2)

            # Per the original note: add all literals of KG1 to the
            # corresponding entity in KG2 and vice versa.
            sup_triples1, sup_triples2 = generate_sup_attribute_triples(
                self.train_links, kg1.av_dict, kg2.av_dict)
            kg1.add_sup_attribute_triples(sup_triples1)
            kg2.add_sup_attribute_triples(sup_triples2)

        self.kg1 = kg1
        self.kg2 = kg2

        # Every validation attribute, including uri_valid_links, is always
        # defined so that later reads never hit a missing attribute when no
        # validation links were supplied.
        self.uri_valid_links = list()
        self.valid_links = list()
        self.valid_entities1 = list()
        self.valid_entities2 = list()
        if valid_links is not None:
            self.uri_valid_links = valid_links
            self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1,
                                              ent_ids2)
            self.valid_entities1 = [pair[0] for pair in self.valid_links]
            self.valid_entities2 = [pair[1] for pair in self.valid_links]

        self.useful_entities_list1 = (self.train_entities1 +
                                      self.valid_entities1 +
                                      self.test_entities1)
        self.useful_entities_list2 = (self.train_entities2 +
                                      self.valid_entities2 +
                                      self.test_entities2)

        # Entities of each graph that appear in none of the links.
        self.extra_entities1 = list(self.kg1.entities_set -
                                    set(self.useful_entities_list1))
        self.extra_entities2 = list(self.kg2.entities_set -
                                    set(self.useful_entities_list2))

        # Sample the validation share first for kg1, then for kg2; the call
        # order matters for reproducibility under a fixed random seed.
        valid_count1 = int(
            len(self.extra_entities1) * extra_entities_percentage_valid)
        self.extra_entities_valid1 = random.sample(self.extra_entities1,
                                                   valid_count1)
        valid_count2 = int(
            len(self.extra_entities2) * extra_entities_percentage_valid)
        self.extra_entities_valid2 = random.sample(self.extra_entities2,
                                                   valid_count2)

        # Whatever was not sampled for validation is used for testing.
        self.extra_entities_test1 = list(
            set(self.extra_entities1) - set(self.extra_entities_valid1))
        self.extra_entities_test2 = list(
            set(self.extra_entities2) - set(self.extra_entities_valid2))

        # Sizes of the joint id vocabularies of both graphs.
        self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
        self.relations_num = len(self.kg1.relations_set
                                 | self.kg2.relations_set)
        self.attributes_num = len(self.kg1.attributes_set
                                  | self.kg2.attributes_set)
예제 #2
0
    def __init__(self,
                 kg1: KG,
                 kg2: KG,
                 train_links,
                 test_links,
                 valid_links=None,
                 mode='mapping',
                 ordered=True):
        """Index two URI-based KGs with integer ids and convert their links.

        The original graphs and links are kept under the ``uri_*`` attributes;
        ``kg1`` / ``kg2`` and the ``*_links`` attributes hold the id-based
        counterparts.

        Args:
            kg1: First knowledge graph, expressed with URIs.
            kg2: Second knowledge graph, expressed with URIs.
            train_links: Training alignment links; element 0 of each link is
                looked up in the ids of ``kg1`` and element 1 in those of
                ``kg2``.
            test_links: Test alignment links, same layout as ``train_links``.
            valid_links: Optional validation links, same layout. When omitted
                all validation attributes are empty lists.
            mode: ``"sharing"`` builds the ids with ``generate_sharing_id``
                (entities are seeded with ``train_links``); every other value
                uses ``generate_mapping_id``. ``"swapping"`` additionally adds
                supplementary relation and attribute triples derived from the
                training links.
            ordered: Forwarded unchanged to the id generators.
        """
        if mode == "sharing":
            ent_ids1, ent_ids2 = generate_sharing_id(
                train_links,
                kg1.relation_triples_set, kg1.entities_set,
                kg2.relation_triples_set, kg2.entities_set,
                ordered=ordered)
            # Relations and attributes have no known links, hence the empty
            # link list.
            rel_ids1, rel_ids2 = generate_sharing_id(
                [],
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=ordered)
            attr_ids1, attr_ids2 = generate_sharing_id(
                [],
                kg1.attribute_triples_set, kg1.attributes_set,
                kg2.attribute_triples_set, kg2.attributes_set,
                ordered=ordered)
        else:
            ent_ids1, ent_ids2 = generate_mapping_id(
                kg1.relation_triples_set, kg1.entities_set,
                kg2.relation_triples_set, kg2.entities_set,
                ordered=ordered)
            rel_ids1, rel_ids2 = generate_mapping_id(
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=ordered)
            attr_ids1, attr_ids2 = generate_mapping_id(
                kg1.attribute_triples_set, kg1.attributes_set,
                kg2.attribute_triples_set, kg2.attributes_set,
                ordered=ordered)

        # Translate the URI triples of both graphs into id triples.
        id_relation_triples1 = uris_relation_triple_2ids(
            kg1.relation_triples_set, ent_ids1, rel_ids1)
        id_relation_triples2 = uris_relation_triple_2ids(
            kg2.relation_triples_set, ent_ids2, rel_ids2)

        id_attribute_triples1 = uris_attribute_triple_2ids(
            kg1.attribute_triples_set, ent_ids1, attr_ids1)
        id_attribute_triples2 = uris_attribute_triple_2ids(
            kg2.attribute_triples_set, ent_ids2, attr_ids2)

        # Keep the URI graphs before the local names are rebound below.
        self.uri_kg1 = kg1
        self.uri_kg2 = kg2

        # Rebuild the graphs on ids and attach the id dictionaries.
        kg1 = KG(id_relation_triples1, id_attribute_triples1)
        kg2 = KG(id_relation_triples2, id_attribute_triples2)
        kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
        kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

        self.uri_train_links = train_links
        self.uri_test_links = test_links
        # Convert every link to an (id_ent1, id_ent2) pair.
        self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1,
                                          ent_ids2)
        self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1,
                                         ent_ids2)
        self.train_entities1 = [pair[0] for pair in self.train_links]
        self.train_entities2 = [pair[1] for pair in self.train_links]
        self.test_entities1 = [pair[0] for pair in self.test_links]
        self.test_entities2 = [pair[1] for pair in self.test_links]

        if mode == 'swapping':
            # Supplementary relation triples derived from the training links.
            sup_triples1, sup_triples2 = generate_sup_relation_triples(
                self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict,
                kg2.hr_dict)
            kg1.add_sup_relation_triples(sup_triples1)
            kg2.add_sup_relation_triples(sup_triples2)

            # Supplementary attribute triples derived from the training links.
            sup_triples1, sup_triples2 = generate_sup_attribute_triples(
                self.train_links, kg1.av_dict, kg2.av_dict)
            kg1.add_sup_attribute_triples(sup_triples1)
            kg2.add_sup_attribute_triples(sup_triples2)

        self.kg1 = kg1
        self.kg2 = kg2

        # Every validation attribute, including uri_valid_links, is always
        # defined so that later reads never hit a missing attribute when no
        # validation links were supplied.
        self.uri_valid_links = list()
        self.valid_links = list()
        self.valid_entities1 = list()
        self.valid_entities2 = list()
        if valid_links is not None:
            self.uri_valid_links = valid_links
            self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1,
                                              ent_ids2)
            self.valid_entities1 = [pair[0] for pair in self.valid_links]
            self.valid_entities2 = [pair[1] for pair in self.valid_links]

        # All entities that take part in some train/valid/test link.
        self.useful_entities_list1 = (self.train_entities1 +
                                      self.valid_entities1 +
                                      self.test_entities1)
        self.useful_entities_list2 = (self.train_entities2 +
                                      self.valid_entities2 +
                                      self.test_entities2)

        # Sizes of the joint id vocabularies of both graphs.
        self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
        self.relations_num = len(self.kg1.relations_set
                                 | self.kg2.relations_set)
        self.attributes_num = len(self.kg1.attributes_set
                                  | self.kg2.attributes_set)
예제 #3
0
    def __init__(self,
                 kg1: KG,
                 kg2: KG,
                 train_links,
                 test_links,
                 valid_links=None,
                 rel_links=None,
                 kg_test=None,
                 kg_valid=None,
                 mode='mapping',
                 ordered=True,
                 training_data_folder=None,
                 train_kg='kg12'):
        """Index two training KGs plus held-out test/validation KGs with ids.

        The original graphs and links are kept under the ``uri_*`` attributes;
        ``kg1``, ``kg2``, ``kg_test``, ``kg_valid`` and the ``*_links``
        attributes hold the id-based counterparts.

        Args:
            kg1: First knowledge graph, expressed with URIs.
            kg2: Second knowledge graph, expressed with URIs.
            train_links: Training alignment links; element 0 of each link is
                looked up in the ids of ``kg1`` and element 1 in those of
                ``kg2``.
            test_links: Test alignment links, same layout as ``train_links``.
            valid_links: Optional validation links, same layout. When omitted
                all validation-link attributes are empty lists.
            rel_links: Relation links handed to ``generate_sharing_id`` in
                ``"sharing"`` mode; ``None`` is treated as "no links".
            kg_test: Held-out test graph (URIs). Required despite the ``None``
                default.
            kg_valid: Held-out validation graph (URIs). Required despite the
                ``None`` default.
            mode: ``"sharing"`` builds the ids with ``generate_sharing_id``;
                every other value uses ``generate_mapping_id``.
                ``"swapping"`` additionally adds supplementary relation
                triples derived from the training links.
            ordered: Forwarded to the entity id generator only; relation ids
                are always generated with ``ordered=False``.
            training_data_folder: Not used by this constructor.
            train_kg: ``"kg1"``, ``"kg2"`` or ``"kg12"``. ``"kg2"`` encodes
                the test/validation graphs with the ids of ``kg2``; the other
                two values use the ids of ``kg1``. It also selects which
                graphs are counted in ``entities_num`` / ``relations_num``.

        Raises:
            ValueError: If ``kg_test`` or ``kg_valid`` is missing, or if
                ``train_kg`` is not one of the supported values.
        """
        # Both held-out graphs are dereferenced unconditionally below, so fail
        # early with a clear message instead of an AttributeError on None.
        if kg_test is None or kg_valid is None:
            raise ValueError("kg_test and kg_valid must both be provided")
        # An unsupported value would leave entities_num / relations_num
        # unassigned and only fail at the final summary print.
        if train_kg not in ("kg1", "kg2", "kg12"):
            raise ValueError(
                "train_kg must be 'kg1', 'kg2' or 'kg12', got {!r}".format(
                    train_kg))

        # The held-out triples and entities take part in entity id generation
        # for both sides.
        held_out_triples = (kg_test.relation_triples_set
                            | kg_valid.relation_triples_set)
        held_out_entities = kg_test.entities_set | kg_valid.entities_set

        if mode == "sharing":
            ent_ids1, ent_ids2 = generate_sharing_id(
                train_links,
                kg1.relation_triples_set | held_out_triples,
                kg1.entities_set | held_out_entities,
                kg2.relation_triples_set | held_out_triples,
                kg2.entities_set | held_out_entities,
                ordered=ordered)
            # An empty list stands for "no relation links", presumably what
            # generate_sharing_id expects when there is nothing to share.
            rel_ids1, rel_ids2 = generate_sharing_id(
                [] if rel_links is None else rel_links,
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=False)
            print("Relations and ids: {} {} ".format(rel_ids1, rel_ids2))
        else:
            ent_ids1, ent_ids2 = generate_mapping_id(
                kg1.relation_triples_set | held_out_triples,
                kg1.entities_set | held_out_entities,
                kg2.relation_triples_set | held_out_triples,
                kg2.entities_set | held_out_entities,
                ordered=ordered)
            rel_ids1, rel_ids2 = generate_mapping_id(
                kg1.relation_triples_set, kg1.relations_set,
                kg2.relation_triples_set, kg2.relations_set,
                ordered=False)

        id_relation_triples1 = uris_relation_triple_2ids(
            kg1.relation_triples_set, ent_ids1, rel_ids1)
        id_relation_triples2 = uris_relation_triple_2ids(
            kg2.relation_triples_set, ent_ids2, rel_ids2)

        # The held-out graphs are encoded with the dictionaries of kg2 when
        # training on kg2, and with those of kg1 otherwise ("kg1" or "kg12").
        use_kg2_ids = train_kg == "kg2"
        eval_ent_ids = ent_ids2 if use_kg2_ids else ent_ids1
        eval_rel_ids = rel_ids2 if use_kg2_ids else rel_ids1
        id_relation_triples_test = uris_relation_triple_2ids(
            kg_test.relation_triples_set, eval_ent_ids, eval_rel_ids)
        id_relation_triples_valid = uris_relation_triple_2ids(
            kg_valid.relation_triples_set, eval_ent_ids, eval_rel_ids)

        # Keep the URI graphs before the local names are rebound below.
        self.uri_kg1 = kg1
        self.uri_kg2 = kg2
        self.uri_kg_test = kg_test
        self.uri_kg_valid = kg_valid

        # Build KG instances with ids.
        kg1 = KG(id_relation_triples1)
        kg2 = KG(id_relation_triples2)
        kg_test = KG(id_relation_triples_test)
        kg_valid = KG(id_relation_triples_valid)
        kg1.set_id_dict(ent_ids1, rel_ids1)
        kg2.set_id_dict(ent_ids2, rel_ids2)
        kg_test.set_id_dict(eval_ent_ids, eval_rel_ids)
        kg_valid.set_id_dict(eval_ent_ids, eval_rel_ids)

        # Reverse relation dictionary (id -> relation) of the selected side.
        self.id2rel = {rel_id: rel for rel, rel_id in eval_rel_ids.items()}
        reference_kg = kg2 if use_kg2_ids else kg1
        reference_kg.set_rel_classes(self.id2rel)
        self.rel_classes = reference_kg.rel_classes
        kg_valid.set_rel_classes(self.id2rel)
        kg_test.set_rel_classes(self.id2rel)

        # For filtering rank.
        # NOTE(review): the triples of kg1 are used here even when
        # train_kg == "kg2" -- confirm this is intended.
        self.set_multi_entities_dict(id_relation_triples1 +
                                     id_relation_triples_valid +
                                     id_relation_triples_test)
        # hr_to_multi_t / tr_to_multi_h are presumably populated by
        # set_multi_entities_dict above.
        kg_valid.set_local_multi_entities_dict(self.hr_to_multi_t,
                                               self.tr_to_multi_h)
        kg_test.set_local_multi_entities_dict(self.hr_to_multi_t,
                                              self.tr_to_multi_h)

        # For alignment evaluation: convert links to (id_ent1, id_ent2) pairs.
        self.uri_train_links = train_links
        self.uri_test_links = test_links
        self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1,
                                          ent_ids2)
        self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1,
                                         ent_ids2)
        self.train_entities1 = [pair[0] for pair in self.train_links]
        self.train_entities2 = [pair[1] for pair in self.train_links]
        self.test_entities1 = [pair[0] for pair in self.test_links]
        self.test_entities2 = [pair[1] for pair in self.test_links]

        # Every validation-link attribute, including uri_valid_links, is
        # always defined so that later reads never hit a missing attribute
        # when no validation links were supplied.
        self.uri_valid_links = list()
        self.valid_links = list()
        self.valid_entities1 = list()
        self.valid_entities2 = list()
        if valid_links is not None:
            self.uri_valid_links = valid_links
            self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1,
                                              ent_ids2)
            self.valid_entities1 = [pair[0] for pair in self.valid_links]
            self.valid_entities2 = [pair[1] for pair in self.valid_links]

        if mode == 'swapping':
            # Supplementary relation triples derived from the training links.
            sup_triples1, sup_triples2 = generate_sup_relation_triples(
                self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict,
                kg2.hr_dict)
            kg1.add_sup_relation_triples(sup_triples1)
            kg2.add_sup_relation_triples(sup_triples2)

        self.kg1 = kg1
        self.kg2 = kg2
        self.kg_test = kg_test
        self.kg_valid = kg_valid

        # All entities that take part in some train/valid/test link.
        self.useful_entities_list1 = (self.train_entities1 +
                                      self.valid_entities1 +
                                      self.test_entities1)
        self.useful_entities_list2 = (self.train_entities2 +
                                      self.valid_entities2 +
                                      self.test_entities2)

        # Vocabulary sizes over the training graph(s) selected by train_kg
        # plus the two held-out graphs.
        if train_kg == "kg1":
            counted_kgs = (self.kg1, self.kg_test, self.kg_valid)
        elif train_kg == "kg2":
            counted_kgs = (self.kg2, self.kg_test, self.kg_valid)
        else:  # "kg12", guaranteed by the validation at the top
            counted_kgs = (self.kg1, self.kg2, self.kg_test, self.kg_valid)
        self.entities_num = len(
            set().union(*(graph.entities_set for graph in counted_kgs)))
        self.relations_num = len(
            set().union(*(graph.relations_set for graph in counted_kgs)))

        print(
            "All entities_num: {}, kg1: {}, kg2: {}, kg_test: {}, kg_valid: {}"
            .format(self.entities_num, len(self.kg1.entities_set),
                    len(self.kg2.entities_set), len(self.kg_test.entities_set),
                    len(self.kg_valid.entities_set)))