def __init__(self, kg1: KG, kg2: KG, train_links, test_links, valid_links=None,
             mode='mapping', ordered=True):
    """Index two URI-based KGs with ids and convert the alignment links to id pairs.

    Args:
        kg1: First knowledge graph, expressed with URIs. Kept as ``self.uri_kg1``.
        kg2: Second knowledge graph, expressed with URIs. Kept as ``self.uri_kg2``.
        train_links: Training links between the two KGs, in the form accepted
            by ``uris_pair_2ids`` (each converted link is indexed as a pair).
        test_links: Test links, same form as ``train_links``.
        valid_links: Optional validation links, same form as ``train_links``.
        mode: ``"sharing"`` builds the ids with ``generate_sharing_id`` (the
            training links are passed in for the entity ids); any other value
            builds them with ``generate_mapping_id``. ``'swapping'``
            additionally adds supplementary relation and attribute triples,
            generated from the training links, to both id-based KGs.
        ordered: Forwarded unchanged to the id generators.

    After construction ``self.kg1`` / ``self.kg2`` are the id-based KGs,
    ``self.*_links`` hold id pairs, ``self.*_entities1/2`` hold the first /
    second element of every link, and ``self.uri_*`` keep the original
    URI-based inputs.
    """
    if mode == "sharing":
        ent_ids1, ent_ids2 = generate_sharing_id(
            train_links, kg1.relation_triples_set, kg1.entities_set,
            kg2.relation_triples_set, kg2.entities_set, ordered=ordered)
        rel_ids1, rel_ids2 = generate_sharing_id(
            [], kg1.relation_triples_set, kg1.relations_set,
            kg2.relation_triples_set, kg2.relations_set, ordered=ordered)
        attr_ids1, attr_ids2 = generate_sharing_id(
            [], kg1.attribute_triples_set, kg1.attributes_set,
            kg2.attribute_triples_set, kg2.attributes_set, ordered=ordered)
    else:
        ent_ids1, ent_ids2 = generate_mapping_id(
            kg1.relation_triples_set, kg1.entities_set,
            kg2.relation_triples_set, kg2.entities_set, ordered=ordered)
        rel_ids1, rel_ids2 = generate_mapping_id(
            kg1.relation_triples_set, kg1.relations_set,
            kg2.relation_triples_set, kg2.relations_set, ordered=ordered)
        attr_ids1, attr_ids2 = generate_mapping_id(
            kg1.attribute_triples_set, kg1.attributes_set,
            kg2.attribute_triples_set, kg2.attributes_set, ordered=ordered)

    # Re-express every triple with the ids generated above.
    id_relation_triples1 = uris_relation_triple_2ids(kg1.relation_triples_set, ent_ids1, rel_ids1)
    id_relation_triples2 = uris_relation_triple_2ids(kg2.relation_triples_set, ent_ids2, rel_ids2)
    id_attribute_triples1 = uris_attribute_triple_2ids(kg1.attribute_triples_set, ent_ids1, attr_ids1)
    id_attribute_triples2 = uris_attribute_triple_2ids(kg2.attribute_triples_set, ent_ids2, attr_ids2)

    # Keep the URI-based graphs before the local names are rebound to the
    # id-based ones.
    self.uri_kg1 = kg1
    self.uri_kg2 = kg2

    kg1 = KG(id_relation_triples1, id_attribute_triples1)
    kg2 = KG(id_relation_triples2, id_attribute_triples2)
    kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
    kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

    self.uri_train_links = train_links
    self.uri_test_links = test_links
    self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1, ent_ids2)
    self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1, ent_ids2)
    self.train_entities1 = [link[0] for link in self.train_links]
    self.train_entities2 = [link[1] for link in self.train_links]
    self.test_entities1 = [link[0] for link in self.test_links]
    self.test_entities2 = [link[1] for link in self.test_links]

    if mode == 'swapping':
        # Supplementary triples are derived from the training links and the
        # id-based KGs, then added to both graphs.
        sup_triples1, sup_triples2 = generate_sup_relation_triples(
            self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict, kg2.hr_dict)
        kg1.add_sup_relation_triples(sup_triples1)
        kg2.add_sup_relation_triples(sup_triples2)
        sup_triples1, sup_triples2 = generate_sup_attribute_triples(
            self.train_links, kg1.av_dict, kg2.av_dict)
        kg1.add_sup_attribute_triples(sup_triples1)
        kg2.add_sup_attribute_triples(sup_triples2)

    self.kg1 = kg1
    self.kg2 = kg2

    # Always define uri_valid_links (empty when no validation links were
    # given) so that reading it never raises AttributeError, matching the
    # defaults of the other valid_* attributes.
    self.uri_valid_links = list() if valid_links is None else valid_links
    self.valid_links = list()
    self.valid_entities1 = list()
    self.valid_entities2 = list()
    if valid_links is not None:
        self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1, ent_ids2)
        self.valid_entities1 = [link[0] for link in self.valid_links]
        self.valid_entities2 = [link[1] for link in self.valid_links]

    # Entities that take part in any link, in train / valid / test order.
    self.useful_entities_list1 = self.train_entities1 + self.valid_entities1 + self.test_entities1
    self.useful_entities_list2 = self.train_entities2 + self.valid_entities2 + self.test_entities2

    # Sizes are taken over the union of both id-based KGs.
    self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
    self.relations_num = len(self.kg1.relations_set | self.kg2.relations_set)
    self.attributes_num = len(self.kg1.attributes_set | self.kg2.attributes_set)
def __init__(self, kg1: KG, kg2: KG, train_links, test_links, valid_links=None,
             mode='mapping', ordered=True, extra_entities_percentage_valid=0.0):
    """Index two URI-based KGs with ids, convert the links and split the extra entities.

    Args:
        kg1: First knowledge graph, expressed with URIs. Kept as ``self.uri_kg1``.
        kg2: Second knowledge graph, expressed with URIs. Kept as ``self.uri_kg2``.
        train_links: Training links between the two KGs, in the form accepted
            by ``uris_pair_2ids`` (each converted link is indexed as a pair).
        test_links: Test links, same form as ``train_links``.
        valid_links: Optional validation links, same form as ``train_links``.
        mode: ``"sharing"`` builds the ids with ``generate_sharing_id`` (the
            training links are passed in for the entity ids); any other value
            builds them with ``generate_mapping_id``. ``'swapping'``
            additionally adds supplementary relation and attribute triples,
            generated from the training links, to both id-based KGs.
        ordered: Forwarded unchanged to the id generators.
        extra_entities_percentage_valid: Fraction of the "extra" entities
            (entities of a KG that appear in no train / valid / test link)
            that is randomly sampled for validation; the remaining extra
            entities are assigned to test.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed
            sample size is negative or larger than the number of extra
            entities.

    After construction ``self.kg1`` / ``self.kg2`` are the id-based KGs,
    ``self.*_links`` hold id pairs, ``self.*_entities1/2`` hold the first /
    second element of every link, and ``self.uri_*`` keep the original
    URI-based inputs.
    """
    # Original author's note: BootEA uses 'swapping' (swap entities to
    # generate extra triples), RDGCN uses 'mapping'.
    if mode == "sharing":
        ent_ids1, ent_ids2 = generate_sharing_id(
            train_links, kg1.relation_triples_set, kg1.entities_set,
            kg2.relation_triples_set, kg2.entities_set, ordered=ordered)
        rel_ids1, rel_ids2 = generate_sharing_id(
            [], kg1.relation_triples_set, kg1.relations_set,
            kg2.relation_triples_set, kg2.relations_set, ordered=ordered)
        attr_ids1, attr_ids2 = generate_sharing_id(
            [], kg1.attribute_triples_set, kg1.attributes_set,
            kg2.attribute_triples_set, kg2.attributes_set, ordered=ordered)
    else:
        # Generate an id for each entity, relation and attribute.
        # NOTE(review): presumably the same element gets different ids in the
        # two KGs - confirm in generate_mapping_id.
        ent_ids1, ent_ids2 = generate_mapping_id(
            kg1.relation_triples_set, kg1.entities_set,
            kg2.relation_triples_set, kg2.entities_set, ordered=ordered)
        rel_ids1, rel_ids2 = generate_mapping_id(
            kg1.relation_triples_set, kg1.relations_set,
            kg2.relation_triples_set, kg2.relations_set, ordered=ordered)
        attr_ids1, attr_ids2 = generate_mapping_id(
            kg1.attribute_triples_set, kg1.attributes_set,
            kg2.attribute_triples_set, kg2.attributes_set, ordered=ordered)

    # Re-express the relation triples with entity and relation ids.
    id_relation_triples1 = uris_relation_triple_2ids(kg1.relation_triples_set, ent_ids1, rel_ids1)
    id_relation_triples2 = uris_relation_triple_2ids(kg2.relation_triples_set, ent_ids2, rel_ids2)
    # Re-express the attribute triples with entity and attribute ids.
    id_attribute_triples1 = uris_attribute_triple_2ids(kg1.attribute_triples_set, ent_ids1, attr_ids1)
    id_attribute_triples2 = uris_attribute_triple_2ids(kg2.attribute_triples_set, ent_ids2, attr_ids2)

    # Keep the URI-based graphs before the local names are rebound to the
    # id-based ones.
    self.uri_kg1 = kg1
    self.uri_kg2 = kg2

    # Rebuild the KGs from the id triples and attach the id dictionaries.
    kg1 = KG(id_relation_triples1, id_attribute_triples1)
    kg2 = KG(id_relation_triples2, id_attribute_triples2)
    kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
    kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

    self.uri_train_links = train_links
    self.uri_test_links = test_links
    # Convert each link to a pair of entity ids.
    self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1, ent_ids2)
    self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1, ent_ids2)
    # TODO: here the entities used for testing will always come from the truth only...
    self.train_entities1 = [link[0] for link in self.train_links]
    self.train_entities2 = [link[1] for link in self.train_links]
    self.test_entities1 = [link[0] for link in self.test_links]
    self.test_entities2 = [link[1] for link in self.test_links]

    if mode == 'swapping':
        # Generate supplementary relation triples from the training links
        # (see generate_sup_relation_triples) and add them to both KGs.
        sup_triples1, sup_triples2 = generate_sup_relation_triples(
            self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict, kg2.hr_dict)
        kg1.add_sup_relation_triples(sup_triples1)
        kg2.add_sup_relation_triples(sup_triples2)
        # Same for attribute triples (see generate_sup_attribute_triples).
        sup_triples1, sup_triples2 = generate_sup_attribute_triples(
            self.train_links, kg1.av_dict, kg2.av_dict)
        kg1.add_sup_attribute_triples(sup_triples1)
        kg2.add_sup_attribute_triples(sup_triples2)

    self.kg1 = kg1
    self.kg2 = kg2

    # Always define uri_valid_links (empty when no validation links were
    # given) so that reading it never raises AttributeError, matching the
    # defaults of the other valid_* attributes.
    self.uri_valid_links = list() if valid_links is None else valid_links
    self.valid_links = list()
    self.valid_entities1 = list()
    self.valid_entities2 = list()
    # Save the validation links and entities, converted to ids.
    if valid_links is not None:
        self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1, ent_ids2)
        self.valid_entities1 = [link[0] for link in self.valid_links]
        self.valid_entities2 = [link[1] for link in self.valid_links]

    # Entities that take part in any link, in train / valid / test order.
    self.useful_entities_list1 = self.train_entities1 + self.valid_entities1 + self.test_entities1
    self.useful_entities_list2 = self.train_entities2 + self.valid_entities2 + self.test_entities2

    # Entities of each KG that appear in no link at all.
    self.extra_entities1 = list(self.kg1.entities_set - set(self.useful_entities_list1))
    self.extra_entities2 = list(self.kg2.entities_set - set(self.useful_entities_list2))
    # Randomly reserve the requested fraction of them for validation
    # (the sample size is truncated towards zero by int()).
    self.extra_entities_valid1 = random.sample(
        self.extra_entities1, int(len(self.extra_entities1) * extra_entities_percentage_valid))
    self.extra_entities_valid2 = random.sample(
        self.extra_entities2, int(len(self.extra_entities2) * extra_entities_percentage_valid))
    # Whatever was not sampled for validation is used for test.
    self.extra_entities_test1 = list(set(self.extra_entities1) - set(self.extra_entities_valid1))
    self.extra_entities_test2 = list(set(self.extra_entities2) - set(self.extra_entities_valid2))

    # Sizes are taken over the union of both id-based KGs.
    self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
    self.relations_num = len(self.kg1.relations_set | self.kg2.relations_set)
    self.attributes_num = len(self.kg1.attributes_set | self.kg2.attributes_set)