def find_new_alignment_rel(self):
    """Mine new likely-aligned entity pairs from the relational embeddings.

    Looks up the embeddings of the still-unaligned validation/test entities
    of both KGs, projects KG1 through the mapping matrix, greedily extracts
    high-similarity pairs, and merges them into ``self.new_alignment_index``
    and ``self.new_alignment``.

    Returns:
        bool: True when this round produced nothing beyond what earlier
        rounds already found (a convergence signal for self-training),
        otherwise False.
    """
    start = time.time()
    candidates1 = self.kgs.valid_entities1 + self.kgs.test_entities1
    candidates2 = self.kgs.valid_entities2 + self.kgs.test_entities2
    vecs1 = tf.nn.embedding_lookup(self.ent_embeds, candidates1).eval(session=self.session)
    vecs2 = tf.nn.embedding_lookup(self.ent_embeds, candidates2).eval(session=self.session)
    # Project KG1 embeddings into KG2's space before comparing.
    projection = self.mapping_mat.eval(session=self.session)
    vecs1 = np.matmul(vecs1, projection)
    sim_mat = sim(vecs1, vecs2, normalize=True)
    print("find new alignment based on relational embeddings:")
    found = find_potential_alignment_greedily(sim_mat, self.sim_th)
    check_new_alignment(found)
    if not found:
        # Nothing passed the similarity threshold this round.
        return False
    found = set(found)
    if not self.new_alignment_index:
        self.new_alignment_index = found
        stop = False
    elif not (found - self.new_alignment_index):
        # Every pair was already known: signal the caller to stop iterating.
        stop = True
    else:
        self.new_alignment_index |= found
        stop = False
    check_new_alignment(self.new_alignment_index, context='check total new alignment')
    self.new_alignment = [(candidates1[x], candidates2[y]) for (x, y) in self.new_alignment_index]
    print('finding new alignment costs time: {:.4f}s'.format(time.time() - start))
    return stop
def predict_entities(self, entities_file_path, output_file_name=None):
    """ Compute the confidence of given entities if they match or not.

    Parameters
    ----------
    entities_file_path : str
        A path pointing to a file formatted as (entity1, entity2) with tab separated (tsv-file).
        If given, the similarity of the entities is retrieved and returned
        (or also written to file if output_file_name is given).
        The parameters top_k and min_sim_value do not play a role, if this parameter is set.
    output_file_name : str, optional
        The name of the output file. It is formatted as tsv file with entity1, entity2, confidence.
        It is written into ``self.out_folder``.

    Returns
    -------
    topk_neighbors_w_sim : A list of tuples of form (entity1, entity2, confidence)
    """
    kg1_entities = list()
    kg2_entities = list()
    with open(entities_file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            entities = line.strip('\n').split('\t')
            kg1_entities.append(self.kgs.kg1.entities_id_dict[entities[0]])
            kg2_entities.append(self.kgs.kg2.entities_id_dict[entities[1]])
    # Embed each distinct entity only once; remember the row each id landed in.
    kg1_distinct_entities = list(set(kg1_entities))
    kg2_distinct_entities = list(set(kg2_entities))
    kg1_mapping = {entity_id: index for index, entity_id in enumerate(kg1_distinct_entities)}
    kg2_mapping = {entity_id: index for index, entity_id in enumerate(kg2_distinct_entities)}
    embeds1 = tf.nn.embedding_lookup(self.ent_embeds, kg1_distinct_entities).eval(session=self.session)
    embeds2 = tf.nn.embedding_lookup(self.ent_embeds, kg2_distinct_entities).eval(session=self.session)
    # BUG FIX: the intent is "was a mapping matrix configured"; testing a TF
    # variable/tensor for truthiness is unreliable (tensors raise TypeError),
    # so compare against None explicitly.
    if self.mapping_mat is not None:
        # Project KG1 embeddings into KG2's space.
        embeds1 = np.matmul(embeds1, self.mapping_mat.eval(session=self.session))
    sim_mat = sim(embeds1, embeds2, metric=self.args.eval_metric, normalize=self.args.eval_norm, csls_k=0)
    # Map ids back with entities_id_dict to be sure that the right uri is chosen.
    kg1_id_to_uri = {v: k for k, v in self.kgs.kg1.entities_id_dict.items()}
    kg2_id_to_uri = {v: k for k, v in self.kgs.kg2.entities_id_dict.items()}
    topk_neighbors_w_sim = []
    for entity1_id, entity2_id in zip(kg1_entities, kg2_entities):
        topk_neighbors_w_sim.append((
            kg1_id_to_uri[entity1_id],
            kg2_id_to_uri[entity2_id],
            sim_mat[kg1_mapping[entity1_id], kg2_mapping[entity2_id]]
        ))
    if output_file_name is not None:
        # Create the output dir if not existent (race-safe via exist_ok).
        os.makedirs(self.out_folder, exist_ok=True)
        with open(self.out_folder + output_file_name, 'w', encoding='utf8') as out_file:
            for entity1, entity2, confidence in topk_neighbors_w_sim:
                out_file.write(str(entity1) + "\t" + str(entity2) + "\t" + str(confidence) + "\n")
        print(self.out_folder + output_file_name, "saved")
    return topk_neighbors_w_sim
def augment(self):
    """Propose candidate alignment pairs from the last GNN output layer.

    L2-normalizes the reference-entity embeddings of both sides, computes a
    CSLS-enhanced similarity matrix squashed through a sigmoid, and keeps
    the pairs above ``self.sim_th``.

    Returns:
        tuple: (pair_index, sim_mat) where pair_index is the selected index
        pairs and sim_mat the sigmoid similarity matrix.
    """
    last_layer = self.output_embeds_list[-1]
    lookup1 = tf.nn.l2_normalize(tf.nn.embedding_lookup(last_layer, self.ref_ent1), 1)
    lookup2 = tf.nn.l2_normalize(tf.nn.embedding_lookup(last_layer, self.ref_ent2), 1)
    mat1 = np.array(lookup1.eval(session=self.session))
    mat2 = np.array(lookup2.eval(session=self.session))
    print("calculate sim mat...")
    # Squash CSLS scores into (0, 1) so the threshold acts on probabilities.
    sim_mat = scipy.special.expit(sim(mat1, mat2, csls_k=self.args.csls))
    th = self.sim_th
    print("sim th:", th)
    pair_index = find_alignment(sim_mat, th, 1)
    return pair_index, sim_mat
def find_alignment(sub_embeds, embeds, indexes, desc_sim_th):
    """Align each row of sub_embeds to its single nearest neighbour in embeds.

    Only neighbours whose normalized similarity reaches ``desc_sim_th`` are
    kept; row positions are translated back through ``indexes``.

    Returns:
        list: (indexes[row], column) pairs; empty when nothing qualifies.
    """
    desc_sim = sim(sub_embeds, embeds, normalize=True)
    alignment = [(indexes[row], col)
                 for row, col in search_nearest_k(desc_sim, 1)
                 if desc_sim[row, col] >= desc_sim_th]
    if not alignment:
        print("find no new alignment")
        return []
    return alignment
def retrieve_topk_alignment(kg1_source_ents, kg1_embeddings, kg2_candidates, kg2_embeddings, session,
                            k=1, metric='inner', normalize=False, csls_k=0, output_path=None):
    """Retrieve the top-k most similar KG2 candidates for every KG1 source entity.

    Parameters
    ----------
    kg1_source_ents : list of entity ids used to index kg1_embeddings.
    kg1_embeddings, kg2_embeddings : embedding matrices (TF tensors/variables).
    kg2_candidates : list of candidate entity ids used to index kg2_embeddings.
    session : TF session used to evaluate the embedding lookups.
    k : int, number of neighbours to keep per source entity (must be > 0).
    metric : string, similarity metric passed to sim().
    normalize : bool, whether sim() should normalize the embeddings.
    csls_k : int, CSLS enhancement parameter for sim() (0 disables it).
    output_path : str, optional; when given, results are also written as a
        tab-separated (entity1, entity2, confidence) file.

    Returns
    -------
    list of (kg1_entity, kg2_entity, similarity) tuples, len == len(kg1_source_ents) * k.
    """

    def search_nearest_k(sim_mat, k):
        # For each row, take the k columns with the highest similarity.
        # argpartition gives an unordered top-k in O(n) per row.
        assert k > 0
        neighbors = list()
        num = sim_mat.shape[0]
        for i in range(num):
            rank = np.argpartition(-sim_mat[i, :], k)
            neighbors.extend(itertools.product([i], rank[0:k]))
        assert len(neighbors) == num * k
        return neighbors

    def triple_writer(triples, output_path, separator="\t", linebreak="\n"):
        # BUG FIX: use a context manager so the handle is closed even when a
        # write raises (the original leaked the open file object on error and
        # shadowed the `file` builtin).
        with open(output_path, 'w', encoding='utf8') as out_file:
            for s, p, o in triples:
                out_file.write(str(s) + separator + str(p) + separator + str(o) + linebreak)
        print(output_path, "saved")

    embeds1 = tf.nn.embedding_lookup(kg1_embeddings, kg1_source_ents).eval(session=session)
    embeds2 = tf.nn.embedding_lookup(kg2_embeddings, kg2_candidates).eval(session=session)
    sim_mat = sim(embeds1, embeds2, metric=metric, normalize=normalize, csls_k=csls_k)
    topk_neighbors = search_nearest_k(sim_mat, k)
    topk_neighbors_w_sim = [(kg1_source_ents[i], kg2_candidates[j], sim_mat[i, j])
                            for i, j in topk_neighbors]
    if output_path is not None:
        triple_writer(topk_neighbors_w_sim, output_path)
    return topk_neighbors_w_sim
def predict(self, top_k=1, min_sim_value=None, output_file_name=None):
    """ Compute pairwise similarity between the two collections of embeddings.

    Parameters
    ----------
    top_k : int
        The k for top k retrieval, can be None (but then min_sim_value should be set).
    min_sim_value : float, optional
        The minimum value for the confidence. When combined with top_k, it
        further filters the top-k pairs; 0.0 is treated as a valid threshold.
    output_file_name : str, optional
        The name of the output file. It is formatted as tsv file with
        entity1, entity2, confidence, written into ``self.out_folder``.

    Returns
    -------
    topk_neighbors_w_sim : A list of tuples of form (entity1, entity2, confidence)

    Raises
    ------
    ValueError
        If neither top_k nor min_sim_value is given.
    """
    embeds1 = tf.nn.embedding_lookup(self.ent_embeds, self.kgs.kg1.entities_list).eval(session=self.session)
    embeds2 = tf.nn.embedding_lookup(self.ent_embeds, self.kgs.kg2.entities_list).eval(session=self.session)
    # BUG FIX: compare against None instead of testing a TF variable/tensor
    # for truthiness (tensors raise TypeError on bool()).
    if self.mapping_mat is not None:
        embeds1 = np.matmul(embeds1, self.mapping_mat.eval(session=self.session))
    sim_mat = sim(embeds1, embeds2, metric=self.args.eval_metric, normalize=self.args.eval_norm, csls_k=0)
    # Search for correspondences which match top_k and/or min_sim_value.
    matched_entities_indexes = set()
    if top_k:
        assert top_k > 0
        # Top k for entities in kg1.
        for i in range(sim_mat.shape[0]):
            for rank_index in np.argpartition(-sim_mat[i, :], top_k)[:top_k]:
                matched_entities_indexes.add((i, rank_index))
        # Top k for entities in kg2.
        for i in range(sim_mat.shape[1]):
            for rank_index in np.argpartition(-sim_mat[:, i], top_k)[:top_k]:
                matched_entities_indexes.add((rank_index, i))
        if min_sim_value is not None:
            # BUG FIX: set.intersection returns a NEW set; the original call
            # discarded the result, so min_sim_value silently had no effect
            # whenever top_k was set. intersection_update filters in place.
            matched_entities_indexes.intersection_update(
                map(tuple, np.argwhere(sim_mat > min_sim_value)))
    elif min_sim_value is not None:
        matched_entities_indexes = set(map(tuple, np.argwhere(sim_mat > min_sim_value)))
    else:
        raise ValueError("Either top_k or min_sim_value should have a value")
    # Build id-to-URI maps so the output uses the original entity names.
    kg1_id_to_uri = {v: k for k, v in self.kgs.kg1.entities_id_dict.items()}
    kg2_id_to_uri = {v: k for k, v in self.kgs.kg2.entities_id_dict.items()}
    topk_neighbors_w_sim = [(kg1_id_to_uri[self.kgs.kg1.entities_list[i]],
                             kg2_id_to_uri[self.kgs.kg2.entities_list[j]],
                             sim_mat[i, j]) for i, j in matched_entities_indexes]
    if output_file_name is not None:
        # Create the output dir if not existent (race-safe via exist_ok).
        os.makedirs(self.out_folder, exist_ok=True)
        with open(self.out_folder + output_file_name, 'w', encoding='utf8') as out_file:
            for entity1, entity2, confidence in topk_neighbors_w_sim:
                out_file.write(str(entity1) + "\t" + str(entity2) + "\t" + str(confidence) + "\n")
        print(self.out_folder + output_file_name, "saved")
    return topk_neighbors_w_sim
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize, csls_k, accurate):
    """
    Search alignment with greedy strategy.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        The number of threads used to search alignment.
    metric : string
        The distance metric to use. It can be 'cosine', 'euclidean' or 'inner'.
    normalize : bool, true or false.
        Whether to normalize the input embeddings.
    csls_k : int
        K value for csls. If k > 0, enhance the similarity by csls.
    accurate : bool
        Passed through to calculate_rank and selects the report style:
        "accurate" output includes MR/MRR, "quick" output prints hits only.

    Returns
    -------
    alignment_rest : list, pairs of aligned entities
    hits1 : float, hits@1 values for alignment results
    mr : float, MR values for alignment results
    mrr : float, MRR values for alignment results
    """
    t = time.time()
    sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        # Parallel path: split the rows of sim_mat across worker processes
        # and merge their partial metrics afterwards.
        hits = [0] * len(top_k)
        mr, mrr = 0, 0
        alignment_rest = set()
        rests = list()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        for task in search_tasks:
            mat = sim_mat[task, :]
            rests.append(
                pool.apply_async(calculate_rank,
                                 (task, mat, top_k, accurate, num)))
        pool.close()
        pool.join()
        for rest in rests:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = rest.get()
            mr += sub_mr
            mrr += sub_mrr
            # NOTE: `list += ndarray` delegates to numpy, so this is an
            # element-wise sum (hits becomes an ndarray), not list extension.
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)), sim_mat, top_k, accurate, num)
    assert len(alignment_rest) == num
    # Convert raw hit counts into percentages, rounded for display.
    hits = np.array(hits) / num * 100
    for i in range(len(hits)):
        hits[i] = round(hits[i], 3)
    cost = time.time() - t
    if accurate:
        if csls_k > 0:
            print(
                "accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(csls_k, top_k, hits, mr, mrr, cost))
        else:
            print(
                "accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(top_k, hits, mr, mrr, cost))
    else:
        if csls_k > 0:
            print(
                "quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
                .format(csls_k, top_k, hits, cost))
        else:
            print("quick results: hits@{} = {}%, time = {:.3f} s ".format(
                top_k, hits, cost))
    hits1 = hits[0]
    # Release the (potentially huge) similarity matrix eagerly.
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr
def stable_alignment(embed1, embed2, metric, normalize, csls_k, nums_threads, cut=100, sim_mat=None):
    """
    Evaluate alignment via stable matching (Gale-Shapley) and print precision.

    Builds ranked candidate lists for both directions of the similarity
    matrix in parallel, runs Gale-Shapley on them, and prints the precision
    of the resulting matching. Precision counts a match as correct when both
    keys carry the same trailing index, i.e. it assumes row i of embed1
    truly corresponds to row i of embed2. Returns None (results are printed).

    Parameters
    ----------
    embed1, embed2 : matrix_like
        Source / target embedding matrices; ignored when sim_mat is given.
    metric : string
        The distance metric passed to sim().
    normalize : bool
        Whether sim() should normalize the embeddings.
    csls_k : int
        K value for csls enhancement in sim() (0 disables it).
    nums_threads : int
        Number of worker processes for candidate-list generation.
    cut : int
        Passed to galeshapley; presumably truncates each preference list to
        the top `cut` candidates — TODO confirm against galeshapley().
    sim_mat : matrix_like, optional
        Precomputed similarity matrix; computed from the embeddings if None.
    """
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    kg1_candidates, kg2_candidates = dict(), dict()
    # Direction 1: rank KG2 candidates for every KG1 row ('x_*' -> ['y_*']).
    num = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    rests = list()
    total = 0
    for task in x_tasks:
        total += len(task)
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'x_', 'y_')))
    assert total == num
    pool.close()
    pool.join()
    for rest in rests:
        kg1_candidates = merge_dic(kg1_candidates, rest.get())
    # Direction 2: transpose and rank KG1 candidates for every KG2 row.
    sim_mat = sim_mat.T
    num = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    rests = list()
    for task in y_tasks:
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'y_', 'x_')))
    pool.close()
    pool.join()
    for rest in rests:
        kg2_candidates = merge_dic(kg2_candidates, rest.get())
    # print("kg1_candidates", len(kg1_candidates))
    # print("kg2_candidates", len(kg2_candidates))
    print(
        "generating candidate lists costs time {:.3f} s ".format(time.time() - t))
    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    # Count matches whose numeric suffixes agree (ground-truth alignment is
    # assumed to be index i <-> index i).
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        n / len(matching) * 100, cost))