Пример #1
0
 def find_new_alignment_rel(self):
     """Mine new alignment pairs from relational embeddings of unaligned entities.

     Looks up embeddings of the validation/test entities on both sides, maps
     KG1 into KG2 space via the learned mapping matrix, greedily extracts
     high-similarity pairs, and merges them into the accumulated alignment set.

     Returns
     -------
     bool
         True when no previously-unseen pair was found (a stop signal for
         self-training), False otherwise.
     """
     start = time.time()
     ents1 = self.kgs.valid_entities1 + self.kgs.test_entities1
     ents2 = self.kgs.valid_entities2 + self.kgs.test_entities2
     vecs1 = tf.nn.embedding_lookup(self.ent_embeds, ents1).eval(session=self.session)
     vecs2 = tf.nn.embedding_lookup(self.ent_embeds, ents2).eval(session=self.session)
     # Project KG1 embeddings into KG2 space before scoring.
     projection = self.mapping_mat.eval(session=self.session)
     vecs1 = np.matmul(vecs1, projection)
     sim_mat = sim(vecs1, vecs2, normalize=True)
     print("find new alignment based on relational embeddings:")
     candidate_index = find_potential_alignment_greedily(sim_mat, self.sim_th)
     check_new_alignment(candidate_index)
     if not candidate_index:
         return False
     candidates = set(candidate_index)
     if not self.new_alignment_index:
         self.new_alignment_index = candidates
         stop = False
     elif candidates <= self.new_alignment_index:
         # Every candidate was already known — nothing new this round.
         stop = True
     else:
         self.new_alignment_index |= candidates
         stop = False
     check_new_alignment(self.new_alignment_index, context='check total new alignment')
     # Translate matrix indices back to entity ids.
     self.new_alignment = [(ents1[x], ents2[y]) for (x, y) in self.new_alignment_index]
     print('finding new alignment costs time: {:.4f}s'.format(time.time() - start))
     return stop
Пример #2
0
    def predict_entities(self, entities_file_path, output_file_name=None):
        """
        Compute the confidence of given entities if they match or not.

        Parameters
        ----------
        entities_file_path : str
            A path pointing to a file formatted as (entity1, entity2) with tab separated (tsv-file).
            If given, the similarity of the entities is retrieved and returned (or also written to file if output_file_name is given).
            The parameters top_k and min_sim_value do not play a role, if this parameter is set.
        output_file_name : str, optional
            The name of the output file. It is formatted as tsv file with entity1, entity2, confidence.

        Returns
        -------
        topk_neighbors_w_sim : A list of tuples of form (entity1, entity2, confidence)
        """
        # Resolve the entity URIs from the input file to internal ids.
        kg1_entities = list()
        kg2_entities = list()
        with open(entities_file_path, 'r', encoding='utf-8') as input_file:
            for line in input_file:
                entities = line.strip('\n').split('\t')
                kg1_entities.append(self.kgs.kg1.entities_id_dict[entities[0]])
                kg2_entities.append(self.kgs.kg2.entities_id_dict[entities[1]])
        # Deduplicate so each embedding is looked up only once.
        kg1_distinct_entities = list(set(kg1_entities))
        kg2_distinct_entities = list(set(kg2_entities))

        # Map entity id -> row/column index in the similarity matrix.
        kg1_mapping = {entity_id: index for index, entity_id in enumerate(kg1_distinct_entities)}
        kg2_mapping = {entity_id: index for index, entity_id in enumerate(kg2_distinct_entities)}

        embeds1 = tf.nn.embedding_lookup(self.ent_embeds, kg1_distinct_entities).eval(session=self.session)
        embeds2 = tf.nn.embedding_lookup(self.ent_embeds, kg2_distinct_entities).eval(session=self.session)

        # BUG FIX: truth-testing a TF variable/tensor is ambiguous (raises
        # TypeError for graph tensors); compare against None explicitly.
        if self.mapping_mat is not None:
            embeds1 = np.matmul(embeds1, self.mapping_mat.eval(session=self.session))

        sim_mat = sim(embeds1, embeds2, metric=self.args.eval_metric, normalize=self.args.eval_norm, csls_k=0)

        # Map back with entities_id_dict to be sure that the right uri is chosen.
        kg1_id_to_uri = {v: k for k, v in self.kgs.kg1.entities_id_dict.items()}
        kg2_id_to_uri = {v: k for k, v in self.kgs.kg2.entities_id_dict.items()}

        # Preserve the input file's pair order (including duplicates).
        topk_neighbors_w_sim = [
            (kg1_id_to_uri[entity1_id],
             kg2_id_to_uri[entity2_id],
             sim_mat[kg1_mapping[entity1_id], kg2_mapping[entity2_id]])
            for entity1_id, entity2_id in zip(kg1_entities, kg2_entities)
        ]

        if output_file_name is not None:
            # Create the output directory if it does not exist yet.
            if not os.path.exists(self.out_folder):
                os.makedirs(self.out_folder)
            with open(self.out_folder + output_file_name, 'w', encoding='utf8') as file:
                for entity1, entity2, confidence in topk_neighbors_w_sim:
                    file.write(str(entity1) + "\t" + str(entity2) + "\t" + str(confidence) + "\n")
            print(self.out_folder + output_file_name, "saved")

        return topk_neighbors_w_sim
Пример #3
0
 def augment(self):
     """Propose candidate alignment pairs from the last layer's output embeddings.

     Returns
     -------
     tuple
         ``(pair_index, sim_mat)`` where ``pair_index`` comes from
         ``find_alignment`` and ``sim_mat`` holds the logistic-squashed
         similarities.
     """
     ref1 = tf.nn.l2_normalize(
         tf.nn.embedding_lookup(self.output_embeds_list[-1], self.ref_ent1), 1)
     ref2 = tf.nn.l2_normalize(
         tf.nn.embedding_lookup(self.output_embeds_list[-1], self.ref_ent2), 1)
     mat1 = np.array(ref1.eval(session=self.session))
     mat2 = np.array(ref2.eval(session=self.session))
     print("calculate sim mat...")
     # Squash similarities into (0, 1) with the logistic function.
     sim_mat = scipy.special.expit(sim(mat1, mat2, csls_k=self.args.csls))
     th = self.sim_th
     print("sim th:", th)
     pair_index = find_alignment(sim_mat, th, 1)
     return pair_index, sim_mat
Пример #4
0
def find_alignment(sub_embeds, embeds, indexes, desc_sim_th):
    """Match each sub-embedding to its nearest neighbor above a similarity threshold.

    Parameters
    ----------
    sub_embeds : matrix_like
        Embeddings of the subset of entities to align.
    embeds : matrix_like
        Embeddings of all candidate entities.
    indexes : sequence
        Original indices of ``sub_embeds`` rows; used to label results.
    desc_sim_th : float
        Minimum description similarity for a pair to be kept.

    Returns
    -------
    list of tuple
        ``(indexes[i], j)`` pairs passing the threshold; empty list otherwise.
    """
    desc_sim = sim(sub_embeds, embeds, normalize=True)
    alignment = [(indexes[i], j)
                 for i, j in search_nearest_k(desc_sim, 1)
                 if desc_sim[i, j] >= desc_sim_th]
    if not alignment:
        print("find no new alignment")
        return []
    return alignment
Пример #5
0
def retrieve_topk_alignment(kg1_source_ents,
                            kg1_embeddings,
                            kg2_candidates,
                            kg2_embeddings,
                            session,
                            k=1,
                            metric='inner',
                            normalize=False,
                            csls_k=0,
                            output_path=None):
    """Retrieve the top-k most similar KG2 candidates for each KG1 source entity.

    Parameters
    ----------
    kg1_source_ents : sequence of int
        Entity ids on the KG1 side.
    kg1_embeddings, kg2_embeddings : tensor_like
        Embedding matrices the ids index into.
    kg2_candidates : sequence of int
        Candidate entity ids on the KG2 side.
    session : tf.Session
        Session used to evaluate the embedding lookups.
    k : int
        Number of neighbors retrieved per source entity (must be > 0).
    metric : str
        Similarity metric forwarded to ``sim``.
    normalize : bool
        Whether ``sim`` should normalize the embeddings.
    csls_k : int
        CSLS k forwarded to ``sim``; 0 disables CSLS.
    output_path : str, optional
        When given, results are also written as a tsv of (ent1, ent2, sim).

    Returns
    -------
    list of tuple
        ``(kg1_entity, kg2_entity, similarity)`` triples, k per source entity.
    """
    def search_nearest_k(sim_mat, k):
        # For each row, take the k column indices with the highest similarity.
        # argpartition yields an *unordered* top-k, which suffices here.
        assert k > 0
        neighbors = list()
        num = sim_mat.shape[0]
        for i in range(num):
            rank = np.argpartition(-sim_mat[i, :], k)
            pairs = [j for j in itertools.product([i], rank[0:k])]
            neighbors.extend(pairs)
        assert len(neighbors) == num * k
        return neighbors

    def triple_writer(triples, output_path, separator="\t", linebreak="\n"):
        # BUG FIX: use a context manager so the file handle is released even
        # if a write fails (the original leaked it on error).
        with open(output_path, 'w', encoding='utf8') as file:
            for s, p, o in triples:
                file.write(
                    str(s) + separator + str(p) + separator + str(o) + linebreak)
        print(output_path, "saved")

    embeds1 = tf.nn.embedding_lookup(kg1_embeddings,
                                     kg1_source_ents).eval(session=session)
    embeds2 = tf.nn.embedding_lookup(kg2_embeddings,
                                     kg2_candidates).eval(session=session)
    sim_mat = sim(embeds1,
                  embeds2,
                  metric=metric,
                  normalize=normalize,
                  csls_k=csls_k)
    topk_neighbors = search_nearest_k(sim_mat, k)
    # Translate matrix indices back to entity ids, keeping the similarity.
    topk_neighbors_w_sim = [(kg1_source_ents[i], kg2_candidates[j], sim_mat[i,
                                                                            j])
                            for i, j in topk_neighbors]

    if output_path is not None:
        triple_writer(topk_neighbors_w_sim, output_path)

    return topk_neighbors_w_sim
Пример #6
0
    def predict(self, top_k=1, min_sim_value=None, output_file_name=None):
        """
        Compute pairwise similarity between the two collections of embeddings.

        Parameters
        ----------
        top_k : int
            The k for top k retrieval, can be None (but then min_sim_value should be set).
        min_sim_value : float, optional
            the minimum value for the confidence.
        output_file_name : str, optional
            The name of the output file. It is formatted as tsv file with entity1, entity2, confidence.

        Returns
        -------
        topk_neighbors_w_sim : A list of tuples of form (entity1, entity2, confidence)

        Raises
        ------
        ValueError
            If neither top_k nor min_sim_value is set.
        """
        embeds1 = tf.nn.embedding_lookup(self.ent_embeds, self.kgs.kg1.entities_list).eval(session=self.session)
        embeds2 = tf.nn.embedding_lookup(self.ent_embeds, self.kgs.kg2.entities_list).eval(session=self.session)

        # BUG FIX: truth-testing a TF variable/tensor is ambiguous (raises
        # TypeError for graph tensors); compare against None explicitly.
        if self.mapping_mat is not None:
            embeds1 = np.matmul(embeds1, self.mapping_mat.eval(session=self.session))

        sim_mat = sim(embeds1, embeds2, metric=self.args.eval_metric, normalize=self.args.eval_norm, csls_k=0)

        # Search for correspondences which match top_k and/or min_sim_value.
        matched_entities_indexes = set()
        if top_k:
            assert top_k > 0
            # top k for entities in kg1
            for i in range(sim_mat.shape[0]):
                for rank_index in np.argpartition(-sim_mat[i, :], top_k)[:top_k]:
                    matched_entities_indexes.add((i, rank_index))

            # top k for entities in kg2
            for i in range(sim_mat.shape[1]):
                for rank_index in np.argpartition(-sim_mat[:, i], top_k)[:top_k]:
                    matched_entities_indexes.add((rank_index, i))

            if min_sim_value:
                # BUG FIX: set.intersection returns a NEW set; the original
                # discarded it, so min_sim_value had no effect when combined
                # with top_k. intersection_update filters in place.
                matched_entities_indexes.intersection_update(map(tuple, np.argwhere(sim_mat > min_sim_value)))
        elif min_sim_value:
            matched_entities_indexes = set(map(tuple, np.argwhere(sim_mat > min_sim_value)))
        else:
            raise ValueError("Either top_k or min_sim_value should have a value")

        # Build id to URI maps to report entity URIs rather than internal ids.
        kg1_id_to_uri = {v: k for k, v in self.kgs.kg1.entities_id_dict.items()}
        kg2_id_to_uri = {v: k for k, v in self.kgs.kg2.entities_id_dict.items()}

        topk_neighbors_w_sim = [(kg1_id_to_uri[self.kgs.kg1.entities_list[i]],
                    kg2_id_to_uri[self.kgs.kg2.entities_list[j]],
                    sim_mat[i, j]) for i, j in matched_entities_indexes]

        if output_file_name is not None:
            # Create the output directory if it does not exist yet.
            if not os.path.exists(self.out_folder):
                os.makedirs(self.out_folder)
            with open(self.out_folder + output_file_name, 'w', encoding='utf8') as file:
                for entity1, entity2, confidence in topk_neighbors_w_sim:
                    file.write(str(entity1) + "\t" + str(entity2) + "\t" + str(confidence) + "\n")
            print(self.out_folder + output_file_name, "saved")
        return topk_neighbors_w_sim
Пример #7
0
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize,
                     csls_k, accurate):
    """
    Search alignment with greedy strategy.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        The number of threads used to search alignment.
    metric : string
        The distance metric to use. It can be 'cosine', 'euclidean' or 'inner'.
    normalize : bool, true or false.
        Whether to normalize the input embeddings.
    csls_k : int
        K value for csls. If k > 0, enhance the similarity by csls.
    accurate : bool
        Forwarded to calculate_rank; when true the "accurate" results
        (including MR and MRR) are reported, otherwise only the quick
        hits figures are printed.

    Returns
    -------
    alignment_rest :  list, pairs of aligned entities
    hits1 : float, hits@1 values for alignment results
    mr : float, MR values for alignment results
    mrr : float, MRR values for alignment results
    """
    t = time.time()
    sim_mat = sim(embed1,
                  embed2,
                  metric=metric,
                  normalize=normalize,
                  csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        # Parallel path: split the rows of sim_mat across worker processes
        # and accumulate the partial rank statistics from each chunk.
        hits = [0] * len(top_k)
        mr, mrr = 0, 0
        alignment_rest = set()
        rests = list()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        for task in search_tasks:
            mat = sim_mat[task, :]
            rests.append(
                pool.apply_async(calculate_rank,
                                 (task, mat, top_k, accurate, num)))
        pool.close()
        pool.join()
        for rest in rests:
            # Merge per-chunk results: sums for mr/mrr/hits, union for pairs.
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = rest.get()
            mr += sub_mr
            mrr += sub_mrr
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        # Single-process path: rank everything in one call.
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)),
                                                       sim_mat, top_k,
                                                       accurate, num)
    # Every source entity must have produced exactly one hits@1 pair.
    assert len(alignment_rest) == num
    hits = np.array(hits) / num * 100  # raw counts -> percentages
    for i in range(len(hits)):
        hits[i] = round(hits[i], 3)
    cost = time.time() - t
    if accurate:
        if csls_k > 0:
            print(
                "accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(csls_k, top_k, hits, mr, mrr, cost))
        else:
            print(
                "accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(top_k, hits, mr, mrr, cost))
    else:
        if csls_k > 0:
            print(
                "quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
                .format(csls_k, top_k, hits, cost))
        else:
            print("quick results: hits@{} = {}%, time = {:.3f} s ".format(
                top_k, hits, cost))
    hits1 = hits[0]
    # Free the (potentially huge) similarity matrix eagerly.
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr
Пример #8
0
def stable_alignment(embed1,
                     embed2,
                     metric,
                     normalize,
                     csls_k,
                     nums_threads,
                     cut=100,
                     sim_mat=None):
    """
    Align the two embedding sets via stable matching (Gale-Shapley) and
    print the resulting precision.

    Candidate preference lists for both sides are built in parallel by
    sorting each row of the similarity matrix, then galeshapley computes a
    stable matching truncated to the top `cut` candidates per entity.

    Parameters
    ----------
    embed1, embed2 : matrix_like
        Embedding matrices of the two KGs; only used when sim_mat is None.
    metric : string
        The distance metric forwarded to sim.
    normalize : bool
        Whether sim should normalize the embeddings.
    csls_k : int
        K value for csls, forwarded to sim.
    nums_threads : int
        Number of worker processes used to build the candidate lists.
    cut : int
        Candidate-list cutoff passed to galeshapley.
    sim_mat : matrix_like, optional
        Precomputed similarity matrix; when given, embed1/embed2 are unused.

    Returns
    -------
    None
        Results are reported via print only; a match (i, j) counts as
        correct when the numeric suffixes of its 'x_i'/'y_j' labels agree.
    """
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1,
                      embed2,
                      metric=metric,
                      normalize=normalize,
                      csls_k=csls_k)

    kg1_candidates, kg2_candidates = dict(), dict()

    # Build KG1's preference lists: each row of sim_mat is arg-sorted in a
    # worker process; keys are labeled 'x_<row>' with values 'y_<col>'.
    num = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    rests = list()
    total = 0
    for task in x_tasks:
        total += len(task)
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'x_', 'y_')))
    assert total == num
    pool.close()
    pool.join()
    for rest in rests:
        kg1_candidates = merge_dic(kg1_candidates, rest.get())

    # Same for KG2, on the transposed matrix with 'y_'/'x_' labels.
    sim_mat = sim_mat.T
    num = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    rests = list()
    for task in y_tasks:
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'y_', 'x_')))
    pool.close()
    pool.join()
    for rest in rests:
        kg2_candidates = merge_dic(kg2_candidates, rest.get())

    # print("kg1_candidates", len(kg1_candidates))
    # print("kg2_candidates", len(kg2_candidates))

    print(
        "generating candidate lists costs time {:.3f} s ".format(time.time() -
                                                                 t))
    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    # Count matches whose label suffixes agree, i.e. x_i matched to y_i.
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        n / len(matching) * 100, cost))