예제 #1
0
def build_content_sim_mh_text(network, mh_signatures, threshold=0.7, num_perm=512):
    """Index MinHash text signatures with LSH and link similar nodes.

    Each ``(nid, mh_sig)`` pair is wrapped in a ``MinHash`` object and
    inserted into a ``MinHashLSH`` index under the key ``nid``. Once the
    index is fully populated, every signature is queried against it and a
    ``Relation.CONTENT_SIM`` relation with score ``1`` is added to
    ``network`` for every returned key that differs from the querying node.

    Args:
        network: Object exposing ``add_relation(nid1, nid2, relation, score)``.
        mh_signatures: Iterable of ``(nid, mh_sig)`` pairs. ``mh_sig`` is
            converted to an integer array and used as the MinHash hash
            values (presumably ``num_perm`` values per signature -- confirm
            against the producer of the signatures).
        threshold: Threshold handed to ``MinHashLSH``. Defaults to ``0.7``.
        num_perm: Number of permutations used for both the index and every
            ``MinHash`` object. Defaults to ``512``.

    Returns:
        The populated ``MinHashLSH`` index.
    """
    content_index = MinHashLSH(threshold=threshold, num_perm=num_perm)

    # Keep the MinHash objects so they can be queried only after every
    # signature has been inserted into the index.
    indexed = []
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=num_perm)
        # Overwrite the freshly initialised hash values with the stored ones.
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)
        indexed.append((nid, mh_obj))

    for nid, mh_obj in indexed:
        for r_nid in content_index.query(mh_obj):
            # The query also returns the node's own key; skip self-links.
            if r_nid != nid:
                network.add_relation(nid, r_nid, Relation.CONTENT_SIM, 1)

    return content_index
예제 #2
0
 def __compare_content_signatures(self, kr_name, signatures):
     """Match class signatures against the content-similarity index.

     For every ``(class_name, mh_sig)`` pair a 512-permutation ``MinHash``
     is rebuilt from the stored hash values and queried against
     ``self.content_sim_index``. Each returned node id is resolved through
     ``self.network.get_info_for`` and recorded as a match.

     Args:
         kr_name: Name placed first in the right-hand side of every match.
         signatures: Iterable of ``(class_name, mh_sig)`` pairs.

     Returns:
         A list of ``((db_name, source_name, field_name),
         (kr_name, class_name))`` tuples, one per index hit.
     """
     matches = []
     for class_name, signature in signatures:
         probe = MinHash(num_perm=512)
         probe.hashvalues = np.asarray(signature, dtype=int)
         # Right-hand side is the same for every hit of this signature.
         target = (kr_name, class_name)
         for hit_nid in self.content_sim_index.query(probe):
             info = self.network.get_info_for([hit_nid])[0]
             # First element is the node id, which is not part of the match.
             _, db_name, source_name, field_name = info
             # matching from db attr to name
             matches.append(((db_name, source_name, field_name), target))
     return matches
예제 #3
0
    def __build_content_sim(self, threshold):
        """Build the MinHash LSH content-similarity index and keep it on ``self``.

        All text signatures are fetched from ``self.store_client``, each one
        is turned into a 512-permutation ``MinHash`` and inserted into a
        ``MinHashLSH`` index under its node id. The finished index is stored
        in ``self.content_sim_index``. Timing information for the fetch and
        for the whole build is printed to stdout.

        Args:
            threshold: Threshold handed to ``MinHashLSH``.
        """
        # Content_sim text relation (minhash-based)
        overall_start = time.time()

        fetch_start = time.time()
        mh_signatures = self.store_client.get_all_mh_text_signatures()
        fetch_elapsed = time.time() - fetch_start
        print("Time to extract minhash signatures from store: {0}".format(str(fetch_elapsed)))
        print("!!3 " + str(fetch_elapsed))

        content_index = MinHashLSH(threshold=threshold, num_perm=512)
        # Only the index is needed afterwards, so the MinHash objects are
        # not retained once they have been inserted.
        for nid, mh_sig in mh_signatures:
            mh_obj = MinHash(num_perm=512)
            # Overwrite the freshly initialised hash values with the stored ones.
            mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
            content_index.insert(nid, mh_obj)

        total_elapsed = time.time() - overall_start
        print("Total text-sig-sim (minhash): {0}".format(str(total_elapsed)))
        print("!!4 " + str(total_elapsed))

        self.content_sim_index = content_index