def build_content_sim_mh_text(network, mh_signatures, threshold=0.7):
    """Link nodes whose text MinHash signatures are returned as LSH neighbours.

    Each signature is wrapped in a 512-permutation ``MinHash`` and inserted
    into a ``MinHashLSH`` index. Once the index is complete, every node is
    queried against it and a ``Relation.CONTENT_SIM`` relation with score 1
    is added to ``network`` for every *other* node the query returns.

    Args:
        network: Object exposing
            ``add_relation(nid1, nid2, relation, score)``.
        mh_signatures: Iterable of ``(nid, mh_sig)`` pairs. ``mh_sig`` is
            converted with ``np.asarray(..., dtype=int)`` and used as the
            MinHash hash values (presumably 512 values, to match
            ``num_perm`` -- TODO confirm against the signature producer).
        threshold: Similarity threshold handed to ``MinHashLSH``. Defaults
            to 0.7, the value that used to be hard-coded here.

    Returns:
        The populated ``MinHashLSH`` index.
    """
    content_index = MinHashLSH(threshold=threshold, num_perm=512)

    # Materialize the MinHash objects: the query pass below must only start
    # once every signature has been inserted, and ``mh_signatures`` may be a
    # one-shot iterable.
    mh_sig_obj = []
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)
        mh_sig_obj.append((nid, mh_obj))

    # Query every node against the complete index and connect the hits.
    for nid, mh_obj in mh_sig_obj:
        for r_nid in content_index.query(mh_obj):
            # A node's query can return the node itself; skip self-links.
            if r_nid != nid:
                network.add_relation(nid, r_nid, Relation.CONTENT_SIM, 1)

    return content_index
def __compare_content_signatures(self, kr_name, signatures):
    """Match class signatures against the content-similarity index.

    For every ``(class_name, mh_sig)`` pair a 512-permutation ``MinHash`` is
    built from the signature and used to query ``self.content_sim_index``.
    Each returned node id is resolved through ``self.network.get_info_for``
    and recorded as a match.

    Args:
        kr_name: Name placed in the second half of every match tuple.
        signatures: Iterable of ``(class_name, mh_sig)`` pairs.

    Returns:
        A list of ``((db_name, source_name, field_name),
        (kr_name, class_name))`` tuples, one per index hit.
    """
    matches = []
    for class_name, signature in signatures:
        probe = MinHash(num_perm=512)
        probe.hashvalues = np.asarray(signature, dtype=int)

        for hit_nid in self.content_sim_index.query(probe):
            info = self.network.get_info_for([hit_nid])[0]
            # The first element of the info record (the node id) is not needed.
            _, db_name, source_name, field_name = info
            # Match goes from the db attribute to the class name.
            attribute = (db_name, source_name, field_name)
            matches.append((attribute, (kr_name, class_name)))
    return matches
def __build_content_sim(self, threshold):
    """Build the MinHash LSH content-similarity index and store it on self.

    Fetches all text MinHash signatures from ``self.store_client``, wraps
    each one in a 512-permutation ``MinHash``, inserts it into a fresh
    ``MinHashLSH`` index and assigns the index to ``self.content_sim_index``.
    Timings for the store fetch and for the whole build are printed.

    Args:
        threshold: Similarity threshold handed to ``MinHashLSH``.
    """
    start_text_sig_sim = time.time()

    # Time the fetch from the store separately from the overall build.
    st = time.time()
    mh_signatures = self.store_client.get_all_mh_text_signatures()
    et = time.time()
    print("Time to extract minhash signatures from store: {0}".format(str(et - st)))
    print("!!3 " + str(et - st))

    content_index = MinHashLSH(threshold=threshold, num_perm=512)
    # Insert each signature straight into the index. The MinHash objects are
    # not needed afterwards here, so they are not collected in a side list.
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)

    end_text_sig_sim = time.time()
    print("Total text-sig-sim (minhash): {0}".format(str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))

    self.content_sim_index = content_index