def build_content_sim_relation_text_lsa(network, signatures):
    def get_nid_gen(signatures):
        for nid, sig in signatures:
            yield nid

    docs = []
    for nid, e in signatures:
        docs.append(' '.join(e))

    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(docs)

    print("TF-IDF shape before LSA: " + str(tfidf.shape))
    st = time.time()
    tfidf = lsa_dimensionality_reduction(tfidf)
    et = time.time()
    print("TF-IDF shape after LSA: " + str(tfidf.shape))
    print("Time to compute LSA: {0}".format(str(et - st)))

    lsh_projections = RandomBinaryProjections('default', 10000)
    #lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    text_engine = index_in_text_engine(
        nid_gen, tfidf, lsh_projections, tfidf_is_dense=True)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    create_sim_graph_text(nid_gen, network, text_engine, tfidf,
                          Relation.CONTENT_SIM, tfidf_is_dense=True)
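# --- Hedged sketch: `lsa_dimensionality_reduction` is defined elsewhere in
# the codebase. Assuming it runs LSA via scikit-learn's TruncatedSVD (the
# standard way to reduce a sparse TF-IDF matrix to a dense low-rank space),
# a minimal version could look like this; the component count is an
# illustrative assumption, not taken from this file.
from sklearn.decomposition import TruncatedSVD

def lsa_dimensionality_reduction(tfidf, n_components=300):
    # Project the sparse (n_docs, n_terms) TF-IDF matrix into a dense
    # (n_docs, n_components) LSA space
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(tfidf)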
def build_entity_sim_relation(network, fields, entities):
    docs = []
    for e in entities:
        if e != "":  # append only non-empty documents
            docs.append(e)
    print(str(docs))

    if len(docs) > 0:  # if there are no documents, skip this step; no entity similarity will be found
        tfidf = da.get_tfidf_docs(docs)
        text_engine = index_in_text_engine(
            fields, tfidf, rbp)  # rbp is the global variable
        create_sim_graph_text(network, text_engine, fields, tfidf, Relation.ENTITY_SIM)
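# --- Hedged sketch: `rbp` is a module-level LSH hash and `da.get_tfidf_docs`
# vectorizes a list of documents; neither is defined in this section. Minimal
# versions consistent with how they are used could look like this (the
# projection count of 30 is an assumption, as is backing get_tfidf_docs with
# scikit-learn's TfidfVectorizer).
from nearpy.hashes import RandomBinaryProjections
from sklearn.feature_extraction.text import TfidfVectorizer

rbp = RandomBinaryProjections('default', 30)

def get_tfidf_docs(docs):
    # Rows of the returned sparse matrix are L2-normalized by default,
    # so dot products between rows are cosine similarities
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(docs)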
def build_schema_sim_relation(network):
    def connect(nid1, nid2, score):
        network.add_relation(nid1, nid2, Relation.SCHEMA_SIM, score)

    st = time.time()
    docs = []
    for (_, _, field_name, _) in network.iterate_values():
        docs.append(field_name)
    tfidf = da.get_tfidf_docs(docs)
    et = time.time()
    print("Create docs and TF-IDF: {0}".format(str(et - st)))

    nid_gen = network.iterate_ids()
    num_features = tfidf.shape[1]
    new_index_engine = LSHRandomProjectionsIndex(num_features)

    # Index vectors in engine
    st = time.time()
    row_idx = 0
    for key in nid_gen:
        sparse_row = tfidf.getrow(row_idx)
        dense_row = sparse_row.todense()
        array = dense_row.A[0]
        row_idx += 1
        new_index_engine.index(array, key)
    et = time.time()
    print("Total index text: " + str((et - st)))

    # Create schema_sim links
    nid_gen = network.iterate_ids()
    st = time.time()
    row_idx = 0
    for nid in nid_gen:
        sparse_row = tfidf.getrow(row_idx)
        dense_row = sparse_row.todense()
        array = dense_row.A[0]
        row_idx += 1
        N = new_index_engine.query(array)
        if len(N) > 1:
            for n in N:
                (data, key, value) = n
                if nid != key:  # do not connect a node to itself
                    connect(nid, key, value)
    et = time.time()
    print("Create graph schema: {0}".format(str(et - st)))
    return new_index_engine
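# --- Hedged sketch: `LSHRandomProjectionsIndex` is defined elsewhere.
# Assuming it wraps a nearpy Engine with random binary projections, a minimal
# version consistent with the index()/query() calls above could look like
# this; the default projection count is an assumption.
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

class LSHRandomProjectionsIndex:
    def __init__(self, num_features, projection_count=30):
        hasher = RandomBinaryProjections('lsh', projection_count)
        self.engine = Engine(num_features, lshashes=[hasher])

    def index(self, vector, key):
        # Store the dense vector with `key` attached as its data payload
        self.engine.store_vector(vector, key)

    def query(self, vector):
        # nearpy returns (vector, data, distance) tuples, which matches the
        # (data, key, value) unpacking used by the callers above
        return self.engine.neighbours(vector)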
def build_schema_sim_relation_lsa(network, fields):
    docs = []
    for (nid, sn, fn, _, _) in fields:
        docs.append(fn)

    tfidf = da.get_tfidf_docs(docs)
    print("tfidf shape before LSA: " + str(tfidf.shape))
    tfidf = lsa_dimensionality_reduction(tfidf)
    print("tfidf shape after LSA: " + str(tfidf.shape))

    text_engine = index_in_text_engine(
        fields, tfidf, rbp, tfidf_is_dense=True)  # rbp is the global variable
    create_sim_graph_text(network, text_engine, fields, tfidf,
                          Relation.SCHEMA_SIM, tfidf_is_dense=True)
def build_content_sim_relation_text(network, signatures):
    def get_nid_gen(signatures):
        for nid, sig in signatures:
            yield nid

    docs = []
    for nid, e in signatures:
        docs.append(' '.join(e))

    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(docs)

    # rbp = RandomBinaryProjections('default', 1000)
    lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    text_engine = index_in_text_engine(nid_gen, tfidf, lsh_projections)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    create_sim_graph_text(nid_gen, network, text_engine, tfidf, Relation.CONTENT_SIM)
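# --- Hedged sketch: `index_in_text_engine` and `create_sim_graph_text` are
# defined elsewhere. Minimal versions consistent with the generator-based
# calls in this section might look like the following (note that
# build_entity_sim_relation and build_schema_sim_relation_lsa above pass
# their arguments in a different order, so the real signatures may differ).
from nearpy import Engine

def index_in_text_engine(nid_gen, tfidf, lsh_projections, tfidf_is_dense=False):
    # Store one TF-IDF (or LSA) row per node id in a nearpy engine
    num_features = tfidf.shape[1]
    text_engine = Engine(num_features, lshashes=[lsh_projections])
    for row_idx, nid in enumerate(nid_gen):
        if tfidf_is_dense:
            array = tfidf[row_idx]  # LSA output is already a dense array
        else:
            array = tfidf.getrow(row_idx).todense().A[0]
        text_engine.store_vector(array, nid)
    return text_engine

def create_sim_graph_text(nid_gen, network, text_engine, tfidf,
                          relation, tfidf_is_dense=False):
    # Query each row's neighbours and connect the corresponding nodes
    for row_idx, nid in enumerate(nid_gen):
        if tfidf_is_dense:
            array = tfidf[row_idx]
        else:
            array = tfidf.getrow(row_idx).todense().A[0]
        for (data, key, value) in text_engine.neighbours(array):
            if nid != key:  # skip self-matches
                network.add_relation(nid, key, relation, value)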
def __find_relation_class_matchings(self):
    # Retrieve relation names
    st = time.time()
    docs = []
    names = []
    seen_sources = []
    for (_, source_name, _, _) in self.network.iterate_values():
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # mark as seen
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            docs.append(source_name)
            names.append(('relation', source_name))

    # Retrieve class names
    for kr_item, kr_handler in self.kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            docs.append(cl)
            names.append(('class', cl))

    tfidf = da.get_tfidf_docs(docs)
    et = time.time()
    print("Create docs and TF-IDF: {0}".format(str(et - st)))

    num_features = tfidf.shape[1]
    new_index_engine = LSHRandomProjectionsIndex(num_features, projection_count=7)

    # N^2 method, kept for reference
    """
    clean_matchings = []
    for i in range(len(docs)):
        for j in range(len(docs)):
            sparse_row = tfidf.getrow(i)
            dense_row = sparse_row.todense()
            array_i = dense_row.A[0]
            sparse_row = tfidf.getrow(j)
            dense_row = sparse_row.todense()
            array_j = dense_row.A[0]
            sim = np.dot(array_i, array_j.T)
            if sim > 0.5:
                if names[i][0] != names[j][0]:
                    match = names[i][1], names[j][1]
                    clean_matchings.append(match)
    return clean_matchings
    """

    # Index vectors in engine
    st = time.time()
    for idx in range(len(docs)):
        sparse_row = tfidf.getrow(idx)
        dense_row = sparse_row.todense()
        array = dense_row.A[0]
        new_index_engine.index(array, idx)
    et = time.time()
    print("Total index text: " + str((et - st)))

    # Now query for similar ones
    st = time.time()
    raw_matchings = defaultdict(list)
    for idx in range(len(docs)):
        sparse_row = tfidf.getrow(idx)
        dense_row = sparse_row.todense()
        array = dense_row.A[0]
        N = new_index_engine.query(array)
        if len(N) > 1:
            for n in N:
                (data, key, value) = n
                raw_matchings[idx].append(key)
    et = time.time()
    print("Find raw matches: {0}".format(str(et - st)))

    # Filter matches so that only relation-class pairs remain
    clean_matchings = []
    for key, values in raw_matchings.items():
        key_kind = names[key][0]
        for v in values:
            v_kind = names[v][0]
            if v_kind != key_kind:
                match = (names[key][1], names[v][1])
                clean_matchings.append(match)
    return clean_matchings
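# --- Note on the commented-out N^2 method above: because scikit-learn's
# TF-IDF rows are L2-normalized by default, np.dot(array_i, array_j) is their
# cosine similarity, so the all-pairs comparison can be done with one sparse
# matrix product instead of densifying rows in a double loop. A hedged sketch
# (the function name is hypothetical):
import numpy as np

def all_pairs_relation_class_matchings(tfidf, names, threshold=0.5):
    sims = (tfidf @ tfidf.T).toarray()  # (n_docs, n_docs) cosine matrix
    i_idx, j_idx = np.nonzero(sims > threshold)
    return [(names[i][1], names[j][1])
            for i, j in zip(i_idx, j_idx)
            if names[i][0] != names[j][0]]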