def build_aliases_for_domain_entity(self):
    EntityForQA.delete_names_by_source(session=self.session, source="domain entity")
    client = GraphClient(server_number=4)
    accessor = DomainEntityAccessor(client)
    default_accessor = DefaultGraphAccessor(client)
    domain_entity_list = accessor.get_all_domain_entity()
    for domain_entity in domain_entity_list:
        entity = EntityForQA(kg_id=default_accessor.get_id_for_node(node=domain_entity),
                             entity_id=domain_entity['domain_entity_id'],
                             source="domain entity",
                             attr='domain_entity_id',
                             attr_value=domain_entity['domain_entity:name'])
        self.session.add(entity)
    self.session.commit()
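# Usage sketch (assumption: this method lives on an alias-builder class that owns a
# SQLAlchemy-style `session`; AliasBuilder below is a hypothetical name for it):
#
#   builder = AliasBuilder()
#   builder.session = EngineFactory.create_session()
#   builder.build_aliases_for_domain_entity()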
class SentenceLevelSemanticSearch:
    SORT_FUNCTION_ENTITIES_BRIDGE = 3
    SORT_FUNCTION_AVERAGE_ENTITY_GRAPH_SIMILAR = 4
    SORT_FUNCTION_AVERAGE_VECTOR = 2
    SORT_FUNCTION_NOT_AVERAGE_GRAPH_VECTOR = 1
    SORT_FUNCTION_SELECT_PART_ENTITY_LINK = 5

    def __init__(self):
        self._session = None
        self.kg_models = None
        self._entity_extractor = None
        # self._tf_idf_model = None
        self.qa_searcher = None
        self.semanticSearchAccessor = None
        self.defaultAccessor = None
        self._logger = None

    def init(self, vector_dir_path="./model/"):
        self.kg_models = KnowledgeGraphFeafureModels()
        self.kg_models.init(vector_dir_path=vector_dir_path)
        self._session = EngineFactory.create_session(echo=False)
        self._entity_extractor = EntityExtractor()
        # self._tf_idf_model = TFIDFModel()
        # self._tf_idf_model.load(dict_type=2)
        self.qa_searcher = QAEntitySearcher()
        client = GraphClient(server_number=4)
        self.semanticSearchAccessor = SemanticSearchAccessor(client)
        self.defaultAccessor = DefaultGraphAccessor(client)
        self._logger = Logger("QAResultSearch").get_log()

    def semantic_search(self, query_text,
                        each_np_candidate_entity_num=50,
                        sort_function=SORT_FUNCTION_SELECT_PART_ENTITY_LINK,
                        sentence_limit=20,
                        weight_context_sim=0.6,
                        weight_graph_sim=0.4):
        try:
            qa_info_manager = self.get_candidate_sentences(
                query_text, each_np_candidate_entity_num=each_np_candidate_entity_num)
            # sentence_list = qa_info_manager.get_candidate_sentence_list()
            # entity_for_qa_set.print_informat()
            # entity_list = entity_for_qa_set.get_entity_node_list()
            # chunk_to_related_entity_list_map = entity_for_qa_set.keyword_2_entitynodemap
            self._logger.info("entity_list =%d sentence_list=%d" % (
                qa_info_manager.get_entity_size(), qa_info_manager.get_sentence_size()))
            # for n in entity_list:
            #     print("entity", n)
            new_sentence_list = []
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_NOT_AVERAGE_GRAPH_VECTOR:
            #     new_sentence_list = self.sort_sentence_by_build_graph_vector_for_query_in_semantic_weight(
            #         query_text, sentence_list=sentence_list, entity_list=entity_list,
            #         weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_AVERAGE_VECTOR:
            #     new_sentence_list = self.sort_sentence_by_build_average_graph_vector_for_query(
            #         query_text, sentence_list=sentence_list, entity_list=entity_list,
            #         weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_ENTITIES_BRIDGE:
            #     new_sentence_list = self.sort_sentence_by_entities_as_bridge(
            #         query_text, sentence_list=sentence_list, entity_list=entity_list,
            #         weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_AVERAGE_ENTITY_GRAPH_SIMILAR:
            #     new_sentence_list = self.sort_sentence_by_entities_for_graph_similarity_as_bridge(
            #         query_text, sentence_list=sentence_list, entity_list=entity_list,
            #         weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_SELECT_PART_ENTITY_LINK:
                new_sentence_list = self.sort_sentence_by_select_part_entity_as_bridge(
                    query_text,
                    qa_info_manager=qa_info_manager,
                    weight_context_sim=weight_context_sim,
                    weight_graph_sim=weight_graph_sim)
            result_list = qa_info_manager.fill_api_id_in_result_list(new_sentence_list[:sentence_limit])
            self._logger.info("result_list =%d " % len(result_list))
            return result_list
        except Exception:
            self._logger.exception("----qaexception----")
            traceback.print_exc()
            return []
    def get_candidate_sentences(self, query_text, each_np_candidate_entity_num=20):
        chunk_list = self.get_chunk_from_text(query_text)
        print("chunk num=%d %s" % (len(chunk_list), ",".join(chunk_list)))
        qa_info_manager = self.search_entity_by_fulltext(chunk_list, each_np_candidate_entity_num)
        qa_info_manager.start_create_node_info_collection()
        print("related entity for qa", qa_info_manager)
        entity_for_qa_list = qa_info_manager.get_all_entity_for_qa_list()
        print("entity_for_qa_list num=%d" % len(entity_for_qa_list))
        sentence_list = self.search_sentence_by_entity_for_qa_list(entity_for_qa_list)
        print("sentence_list num=%d" % len(sentence_list))
        qa_info_manager.add_sentence_node_list(sentence_list)
        return qa_info_manager

    def expand_the_chunk_by_words(self, final_chunk_list):
        final_set = []
        for chunk in final_chunk_list:
            final_set.append(chunk)
            for word in chunk.split(" "):
                final_set.append(word)
        print("word set", final_set)
        return list(set(final_set))

    def get_chunk_from_text(self, text):
        final_chunk_list = self._entity_extractor.get_all_possible_key_word_from_text(text)
        return final_chunk_list

    def search_entity_by_fulltext(self, chunk_list, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)
        for chunk in chunk_list:
            related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_list)
            related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk,
                                                                                    each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_for_api)
        return qa_info_manager

    def search_all_entity_by_fulltext_by_half(self, chunk, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)
        # integer division keeps the fulltext search limit an int
        related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk,
                                                                                each_np_candidate_entity_num // 2)
        qa_info_manager.add(chunk, related_entity_for_api)
        related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num // 2)
        qa_info_manager.add(chunk, related_entity_list)
        return qa_info_manager

    def search_sentence_by_entity_for_qa_list(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.search_sentence_by_entity_list(entity_id_string_list=entity_id_string_list)

    def get_relation_by_nodes(self, node_list):
        return self.semanticSearchAccessor.get_nodes_relation(node_list)
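    # Worked example of expand_the_chunk_by_words above (the trailing set() round-trip
    # deduplicates but drops ordering):
    #
    #   expand_the_chunk_by_words(["string buffer", "java"])
    #   # -> a permutation of ["string buffer", "string", "buffer", "java"]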
    def sort_sentence_by_entities_as_bridge(self, question, sentence_list, entity_list,
                                            weight_context_sim=0.5, weight_graph_sim=0.5):
        self._logger.info("run sort_sentence_by_entities_as_bridge get result=%d" % len(sentence_list))
        question_vec = self.kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_vec, entity_vec_list)
        qe_sim_np = qe_sim_np / qe_sim_np.sum()
        kg_context_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_vec_list,
                                                                                         sentence_vec_list)
        kg_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_graph_vec_list,
                                                                                       sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qe_sim_np * kg_context_sim
        qs_graph_sim = weight_graph_sim * qe_sim_np * kg_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim
        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info("run sort_sentence_by_entities_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)
        return result

    def sort_sentence_by_entities_for_graph_similarity_as_bridge(self, question, sentence_list, entity_list,
                                                                 weight_context_sim=0.5, weight_graph_sim=0.5):
        self._logger.info(
            "run sort_sentence_by_entities_for_graph_similarity_as_bridge get result=%d" % len(sentence_list))
        question_context_vec = self.kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)
        qe_sim_np = np.ones((1, len(entity_list)))
        qe_sim_np = qe_sim_np / qe_sim_np.sum()
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        kg_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_graph_vec_list,
                                                                                       sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qe_sim_np * kg_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim
        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info(
            "run sort_sentence_by_entities_for_graph_similarity_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)
        print("sorted result")
        for t in result:
            print("test sort", t)
        print(result[:100])
        return result
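    # A minimal numpy sketch of the entity-bridge scoring above (assumption:
    # MatrixCalculation's cosine helpers return 1 x n and m x n matrices as consumed
    # here; the toy numbers are illustrative only). The question->entity similarities
    # are normalized to sum to 1 and route the entity->sentence similarities:
    #
    #   import numpy as np
    #   qe = np.matrix([[0.75, 0.25]])                # question->entity sims, sums to 1
    #   es_ctx = np.matrix([[0.9, 0.1], [0.2, 0.8]])  # entity->sentence context sims
    #   es_gra = np.matrix([[0.5, 0.5], [0.4, 0.6]])  # entity->sentence graph sims
    #   qs = 0.5 * qe * es_ctx + 0.5 * qe * es_gra    # 1 x num_sentences combined scores
    #   print(qs.tolist()[0])                         # [0.6, 0.4] -> rank sentences by this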
    def sort_sentence_by_build_average_graph_vector_for_query(self, question, sentence_list, entity_list,
                                                              weight_context_sim=0.5, weight_graph_sim=0.5):
        self._logger.info(
            "run sort_sentence_by_build_average_graph_vector_for_query get sentence_list=%d" % len(sentence_list))
        kg_models = self.kg_models
        question_context_vec = kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)
        entity_list, entity_vec_list, entity_graph_vec_list = self.remove_the_not_related_entity(
            entity_graph_vec_list, entity_list, entity_vec_list, question_context_vec)
        query_graph_vector = kg_models.get_question_graph_vector_by_average_all_entities(
            question=question, entity_graph_vec_list=entity_graph_vec_list)
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim
        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "text": sentence["sentence_text"],
                "sentence_type": sentence["sentence_type_code"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info("run sort_sentence_by_build_average_graph_vector_for_query get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)
        return result
    def sort_sentence_by_select_part_entity_as_bridge(self, question, qa_info_manager,
                                                      # sentence_list,
                                                      # entity_list,
                                                      weight_context_sim=0.6, weight_graph_sim=0.4,
                                                      # chunk_to_related_entity_list_map=None,
                                                      ):
        self._logger.info("run sort part entity result=%d" % qa_info_manager.get_sentence_size())
        print("entity for node")
        qa_info_manager.print_entities()
        print("sentence for node")
        # qa_info_manager.print_sentences()
        entity_info_collection = qa_info_manager.get_entity_info_collection()
        sentence_info_collection = qa_info_manager.get_sentence_info_collection()
        entity_info_collection.init_vectors(self.kg_models)
        sentence_info_collection.init_vectors(self.kg_models)
        sentence_list = sentence_info_collection.get_entity_list()
        entity_vec_list = entity_info_collection.get_entity_context_list()
        entity_graph_vec_list = entity_info_collection.get_entity_graph_list()
        entity_list = entity_info_collection.get_entity_list()
        sentence_vec_list = sentence_info_collection.get_entity_context_list()
        sentence_graph_vec_list = sentence_info_collection.get_entity_graph_list()
        question_context_vec = self.kg_models.get_question_entity_vector(question)
        entity_list, entity_vec_list, entity_graph_vec_list = self.get_top_related_entity_info_list(
            question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)
        # entity_list, entity_vec_list, entity_graph_vec_list = self.remove_the_not_related_entity_by_only_save_one_for_each(
        #     entity_graph_vec_list=entity_graph_vec_list, entity_vec_list=entity_vec_list, entity_list=entity_list,
        #     question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        # todo: change to the average graph similarity
        # qs_graph_sim = self.get_graph_similarity_by_average_entity_graph_vector(entity_graph_vec_list, question,
        #                                                                         sentence_graph_vec_list)
        qs_graph_sim = self.get_query_to_sentence_graph_sim_by_select_top_enttity(entity_graph_vec_list,
                                                                                  entity_list,
                                                                                  entity_vec_list,
                                                                                  sentence_graph_vec_list,
                                                                                  sentence_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim
        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info("run sort_sentence_by_select_part_entity_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)
        print(result[:100])
        return result

    def get_graph_similarity_by_average_entity_graph_vector(self, entity_graph_vec_list, question,
                                                            sentence_graph_vec_list):
        query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
            question=question, entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        return qs_graph_sim

    def get_graph_similarity_average_entity_graph_vector_similarity(self, entity_graph_vec_list, question,
                                                                    sentence_graph_vec_list):
        # query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
        #     question=question, entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_graph_vec_list,
                                                                                       entity_graph_vec_list)
        return np.mean(qs_graph_sim, axis=1)

    def get_query_to_sentence_graph_sim_by_select_top_enttity(self, entity_graph_vec_list, entity_list,
                                                              entity_vec_list, sentence_graph_vec_list,
                                                              sentence_vec_list):
        # kg_se_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_graph_vec_list,
        #                                                                                   entity_graph_vec_list)
        kg_se_context_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_vec_list,
                                                                                            entity_vec_list)
        # TODO: kg_se_sim = 0.5 * kg_se_graph_sim + 0.5 * kg_se_context_sim
        kg_se_sim = kg_se_context_sim
        print("final entity list", len(entity_list), entity_list)
        select_linking_entity_num = min(5, len(entity_list))
        onehot_maxsim_se_matrix = MatrixCalculation.get_most_similar_top_n_entity_as_matrix(
            top_n=select_linking_entity_num, s_e_similarity_matrix=kg_se_sim)
        s_query_graph_vec_matrix = onehot_maxsim_se_matrix * np.matrix(
            entity_graph_vec_list) / select_linking_entity_num
        qs_graph_sim = MatrixCalculation.compute_cossin_for_one_to_one_in_two_list_normalize(
            sentence_graph_vec_list, s_query_graph_vec_matrix.getA())
        return qs_graph_sim
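    # Worked sketch of the top-entity selection above (assumption:
    # get_most_similar_top_n_entity_as_matrix returns a sentences x entities 0/1
    # matrix with top_n ones per row). Each sentence gets a pseudo query graph
    # vector: the mean of the graph vectors of its top_n most context-similar entities.
    #
    #   import numpy as np
    #   onehot = np.matrix([[1, 0, 1],               # sentence 0 links entities 0 and 2
    #                       [0, 1, 1]])              # sentence 1 links entities 1 and 2
    #   entity_graph = np.matrix([[1.0, 0.0],
    #                             [0.0, 1.0],
    #                             [1.0, 1.0]])       # 3 entities x 2 graph dims
    #   pseudo_query = onehot * entity_graph / 2     # per-sentence averaged graph vectors
    #   # -> [[1.0, 0.5], [0.5, 1.0]]; each row is compared to its sentence's graph vector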
    def remove_the_not_related_entity_by_only_save_one_for_each(self, entity_graph_vec_list, entity_list,
                                                                entity_vec_list, question_context_vec,
                                                                qa_info_manager):
        chunk_to_related_entity_list_map = qa_info_manager.keyword_2_entitynodemap
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                 entity_vec_list)
        entity_info_sumary_list = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_np.getA()[0],
                                                               entity_vec_list, entity_graph_vec_list):
            print("after first removing sim=", sim, "entity=", entity)
            entity_info_sumary_list.append({"entity": entity,
                                            "sim": sim,
                                            "entity_vec": entity_vec,
                                            "entity_graph_vec": entity_graph_vec})
        entity_info_sumary_list.sort(key=lambda k: (k.get('sim', 0)), reverse=True)
        valid_word_set = set([])
        word_to_related_entity_list_map = {}
        for chunk, related_entity_list in chunk_to_related_entity_list_map.items():
            word = chunk
            if word not in valid_word_set:
                valid_word_set.add(word)
                word_to_related_entity_list_map[word] = related_entity_list
            else:
                word_to_related_entity_list_map[word].extend(related_entity_list)
        # clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_similarity(entity_info_sumary_list,
        #                                                                                word_to_related_entity_list_map)
        clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_n_similarity(
            entity_info_sumary_list, word_to_related_entity_list_map)
        new_entity_list = []
        new_entity_graph_vec_list = []
        new_entity_vec_list = []
        for entity_info_sumary in clean_entity_info_list:
            new_entity_list.append(entity_info_sumary["entity"])
            new_entity_graph_vec_list.append(entity_info_sumary["entity_graph_vec"])
            new_entity_vec_list.append(entity_info_sumary["entity_vec"])
            print("final save sim=", entity_info_sumary["sim"], "entity=", entity_info_sumary["entity"])
        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def get_top_related_entity_info_list(self, question_context_vec, qa_info_manager):
        node_info_collection = qa_info_manager.get_node_info_collection()
        node_info_collection.fill_each_entity_with_similary_to_question(question_context_vec)
        node_info_collection.sort_by_qe_sim()
        # selected_entity_info_list = qa_info_manager.get_top_node_info_by_each_keywords_three_different_type()
        selected_entity_info_list = qa_info_manager.get_top_node_info_by_each_keywords()
        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []
        for node_info in selected_entity_info_list:
            new_entity_list.append(node_info.entity_node)
            new_entity_vec_list.append(node_info.entity_context_vec)
            new_entity_graph_vec_list.append(node_info.entity_graph_vec)
        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def get_clean_entity_for_each_word_by_max_n_similarity(self, entity_info_sumary_list,
                                                           word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)
            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list, 3)
            # for entity_info in entity_info_list:
            print("get candidate for word=", valid_word, entity_info_list)
            word_name_entity_mark[valid_word] = entity_info_list
        clean_entity_info_list = []
        clean_entity_kg_id_list = set([])
        for word, entity_info_list in word_name_entity_mark.items():
            for entity_info in entity_info_list:
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                if kg_id not in clean_entity_kg_id_list:
                    clean_entity_info_list.append(entity_info)
                    clean_entity_kg_id_list.add(kg_id)
                    print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list
    def get_clean_entity_for_each_word_by_max_similarity(self, entity_info_sumary_list,
                                                         word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)
            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list)
            for entity_info in entity_info_list:
                print("get candidate for word=", valid_word, entity_info["entity"])
                word_name_entity_mark[valid_word] = entity_info
        clean_entity_info_list = []
        clean_entity_kg_id_list = set([])
        for word, entity_info in word_name_entity_mark.items():
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            if kg_id not in clean_entity_kg_id_list:
                clean_entity_info_list.append(entity_info)
                clean_entity_kg_id_list.add(kg_id)
                print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def get_clean_entity_infi_sumary_list(self, entity_info_sumary_list, word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)
            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list)
            for entity_info in entity_info_list:
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                print("get candidate for word=", valid_word, entity_info["entity"])
                if kg_id not in clean_entity_kg_id_list:
                    if valid_word not in word_name_entity_mark.keys():
                        word_name_entity_mark[valid_word] = entity_info
                    else:
                        old_entity_info = word_name_entity_mark[valid_word]
                        if entity_info["sim"] > old_entity_info["sim"]:
                            word_name_entity_mark[valid_word] = entity_info
                    for seperate_name in valid_word.split(" "):
                        if seperate_name not in word_name_entity_mark.keys():
                            word_name_entity_mark[seperate_name] = entity_info
                        else:
                            old_entity_info = word_name_entity_mark[seperate_name]
                            if entity_info["sim"] > old_entity_info["sim"]:
                                word_name_entity_mark[seperate_name] = entity_info
                    clean_entity_kg_id_list.add(kg_id)
        clean_entity_info_list = []
        clean_entity_kg_id_list = set([])
        for word, entity_info in word_name_entity_mark.items():
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            if kg_id not in clean_entity_kg_id_list:
                clean_entity_info_list.append(entity_info)
                clean_entity_kg_id_list.add(kg_id)
                print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def remove_the_not_related_entity(self, entity_graph_vec_list, entity_list, entity_vec_list,
                                      question_context_vec):
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                 entity_vec_list)
        print("question to entity similarity")
        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []
        qe_sim_clean = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_np.getA()[0],
                                                               entity_vec_list, entity_graph_vec_list):
            print("sim=", sim, "entity=", entity)
            if sim > MIN_RELATED_ENTITY_SIMILARITY:
                print("adding ", entity)
                new_entity_list.append(entity)
                new_entity_vec_list.append(entity_vec)
                new_entity_graph_vec_list.append(entity_graph_vec)
                qe_sim_clean.append(sim)
        entity_list = new_entity_list
        entity_vec_list = new_entity_vec_list
        entity_graph_vec_list = new_entity_graph_vec_list
        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []
        entity_info_sumary_list = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_clean,
                                                               entity_vec_list, entity_graph_vec_list):
            print("after first removing sim=", sim, "entity=", entity)
            entity_info_sumary_list.append({"entity": entity,
                                            "sim": sim,
                                            "entity_vec": entity_vec,
                                            "entity_graph_vec": entity_graph_vec})
        entity_info_sumary_list.sort(key=lambda k: (k.get('sim', 0)), reverse=True)
        api_class_name_set = set([])
        new_entity_info_sumary_list = []
        for entity_info_sumary in entity_info_sumary_list:
            if entity_info_sumary["entity"].has_label("api"):
                qualified_name = entity_info_sumary["entity"]["qualified_name"]
                if qualified_name in api_class_name_set:
                    continue
                if "(" in qualified_name:
                    simple_name = qualified_name.split("(")[0]
                    class_name = ".".join(simple_name.split(".")[:-1])
                    if class_name in api_class_name_set:
                        continue
                    else:
                        api_class_name_set.add(class_name)
                        new_entity_info_sumary_list.append(entity_info_sumary)
                else:
                    api_class_name_set.add(qualified_name)
                    new_entity_info_sumary_list.append(entity_info_sumary)
            else:
                new_entity_info_sumary_list.append(entity_info_sumary)
        for entity_info_sumary in new_entity_info_sumary_list:
            new_entity_list.append(entity_info_sumary["entity"])
            new_entity_graph_vec_list.append(entity_info_sumary["entity_graph_vec"])
            new_entity_vec_list.append(entity_info_sumary["entity_vec"])
            print("final save sim=", entity_info_sumary["sim"], "entity=", entity_info_sumary["entity"])
        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def sort_sentence_by_build_graph_vector_for_query_in_semantic_weight(self, question, sentence_list, entity_list,
                                                                         weight_context_sim=0.5,
                                                                         weight_graph_sim=0.5):
        self._logger.info(
            "run sort_sentence_by_build_graph_vector_for_query_in_semantic_weight get sentence_list=%d" % len(
                sentence_list))
        kg_models = self.kg_models
        question_context_vec = kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)
        query_graph_vector = kg_models.get_question_graph_vector_by_semantic_weight_all_entities(
            question_context_vec=question_context_vec,
            entity_context_vec_list=entity_vec_list,
            entity_graph_vec_list=entity_graph_vec_list)
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim
        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info(
            "run sort_sentence_by_build_graph_vector_for_query_in_semantic_weight get result=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)
        return result
    def get_all_entity(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.get_all_entity(entity_id_string_list=entity_id_string_list)

    def get_first_from_entity_info_sumary_list_and_in_related_entity_list(self, entity_info_sumary_list,
                                                                          related_entity_list,
                                                                          top_relate_entity_num=1):
        return_result_list = []
        for entity_info in entity_info_sumary_list:
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            entity = self.get_entity_from_entity_list_by_kgid(kg_id, related_entity_list)
            if entity is not None:
                return_result_list.append(entity_info)
                if len(return_result_list) >= top_relate_entity_num:
                    return return_result_list
        return []

    def get_entity_from_entity_list_by_kgid(self, kg_id, related_entity_list):
        for related_entity in related_entity_list:
            if related_entity.kg_id == kg_id:
                return related_entity
        return None
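# Minimal driver sketch (assumptions: the vector models live under "./model/" as in
# init()'s default, and fill_api_id_in_result_list preserves the "text"/"qs_sim"
# keys set by the sorting step).
if __name__ == "__main__":
    searcher = SentenceLevelSemanticSearch()
    searcher.init(vector_dir_path="./model/")
    for rank, hit in enumerate(searcher.semantic_search("how to convert string to int in java",
                                                        sentence_limit=10)):
        print(rank, hit["qs_sim"], hit["text"])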
class TestGraphClient(TestCase):
    graphClient = None

    def setUp(self):
        self.graphClient = DefaultGraphAccessor(GraphClient())
        self.nodeCleaner = NodeCleaner()

    def test_get_max_id_for_node(self):
        self.assertEqual(self.graphClient.get_max_id_for_node(), 697753)

    def test_get_adjacent_node_id_list(self):
        self.assertEqual(self.graphClient.get_adjacent_node_id_list(66666666), [])
        correct = [64289, 52628, 62565]
        self.assertEqual(self.graphClient.get_adjacent_node_id_list(7899), correct)

    def test_get_node_name_by_id(self):
        self.assertEqual(self.graphClient.get_node_name_by_id(66666666), None)
        self.assertEqual(self.graphClient.get_node_name_by_id(3444), "Adobe Device Central")

    def test_expand_node_for_directly_adjacent_nodes_to_subgraph(self):
        # self.assertEqual(self.graphClient.expand_node_for_adjacent_nodes_to_subgraph(3444),
        #                  "Adobe Device Central")
        pass

    def test_find_by_alias_name_property_exactly_match_from_label_limit_one(self):
        self.assertEqual(
            self.graphClient.find_one_by_alias_name_property("entity", "Adobe Device Central"), None)
        interface = self.graphClient.find_one_by_alias_name_property("api", "Interface PrintGraphics")
        self.assertEqual(93008, self.graphClient.get_id_for_node(interface))

    def test_find_by_alias_name_property(self):
        self.assertEqual(
            self.graphClient.find_by_alias_name_property("entity", "Adobe Device Central"), [])
        interfaces = self.graphClient.find_by_alias_name_property("api", "Interface PrintGraphics")
        self.assertEqual(len(interfaces), 1)
        self.assertEqual(93008, self.graphClient.get_id_for_node(interfaces[0]))

    def test_get_relation_by_relation_id(self):
        relation = self.graphClient.get_relation_by_relation_id(470129)
        self.assertIsNone(relation)
        relation = self.graphClient.get_relation_by_relation_id(122211)
        self.assertEqual(122211, self.graphClient.get_id_for_node(relation))
        self.assertEqual(91, self.graphClient.get_id_for_node(relation.start_node()))
        self.assertEqual(29390, self.graphClient.get_id_for_node(relation.end_node()))
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(246029, 246030)
        relations_json = []
        # serialize every relationship in the subgraph to a JSON-style dict
        for r in subgraph.relationships():
            relation_json = {
                "id": self.graphClient.get_id_for_node(r),
                "name": r.type(),
                "start_id": self.graphClient.get_start_id_for_relation(r),
                "end_id": self.graphClient.get_end_id_for_relation(r)
            }
            relations_json.append(relation_json)
            print(relation_json)
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(246029, 246033)
        self.assertEqual(subgraph, None)

    def test_find_node_by_id(self):
        node = self.graphClient.find_node_by_id(5444)
        self.assertEqual(5444, self.graphClient.get_id_for_node(node))

    def test_search_nodes_by_name(self):
        nodes = self.graphClient.search_nodes_by_name("java")
        count = 0
        for n in nodes:
            count = count + 1
        self.assertEqual(10, count)
        nodes = self.graphClient.search_nodes_by_name("String buffer()")
        count = 0
        for n in nodes:
            count = count + 1
        self.assertEqual(10, count)

    def test_search_nodes_by_name_in_subgraph(self):
        subgraph = self.graphClient.search_nodes_by_name_in_subgraph("java")
        count = 0
        for n in subgraph.nodes():
            count = count + 1
        self.assertEqual(10, count)
        subgraph = self.graphClient.search_nodes_by_name_in_subgraph("String buffer()")
        count = 0
        if subgraph is not None:
            for n in subgraph.nodes():
                count = count + 1
        self.assertEqual(10, count)
    def test_get_relations_between_two_nodes_in_subgraph(self):
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(48, 3600)
        self.assertEqual(None, subgraph)
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(48, 3643)
        self.assertEqual(2, len(subgraph.nodes()))
        self.assertEqual(1, len(subgraph.relationships()))

    def test_get_relations_between_two_nodes(self):
        record_list = self.graphClient.get_relations_between_two_nodes(48, 3600)
        count = 0
        for n in record_list:
            count = count + 1
        self.assertEqual(0, count)
        record_list = self.graphClient.get_relations_between_two_nodes_in_subgraph(48, 3643)
        count = 0
        for n in record_list:
            count = count + 1
        self.assertEqual(1, count)

    def test_cleaner(self):
        node = self.graphClient.find_node_by_id(444)
        self.assertEqual(self.nodeCleaner.get_clean_node_name(node), "fake news")
        node = self.graphClient.find_node_by_id(4444)
        self.assertEqual(self.nodeCleaner.get_clean_node_name(node), "")
        self.assertEqual(self.graphClient.get_id_for_node(Node("lll", a=3)), -1)
        self.assertEqual(self.graphClient.get_id_for_node(node), 4444)

    def test_get_shortest_path_to_name(self):
        name = self.graphClient.get_node_name_by_id(8000)
        subgraph = self.graphClient.get_shortest_path_to_name_in_subgraph(444, name)
        print(subgraph)

    def test_get_shortest_path(self):
        record_list = self.graphClient.get_shortest_path(444, 8000, max_degree=2)
        self.assertEqual(0, count_record_list(record_list))
        record_list = self.graphClient.get_shortest_path(444, 8000)
        self.assertNotEqual(None, record_list)
        self.assertEqual(1, count_record_list(record_list))
        subgraph = self.graphClient.get_shortest_path_in_subgraph(444, 8000, max_degree=2)
        self.assertEqual(None, subgraph)
        subgraph = self.graphClient.get_shortest_path_in_subgraph(444, 8000, max_degree=6)
        self.assertNotEqual(None, subgraph)
        self.assertEqual(len(subgraph.nodes()), 7)
        self.assertEqual(len(subgraph.relationships()), 6)
        print(subgraph)

    def test_get_newest_nodes(self):
        node_list = self.graphClient.get_newest_nodes(10)
        self.assertEqual(10, len(node_list))
        print(node_list)
        graphJsonParser = GraphJsonParser()
        returns = graphJsonParser.parse_node_list_to_json(node_list)
        print(returns)
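# Allow running this test module directly (note: these tests hit a live graph
# instance with hard-coded node ids, so they are integration tests rather than
# pure unit tests).
if __name__ == "__main__":
    import unittest
    unittest.main()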
def awesome_item_rename_duplicate(awesome_graph_accessor, node_collection):
    node_list = node_collection.get_all_nodes(1000, ['awesome item'])
    print(len(node_list))
    node_map = construct_key_count_map(node_list)
    i = 0
    for key in node_map.keys():
        if len(node_map[key]) > 1:
            print(key, " ", len(node_map[key]))
            for each in node_map[key]:
                print(each)
            i += 1
    print(i)
    node_list_after_step1 = []
    for key in node_map.keys():
        if len(node_map[key]) > 1:
            step1_node_list = node_map[key]
            for node in step1_node_list:
                if "url" in dict(node):
                    url = node["url"]
                    if "//github.com/" in url:
                        # print(url, " ", type(url))
                        github_name = get_name_by_github_url(url)
                        if github_name != "" and github_name.lower() != key.lower():
                            node["name"] = github_name
                        node_list_after_step1.append(node)
                    else:
                        node_list_after_step1.append(node)
                else:
                    node_list_after_step1.append(node)
    for node in node_list_after_step1:
        print(node)
        awesome_graph_accessor.push_node(node)
    node_map_after_step1 = construct_key_count_map(node_list_after_step1)
    # i = 0
    # for key in node_map_after_step1.keys():
    #     if len(node_map_after_step1[key]) > 1:
    #         print(key, " ", len(node_map_after_step1[key]), " ", node_map_after_step1[key])
    #         i += 1
    # print(i)
    # with open("node_map_after_step1.txt", 'w') as f:
    #     for key in node_map_after_step1.keys():
    #         if len(node_map_after_step1[key]) > 1:
    #             nodes_str = key + " " + str(len(node_map_after_step1[key])) + " " + str(node_map_after_step1[key])
    #             f.write(nodes_str + "\n")
    node_list_after_step2 = []
    for key in node_map_after_step1.keys():
        step2_node_list = node_map_after_step1[key]
        for i in range(0, len(step2_node_list) - 1):
            for j in range(i + 1, len(step2_node_list)):
                if "url" in step2_node_list[i]:
                    url1 = step2_node_list[i]["url"]
                else:
                    url1 = ""
                if "url" in step2_node_list[j]:
                    url2 = step2_node_list[j]["url"]
                else:
                    url2 = ""
                if "description" in step2_node_list[i]:
                    description1 = step2_node_list[i]["description"]
                else:
                    description1 = ""
                if "description" in step2_node_list[j]:
                    description2 = step2_node_list[j]["description"]
                else:
                    description2 = ""
                if description1 != "" and description2 != "":
                    desc_sim = description_similarity(description1, description2)
                    if desc_sim >= 0.7:
                        step2_node_list[i].setdefault("duplicate", 1)
                        step2_node_list[j].setdefault("duplicate", 1)
                else:
                    url_sim = 0
                    if url1 != "" and url2 != "":
                        url_sim = url_similarity(url1, url2)
                    if url_sim >= 0.8:
                        step2_node_list[i].setdefault("duplicate", 2)
                        step2_node_list[j].setdefault("duplicate", 2)
                    else:
                        node1 = awesome_graph_accessor.find_start_by_relation_type_and_end_url("collect", url1)
                        node2 = awesome_graph_accessor.find_start_by_relation_type_and_end_url("collect", url2)
                        if node1 is not None and node2 is not None:
                            node_id1 = GraphAccessor.get_id_for_node(node1)
                            node_id2 = GraphAccessor.get_id_for_node(node2)
                            if node_id1 == node_id2:
                                step2_node_list[i].setdefault("duplicate", 3)
                                step2_node_list[j].setdefault("duplicate", 3)
        for each in step2_node_list:
            node_list_after_step2.append(each)
    node_map_after_step2 = construct_key_count_map(node_list_after_step2)
    # i = 0
    # for key in node_map_after_step2.keys():
    #     if len(node_map_after_step2[key]) > 1 and property_in_dict_list("duplicate", node_map_after_step2[key]) is True:
    #         print(key, " ", len(node_map_after_step2[key]), " ", node_map_after_step2[key])
    #         i += 1
    # print(i)
    pending_map = {}
    duplicate_id_list = []
    for key in node_map_after_step2.keys():
        temp_list = []
        temp_id_list = []
        if len(node_map_after_step2[key]) > 1 and property_in_dict_list("duplicate", node_map_after_step2[key]) is True:
            for each in node_map_after_step2[key]:
                if "duplicate" in dict(each):
                    temp_list.append(each)
                    temp_id_list.append(DefaultGraphAccessor.get_id_for_node(each))
            pending_map.setdefault(key, temp_list)
            duplicate_id_list.append(temp_id_list)
    for key in pending_map.keys():
        pending_list = rename_property(pending_map[key])
        for each in pending_list:
            print(each)
            awesome_graph_accessor.push_node(each)
    print(len(pending_map))
    with open("duplicate_id_list.txt", 'w') as f:
        f.write(str(duplicate_id_list))
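# Hedged sketch of the grouping helper this script depends on (assumption: the real
# construct_key_count_map buckets nodes by their "name" property; only the behaviour
# exercised above is reproduced here).
def construct_key_count_map_sketch(node_list):
    node_map = {}
    for node in node_list:
        key = node["name"]
        node_map.setdefault(key, []).append(node)  # each key maps to the list of same-named nodes
    return node_map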
class EntityVectorGenerator:
    def __init__(self):
        self.session = None
        self.graphClient = None
        self.entity_vector_model = None

    def init(self, path="word2vec_api_software_wiki.txt", binary=True):
        self.session = EngineFactory.create_session()
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
        self.entity_vector_model = EntityVectorComputeModel()
        self.entity_vector_model.init_word2vec_model(path=path, binary=binary)
        print("init complete")

    def get_content_for_wikidata_node(self, node):
        content = ""
        node_dict = dict(node)
        if 'site:enwiki' in node_dict:
            title = URLUtil.parse_url_to_title(node["site:enwiki"])
            wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(self.session, title)
            if wikipedia_doc is not None:
                return wikipedia_doc.content
            else:
                content = content + " " + title
        property_list = ['labels_en', 'descriptions_en', 'aliases_en']
        for key in property_list:
            if key not in node_dict:
                continue
            if type(node[key]) == list:
                content = content + " " + " ".join(node[key])
            else:
                content = content + " " + node[key]
        if content == '':
            return None
        print("content: ", content)
        return content

    def start_generate_wikipedia_vector(self, output_path="wikipedia.plain.txt"):
        label = "wikipedia"
        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)
        data_list = []
        for node in wiki_nodes:
            content = self.get_content_for_wikidata_node(node=node)
            if content is None:
                print("------None-----")
                continue
            item = {
                "id": "kg#" + str(self.graphClient.get_id_for_node(node)),
                "text": content
            }
            data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(data_set=data_list, output_path=output_path)

    def start_generate_sentence_vector(self, output_path="sentence.plain.txt"):
        session = self.session
        sentence_list = DocumentSentenceText.get_all_valid_sentences(session)
        data_list = []
        for each in sentence_list:
            if each.id is not None and each.text is not None:
                item = {"id": each.id, "text": each.text}
                data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(data_set=data_list, output_path=output_path)

    def start_generate_domain_entity_vector(self, output_path="domain_entity.plain.txt"):
        domain_entity_data = DomainEntity.get_all_domain_entities(self.session)
        data_list = []
        for each in domain_entity_data:
            if each.id is not None and each.description is not None:
                item = {
                    "id": each.id,
                    "text": each.name + " " + each.description
                }
                data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(data_set=data_list, output_path=output_path)

    def start_generate_api_entity_vector(self, output_path="api.plain.txt"):
        api_id_list = APIEntity.get_api_id_and_qualified_name_list(self.session)
        if api_id_list is not None:
            data_list = []
            for each in api_id_list:
                api_id = each.id
                api_name = each.qualified_name
                try:
                    api_name_simple_name = api_name.split("(")[0].split(".")[-1]
                except:
                    api_name_simple_name = ""
                # api_clean_text_data = APIHTMLText.get_text_by_api_id_and_type(self.session, api_id,
                #                                                               APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION)
                api_html_text = APIHTMLText.get_html_text_id(
                    self.session, api_id, APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION)
                if api_html_text is None:
                    continue
                document_text = DocumentText.get_by_html_text_id(self.session, api_html_text.id)
                if document_text is None:
                    continue
                paragraph_text = DocumentParagraphText.get_first_by_doc_id(self.session, document_text.id)
                if paragraph_text is None:
                    continue
                api_clean_text = paragraph_text.text
                final_text = api_name_simple_name + " " + api_name + " " + api_clean_text
                item = {"id": api_id, "text": final_text}
                data_list.append(item)
            self.entity_vector_model.train_mean_vector_from_corpus(data_set=data_list, output_path=output_path)

    def start_generate_paragraph_vector(self, output_path="mean_vector_api_paragraph.plain.txt"):
        paragraph_list = DocumentParagraphText.get_all_paragraph_text(session=self.session)
        text_data_set = []
        for paragraph in paragraph_list:
            text = paragraph.text
            if text is None or len(text.strip()) <= 2:
                continue
            text = text.strip()
            item = {"id": paragraph.id, "text": text}
            text_data_set.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(data_set=text_data_set, output_path=output_path)
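# Minimal driver sketch (assumption: the word2vec file below matches init()'s default
# and the relational store behind EngineFactory is reachable).
if __name__ == "__main__":
    generator = EntityVectorGenerator()
    generator.init(path="word2vec_api_software_wiki.txt", binary=True)
    generator.start_generate_sentence_vector(output_path="sentence.plain.txt")
    generator.start_generate_api_entity_vector(output_path="api.plain.txt")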
class SearchUtil:
    def __init__(self, graph_client, api_searcher):
        self.graph_accessor = DefaultGraphAccessor(graph_client)
        self.api_searcher = api_searcher

    def search(self, keywords, top_number):
        result_node_list = []
        jobs = []
        api_db_search_job = gevent.spawn(self.api_searcher.search_api_entity_with_order, keywords, top_number)
        jobs.append(api_db_search_job)
        graph_search_job = gevent.spawn(self.graph_accessor.search_nodes_by_name_in_list, keywords, top_number)
        jobs.append(graph_search_job)
        gevent.joinall(jobs, timeout=2000)
        api_entity_list = api_db_search_job.value
        api_id_list = []
        for api_entity in api_entity_list:
            api_id_list.append(api_entity.id)
        api_node_list = self.graph_accessor.get_api_entity_map_to_node(api_id_list)
        for api_node in api_node_list:
            if api_node not in result_node_list:
                result_node_list.append(api_node)
        graph_node_result_list = graph_search_job.value
        for graph_node in graph_node_result_list:
            if graph_node not in result_node_list:
                result_node_list.append(graph_node)
        # TODO: change the node search to a more general way; for example, a scorer is necessary
        node_score = {}
        for node in result_node_list:
            node_id = self.graph_accessor.get_id_for_node(node)
            node_score[node_id] = 0
        for node in result_node_list:
            node_id = self.graph_accessor.get_id_for_node(node)
            if node in api_node_list and node in graph_node_result_list:
                node_score[node_id] = node_score[node_id] + 10
            if node.has_label("extended knowledge"):
                node_score[node_id] = node_score[node_id] - 3
            if node.has_label("java class") or node.has_label("wikidata"):
                node_score[node_id] = node_score[node_id] + 1
            if node.has_label("java constructor"):
                node_score[node_id] = node_score[node_id] - 1
        left_nodes = []
        for node in result_node_list:
            left_nodes.append(node)
        sorted_node_list = []
        # repeatedly pick the highest-scored remaining node (selection sort, stable for ties)
        while len(left_nodes) > 0:
            max_score = -10000
            max_node = None
            for node in left_nodes:
                node_id = self.graph_accessor.get_id_for_node(node)
                if node_score[node_id] > max_score:
                    max_score = node_score[node_id]
                    max_node = node
            sorted_node_list.append(max_node)
            left_nodes.remove(max_node)
        return sorted_node_list[:top_number]
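# Usage sketch (assumption: APISearcher is a hypothetical stand-in for whatever class
# in this project implements search_api_entity_with_order).
if __name__ == "__main__":
    util = SearchUtil(graph_client=GraphClient(server_number=4), api_searcher=APISearcher())
    for node in util.search("string buffer", top_number=10):
        print(node)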
class APISentenceLevelSemanticSearch:
    def __init__(self, echo=False):
        self._session = None
        self.kg_models = None
        self._entity_extractor = None
        self.echo = echo
        # self._tf_idf_model = None
        self.qa_searcher = None
        self.semanticSearchAccessor = None
        self.defaultAccessor = None
        self._logger = None

    def init(self, vector_dir_path="./model/"):
        self.kg_models = KnowledgeGraphFeafureModels()
        self.kg_models.init(vector_dir_path=vector_dir_path)
        self._session = EngineFactory.create_session(echo=False)
        self._entity_extractor = EntityExtractor()
        # self._tf_idf_model = TFIDFModel()
        # self._tf_idf_model.load(dict_type=2)
        self.qa_searcher = QAEntitySearcher()
        client = GraphClient(server_number=4)
        self.semanticSearchAccessor = SemanticSearchAccessor(client)
        self.defaultAccessor = DefaultGraphAccessor(client)
        self._logger = Logger("QAResultSearch").get_log()

    def semantic_search(self, query_text, each_np_candidate_entity_num=50, sentence_limit=20,
                        weight_context_sim=0.6, weight_graph_sim=0.4):
        try:
            qa_info_manager = self.get_candidate_api_entity_list(
                query_text, each_np_candidate_entity_num=each_np_candidate_entity_num)
            qa_info_manager = self.sort_api_by_select_part_entity_as_bridge(
                query_text, qa_info_manager=qa_info_manager,
                weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            valid_api_info_list = qa_info_manager.get_api_info_collection().api_info_list[:100]
            # for index, api_info in enumerate(valid_api_info_list):
            #     print(index, api_info)
            sentence_list = self.get_candidate_sentence_by_api_info_list(valid_api_info_list)
            qa_info_manager.add_sentence_node_list(sentence_list)
            self.sort_sentence_by_select_part_entity_as_bridge(
                query_text, qa_info_manager=qa_info_manager,
                weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            valid_sentence_info_list = qa_info_manager.get_sentence_info_collection().sentence_info_list[:sentence_limit]
            for index, sentence_info in enumerate(valid_sentence_info_list):
                self.fill_api_id_in_result_list_for_one_sentence_info(sentence_info)
                # print(index, sentence_info)
            result = qa_info_manager.get_sentence_info_collection().get_top_n_json(top_n=sentence_limit)
            # print(result)
            return result
        except Exception:
            self._logger.exception("----qaexception----")
            traceback.print_exc()
            return []
    def semantic_search_summary_by_api_with_class(self, query_text, each_np_candidate_entity_num=50,
                                                  sentence_limit=100, weight_context_sim=0.6,
                                                  weight_graph_sim=0.4, top_api_class_num=10,
                                                  each_api_class_sentence_number=5):
        try:
            qa_info_manager = self.get_candidate_api_entity_list(
                query_text, each_np_candidate_entity_num=each_np_candidate_entity_num)
            qa_info_manager = self.sort_api_by_select_part_entity_as_bridge(
                query_text, qa_info_manager=qa_info_manager,
                weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim)
            top_api_class_group = qa_info_manager.get_api_info_collection().get_top_api_info_group_by_api_class()
            result_info_collection_list = []
            for k, v_list in top_api_class_group.items():
                try:
                    # for v in v_list:
                    #     print(k, v)
                    sentence_list = self.get_candidate_sentence_by_api_info_list(v_list)
                    sentence_info_collection = SentenceInfoCollection()
                    for sentence in sentence_list:
                        sentence_info_collection.add_sentence_node(
                            sentence, self.defaultAccessor.get_id_for_node(sentence))
                    sentence_info_collection.filter_others_sentences()
                    sentence_info_collection = self.sort_sentences_for_one_api_class(
                        question=query_text, qa_info_manager=qa_info_manager,
                        weight_context_sim=weight_context_sim, weight_graph_sim=weight_graph_sim,
                        sentence_info_collection=sentence_info_collection)
                    sub_collection = sentence_info_collection.get_top_n_as_sub_sentence_collection(
                        top_n=each_api_class_sentence_number)
                    result_info_collection_list.append(sub_collection)
                except:
                    # questions such as "How do I convert a String to an int in Java" or
                    # "How do I create a file and write to it in Java?" can trigger problems here
                    traceback.print_exc()
            result_info_collection_list.sort(key=lambda k: (k.get_sum_qs_sim()), reverse=True)
            # final_sentence_info_collection = SentenceInfoCollection()
            # for collection in result_info_collection_list[:top_api_class]:
            #     for sentence_info in collection.get_all_sentence_infos():
            #         self.fill_api_id_in_result_list_for_one_sentence_info(sentence_info)
            #         final_sentence_info_collection.add(sentence_info)
            # sentence_json_list = final_sentence_info_collection.get_all_as_json()
            # return sentence_json_list
            top_related_api_sentence_info_collection_list = result_info_collection_list[:top_api_class_num]
            for sentence_in_collection in result_info_collection_list:
                for sentence_info in sentence_in_collection.get_all_sentence_infos():
                    self.fill_api_id_in_result_list_for_one_sentence_info(sentence_info)
            return top_related_api_sentence_info_collection_list
        except Exception:
            self._logger.exception("----qaexception----")
            traceback.print_exc()
            return []

    def get_candidate_sentence_by_api_info_list(self, api_info_list):
        id_list = [str(api_info.node_id) for api_info in api_info_list]
        return self.semanticSearchAccessor.search_sentence_by_directly_to_api(id_list)

    def get_candidate_api_entity_list(self, query_text, each_np_candidate_entity_num=20):
        chunk_list = self.get_chunk_from_text(query_text)
        print("chunk num=%d %s" % (len(chunk_list), ",".join(chunk_list)))
        qa_info_manager = self.search_entity_by_fulltext(chunk_list, each_np_candidate_entity_num)
        qa_info_manager.start_create_node_info_collection()
        # print("related entity for qa", qa_info_manager)
        entity_info_collection = qa_info_manager.get_entity_info_collection()
        entity_info_collection.init_vectors(self.kg_models)
        question_context_vec = self.kg_models.get_question_entity_vector(query_text)
        self.filter_the_middle_entity_by_small_similarity_to_question_entity(
            question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)
        node_kg_id_str_list = entity_info_collection.get_node_kg_id_str_list()
        api_entity_list = self.search_api_by_node_kg_id_list(node_kg_id_str_list)
        # print("api_entity_list num=%d" % len(api_entity_list))
        qa_info_manager.add_api_node_list(api_entity_list)
        qa_info_manager.add_api_node_list_from_start_nodes_list()
        qa_info_manager.api_node_info_collection.init_vectors(self.kg_models)
        self.filter_api_entity_the_small_similarity_to_question_entity(
            question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)
        # qa_info_manager.print_entities()
        # qa_info_manager.print_api_entities()
        print("api info collection after filter size=%r" % qa_info_manager.api_node_info_collection.size())
        return qa_info_manager

    def expand_the_chunk_by_words(self, final_chunk_list):
        final_set = []
        for chunk in final_chunk_list:
            final_set.append(chunk)
            for word in chunk.split(" "):
                final_set.append(word)
        print("word set", final_set)
        return list(set(final_set))

    def get_chunk_from_text(self, text):
        final_chunk_list = self._entity_extractor.get_all_possible_key_word_from_text(text)
        final_chunk_list = set(final_chunk_list)
        if len(final_chunk_list) <= 1:
            final_chunk_list.add(text)
        return list(final_chunk_list)
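    # Sketch of the per-class summary step above in plain Python (assumption: toy
    # scores stand in for the qs_sim values carried by the sentence collections):
    #
    #   groups = {"java.lang.String": [0.9, 0.7], "java.io.File": [0.8, 0.6, 0.5]}
    #   keep_top = {cls: sorted(sims, reverse=True)[:5] for cls, sims in groups.items()}
    #   ranked = sorted(keep_top.items(), key=lambda kv: sum(kv[1]), reverse=True)
    #   # -> API classes ordered by the summed scores of their best sentences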
    def search_entity_by_fulltext(self, chunk_list, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)
        for chunk in chunk_list:
            related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_list)
            related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk,
                                                                                    each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_for_api)
        return qa_info_manager

    def search_all_entity_by_fulltext_by_half(self, chunk, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)
        # integer division keeps the fulltext search limit an int
        related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk,
                                                                                each_np_candidate_entity_num // 2)
        qa_info_manager.add(chunk, related_entity_for_api)
        related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num // 2)
        qa_info_manager.add(chunk, related_entity_list)
        return qa_info_manager

    def search_sentence_by_entity_for_qa_list(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.search_sentence_by_entity_list(entity_id_string_list=entity_id_string_list)

    def search_api_by_entity_for_qa_list(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.search_api_by_entity_list(entity_id_string_list=entity_id_string_list)

    def search_api_by_node_kg_id_list(self, entity_id_string_list):
        return self.semanticSearchAccessor.search_api_by_entity_list(entity_id_string_list=entity_id_string_list)

    def get_relation_by_nodes(self, node_list):
        return self.semanticSearchAccessor.get_nodes_relation(node_list)

    def sort_api_by_select_part_entity_as_bridge(self, question, qa_info_manager,
                                                 weight_context_sim=0.6, weight_graph_sim=0.4):
        api_info_collection = qa_info_manager.get_api_info_collection()
        api_entity_list = api_info_collection.get_entity_list()
        api_context_vec_list = api_info_collection.get_entity_context_list()
        api_graph_vec_list = api_info_collection.get_entity_graph_list()
        question_context_vec = self.kg_models.get_question_entity_vector(question)
        entity_info_collection = qa_info_manager.get_top_related_entity_info_collection()
        entity_list = entity_info_collection.get_entity_list()
        entity_context_vec_list = entity_info_collection.get_entity_context_list()
        entity_graph_vec_list = entity_info_collection.get_entity_graph_list()
        qe_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      api_context_vec_list)
        # todo: change to the average graph similarity
        qe_graph_sim = self.get_graph_similarity_by_average_entity_graph_vector(entity_graph_vec_list, question,
                                                                                api_graph_vec_list)
        # qe_graph_sim = self.get_query_to_sentence_graph_sim_by_select_top_enttity(entity_graph_vec_list, entity_list,
        #                                                                           entity_context_vec_list,
        #                                                                           api_graph_vec_list,
        #                                                                           api_context_vec_list)
        qe_context_sim = weight_context_sim * qe_context_sim
        qe_graph_sim = weight_graph_sim * qe_graph_sim
        qe_sim = qe_context_sim + qe_graph_sim
        qe_sim = qe_sim.tolist()[0]
        qe_context_sim = qe_context_sim.tolist()[0]
        qe_graph_sim = qe_graph_sim.tolist()[0]
        for api_info, sum_sim, sentence, context_sim, graph_sim in zip(
                qa_info_manager.get_api_info_collection().api_info_list,
                qe_sim, api_entity_list, qe_context_sim, qe_graph_sim):
            api_info.qe_sim = sum_sim
            api_info.qe_context_sim = context_sim
            api_info.qe_graph_sim = graph_sim
        qa_info_manager.get_api_info_collection().sort_by_qe_sim()
        return qa_info_manager

    def sort_sentence_by_select_part_entity_as_bridge(self, question, qa_info_manager,
                                                      weight_context_sim=0.6, weight_graph_sim=0.4):
        self._logger.info("run sort part entity result=%d" % qa_info_manager.get_sentence_size())
        sentence_info_collection = qa_info_manager.get_sentence_info_collection()
        sentence_info_collection.init_vectors(self.kg_models)
        sentence_list = sentence_info_collection.get_entity_list()
        sentence_vec_list = sentence_info_collection.get_entity_context_list()
        sentence_graph_vec_list = sentence_info_collection.get_entity_graph_list()
        question_context_vec = self.kg_models.get_question_entity_vector(question)
        entity_info_collection = qa_info_manager.get_top_related_entity_info_collection()
        entity_list = entity_info_collection.get_entity_list()
        entity_context_vec_list = entity_info_collection.get_entity_context_list()
        entity_graph_vec_list = entity_info_collection.get_entity_graph_list()
        # entity_list, entity_vec_list, entity_graph_vec_list = self.remove_the_not_related_entity_by_only_save_one_for_each(
        #     entity_graph_vec_list=entity_graph_vec_list, entity_vec_list=entity_vec_list, entity_list=entity_list,
        #     question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        # todo: change to the average graph similarity
        qs_graph_sim = self.get_graph_similarity_by_average_entity_graph_vector(entity_graph_vec_list, question,
                                                                                sentence_graph_vec_list)
        # qs_graph_sim = self.get_query_to_sentence_graph_sim_by_select_top_enttity(entity_graph_vec_list, entity_list,
        #                                                                           entity_vec_list,
        #                                                                           sentence_graph_vec_list,
        #                                                                           sentence_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sentence_info, sum_sim, sentence, context_sim, graph_sim in zip(
                sentence_info_collection.sentence_info_list, qs_sim, sentence_list,
                qs_context_sim, qs_graph_sim):
            sentence_info.qs_sim = sum_sim
            sentence_info.qs_context_sim = context_sim
            sentence_info.qs_graph_sim = graph_sim
        sentence_info_collection.sort_by_qs_sim()
        return qa_info_manager

    def sort_sentences_for_one_api_class(self, question, qa_info_manager, sentence_info_collection,
                                         weight_context_sim=0.6, weight_graph_sim=0.4):
        sentence_info_collection.init_vectors(self.kg_models)
        sentence_list = sentence_info_collection.get_entity_list()
        sentence_vec_list = sentence_info_collection.get_entity_context_list()
        sentence_graph_vec_list = sentence_info_collection.get_entity_graph_list()
        question_context_vec = self.kg_models.get_question_entity_vector(question)
        entity_info_collection = qa_info_manager.get_top_related_entity_info_collection()
        entity_graph_vec_list = entity_info_collection.get_entity_graph_list()
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        qs_graph_sim = self.get_graph_similarity_by_average_entity_graph_vector(entity_graph_vec_list, question,
                                                                                sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim
        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]
        for sentence_info, sum_sim, sentence, context_sim, graph_sim in zip(
                sentence_info_collection.sentence_info_list, qs_sim,
                sentence_list, qs_context_sim, qs_graph_sim):
            sentence_info.qs_sim = sum_sim
            sentence_info.qs_context_sim = context_sim
            sentence_info.qs_graph_sim = graph_sim
        sentence_info_collection.sort_by_qs_sim()
        return sentence_info_collection

    def get_graph_similarity_by_average_entity_graph_vector(self, entity_graph_vec_list, question,
                                                            sentence_graph_vec_list):
        query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
            question=question, entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(
            query_graph_vector, sentence_graph_vec_list)
        return qs_graph_sim

    def get_graph_similarity_average_entity_graph_vector_similarity(self, entity_graph_vec_list, question,
                                                                    sentence_graph_vec_list):
        # query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
        #     question=question,
        #     entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(
            sentence_graph_vec_list, entity_graph_vec_list)
        return np.mean(qs_graph_sim, axis=1)

    def get_query_to_sentence_graph_sim_by_select_top_enttity(self, entity_graph_vec_list, entity_list,
                                                              entity_vec_list, sentence_graph_vec_list,
                                                              sentence_vec_list):
        # kg_se_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_graph_vec_list,
        #                                                                                   entity_graph_vec_list)
        kg_se_context_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(
            sentence_vec_list, entity_vec_list)
        # TODO
        # kg_se_sim = 0.5 * kg_se_graph_sim + 0.5 * kg_se_context_sim
        kg_se_sim = kg_se_context_sim
        # print("final entity list", len(entity_list), entity_list)
        select_linking_entity_num = min(5, len(entity_list))
        onehot_maxsim_se_matrix = MatrixCalculation.get_most_similar_top_n_entity_as_matrix(
            top_n=select_linking_entity_num, s_e_similarity_matrix=kg_se_sim)
        s_query_graph_vec_matrix = onehot_maxsim_se_matrix * np.matrix(
            entity_graph_vec_list) / select_linking_entity_num
        qs_graph_sim = MatrixCalculation.compute_cossin_for_one_to_one_in_two_list_normalize(
            sentence_graph_vec_list, s_query_graph_vec_matrix.getA())
        return qs_graph_sim

    def remove_the_not_related_entity_by_only_save_one_for_each(self, entity_graph_vec_list, entity_list,
                                                                entity_vec_list, question_context_vec,
                                                                qa_info_manager):
        chunk_to_related_entity_list_map = qa_info_manager.keyword_2_entitynodemap
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(
            question_context_vec, entity_vec_list)
        entity_info_sumary_list = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_np.getA()[0],
                                                               entity_vec_list, entity_graph_vec_list):
            # print("after first removing sim=", sim, "entity=", entity)
            entity_info_sumary_list.append({
                "entity": entity,
                "sim": sim,
                "entity_vec": entity_vec,
                "entity_graph_vec": entity_graph_vec
            })
        entity_info_sumary_list.sort(key=lambda k: k.get('sim', 0), reverse=True)
        valid_word_set = set([])
        word_to_related_entity_list_map = {}
        for chunk, related_entity_list in chunk_to_related_entity_list_map.items():
            word = chunk
            if word not in valid_word_set:
                valid_word_set.add(word)
                word_to_related_entity_list_map[word] = related_entity_list
            else:
                word_to_related_entity_list_map[word].extend(related_entity_list)
        # clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_similarity(entity_info_sumary_list,
        #                                                                                word_to_related_entity_list_map)
        clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_n_similarity(
            entity_info_sumary_list, word_to_related_entity_list_map)
        new_entity_list = []
        new_entity_graph_vec_list = []
        new_entity_vec_list = []
        for entity_info_sumary in clean_entity_info_list:
            new_entity_list.append(entity_info_sumary["entity"])
            new_entity_graph_vec_list.append(entity_info_sumary["entity_graph_vec"])
            new_entity_vec_list.append(entity_info_sumary["entity_vec"])
            # print("final save sim=", entity_info_sumary["sim"], "entity=", entity_info_sumary["entity"])
        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def filter_the_middle_entity_by_small_similarity_to_question_entity(self, question_context_vec,
                                                                        qa_info_manager):
        node_info_collection = qa_info_manager.get_node_info_collection()
        print("start middle entity num=%d" % node_info_collection.size())
        node_info_collection.fill_each_entity_with_similary_to_question(question_context_vec)
        node_info_collection.sort_by_qe_sim()
        node_info_collection.remove_the_entity_by_qe_similarity()
        qa_info_manager.start_build_top_related_entity_info_list()
        print("after filter middle entity num=%d" % node_info_collection.size())

    def filter_api_entity_the_small_similarity_to_question_entity(self, question_context_vec, qa_info_manager):
        api_info_collection = qa_info_manager.get_api_info_collection()
        print("api info collection before filter size=%r" % api_info_collection.size())
        api_info_collection.fill_each_entity_with_similary_to_question(question_context_vec)
        api_info_collection.sort_by_qe_sim()
        # api_info_collection.remove_the_entity_by_qe_similarity()
        api_info_collection.remove_the_entity_by_save_some_candidate()
        api_info_collection.filter_the_api_by_remove_package()
        print("api info collection after filter size=%r" % api_info_collection.size())

    def get_clean_entity_for_each_word_by_max_n_similarity(self, entity_info_sumary_list,
                                                           word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_info_sumary_list")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            # print("valid word=", valid_word)
            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list, 3)
            # for entity_info in entity_info_list:
            #     print("get candidate for word=", valid_word, entity_info_list)
            word_name_entity_mark[valid_word] = entity_info_list
        clean_entity_info_list = []
        clean_entity_kg_id_list = set([])
        for word, entity_info_list in word_name_entity_mark.items():
            for entity_info in entity_info_list:
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                if kg_id not in clean_entity_kg_id_list:
                    clean_entity_info_list.append(entity_info)
                    clean_entity_kg_id_list.add(kg_id)
                    print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def get_clean_entity_for_each_word_by_max_similarity(self, entity_info_sumary_list,
                                                         word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_info_sumary_list")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            # print("valid word=", valid_word)
            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list)
            for entity_info in entity_info_list:
                print("get candidate for word=", valid_word, entity_info["entity"])
                word_name_entity_mark[valid_word] = entity_info
        clean_entity_info_list = []
        clean_entity_kg_id_list = set([])
        for word, entity_info in word_name_entity_mark.items():
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            if kg_id not in clean_entity_kg_id_list:
                clean_entity_info_list.append(entity_info)
                clean_entity_kg_id_list.add(kg_id)
                print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def fill_api_id_in_result_list_for_one_sentence_info(self, sentence_info):
        api_entity = self.semanticSearchAccessor.get_api_entity_for_sentence(sentence_info.node_id)
        if api_entity is None:
            return
        sentence_info.api_kg_id = api_entity["id"]
        sentence_info.api_id = api_entity["api_id"]
        sentence_info.api_qualified_name = api_entity["qualified_name"]
        sentence_info.api_kg_id = self.defaultAccessor.get_id_for_node(api_entity)
        sentence_info.api_type = api_entity["api_type"]
        if "api_document_website#1" in dict(api_entity):
            sentence_info.api_document_website = api_entity["api_document_website#1"]

    def fill_api_id_in_result_list(self, result_sentence_list):
        self._logger.info("fill api info for =%d" % len(result_sentence_list))
        for sentence_dict in result_sentence_list:
            api_entity = self.semanticSearchAccessor.get_api_entity_for_sentence(sentence_dict["kg_id"])
            if api_entity is None:
                continue
            sentence_dict["api_kg_id"] = api_entity["id"]
            sentence_dict["api_id"] = api_entity["api_id"]
            sentence_dict["api_qualified_name"] = api_entity["qualified_name"]
            sentence_dict["api_kg_id"] = self.defaultAccessor.get_id_for_node(api_entity)
            sentence_dict["api_type"] = api_entity["api_type"]
            if "api_document_website#1" in dict(api_entity):
                sentence_dict["api_document_website#1"] = api_entity["api_document_website#1"]
        self._logger.info("return result_sentence_list len =%d" % len(result_sentence_list))
        return result_sentence_list

    def get_all_entity(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.get_all_entity(entity_id_string_list=entity_id_string_list)

    def get_first_from_entity_info_sumary_list_and_in_related_entity_list(self, entity_info_sumary_list,
                                                                          related_entity_list,
                                                                          top_relate_entity_num=1):
        return_result_list = []
        for entity_info in entity_info_sumary_list:
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            entity = self.get_entity_from_entity_list_by_kgid(kg_id, related_entity_list)
            if entity is not None:
                return_result_list.append(entity_info)
                if len(return_result_list) >= top_relate_entity_num:
                    return return_result_list
        return []

    def get_entity_from_entity_list_by_kgid(self, kg_id, related_entity_list):
        for related_entity in related_entity_list:
            if related_entity.kg_id == kg_id:
                return related_entity
        return None
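# --- Illustrative sketch (not part of the original pipeline) ----------------
# The sorting methods above all reduce to one scoring rule: a weighted sum of
# a context cosine similarity and a graph cosine similarity. The helper below
# is a hypothetical plain-NumPy restatement of that rule; the real code goes
# through the MatrixCalculation helpers and np.matrix instead.
import numpy as np


def combine_context_and_graph_sim(question_vec, sentence_vecs,
                                  query_graph_vec, sentence_graph_vecs,
                                  weight_context_sim=0.6, weight_graph_sim=0.4):
    def cos_vec_to_matrix(vec, matrix):
        # cosine of one vector against every row of a matrix, guarding zeros
        matrix = np.asarray(matrix, dtype=float)
        vec = np.asarray(vec, dtype=float)
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec)
        return (matrix @ vec) / np.where(norms == 0, 1, norms)

    context_sim = weight_context_sim * cos_vec_to_matrix(question_vec, sentence_vecs)
    graph_sim = weight_graph_sim * cos_vec_to_matrix(query_graph_vec, sentence_graph_vecs)
    return context_sim + graph_sim  # one blended score per candidate sentence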
class WikiAliasDBImporter:
    def __init__(self):
        self.graphClient = None
        self.session = None

    def init(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
        self.session = EngineFactory.create_session()
        print("init complete")

    def clean_table(self):
        WikipediaEntityName.delete_all(self.session)
        WikipediaEntityNameToWikipediaMapping.delete_all(self.session)
        print("deleted all existing tables")

    def start_import_wiki_aliases_to_db(self):
        label = "wikipedia"
        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)
        for node in wiki_nodes:
            node_id = self.graphClient.get_id_for_node(node)
            # print('node_id: %r', node_id)
            # collect every known alias of the node: name, enwiki title,
            # English label, and English aliases
            name_set = set([])
            if 'name' in dict(node):
                # print("name: %r", node['name'])
                if isinstance(node['name'], list):
                    for each in node['name']:
                        name_set.add(each)
                else:
                    name_set.add(node['name'])
            if 'site:enwiki' in dict(node):
                # print('site_enwiki: %s', node['site:enwiki'])
                if isinstance(node['site:enwiki'], list):
                    for each in node['site:enwiki']:
                        title = URLUtil.parse_url_to_title(each)
                        # print('site_name: %r', title)
                        name_set.add(title)
                else:
                    title = URLUtil.parse_url_to_title(node["site:enwiki"])
                    # print('site_name: %r', title)
                    name_set.add(title)
            if 'labels_en' in dict(node):
                # print("labels_en: ", node['labels_en'])
                if isinstance(node['labels_en'], list):
                    for each in node['labels_en']:
                        name_set.add(each)
                else:
                    name_set.add(node['labels_en'])
            if 'aliases_en' in dict(node):
                # print("aliases_en: ", node['aliases_en'])
                for each in node['aliases_en']:
                    name_set.add(each)
            # print(name_set)
            for name in name_set:
                try:
                    wikipedia_entity_name = WikipediaEntityName(node_id, str(name))
                    wikipedia_entity_name.find_or_create(self.session, autocommit=True)
                except Exception:
                    traceback.print_exc()
            # self.session.commit()
        self.session.commit()

    def start_generate_wiki_entity_text_map(self):
        wikipedia_entity_name_data = WikipediaEntityName.get_all_wikipedia_names(self.session)
        kg_id_list = set([])
        for each in wikipedia_entity_name_data:
            if each is not None:
                kg_id_list.add(each.kg_id)
        # print(kg_id_list)
        for kg_id in kg_id_list:
            node = self.graphClient.find_node_by_id(kg_id)
            if node is not None:
                if "site:enwiki" in dict(node):
                    title = URLUtil.parse_url_to_title(node["site:enwiki"])
                    wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(self.session, title)
                    if wikipedia_doc is not None:
                        wikipedia_id = wikipedia_doc.id
                        wiki_name_to_wikipedia_mapping = WikipediaEntityNameToWikipediaMapping(kg_id, wikipedia_id)
                        wiki_name_to_wikipedia_mapping.find_or_create(self.session, autocommit=False)
        self.session.commit()

    def start_import(self):
        self.init()
        self.clean_table()
        self.start_import_wiki_aliases_to_db()
        self.start_generate_wiki_entity_text_map()
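# --- Hypothetical driver (an assumption, not in the original source) --------
# Rebuilds the alias tables end to end: wipes both tables, imports every alias
# of the "wikipedia"-labelled graph nodes, then links alias owners to their
# WikipediaDocument rows. Assumes the graph server and relational database
# used elsewhere in this project are reachable.
if __name__ == "__main__":
    importer = WikiAliasDBImporter()
    importer.start_import()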
class AnswerGenerator:
    stackoverflow_word2vec_model = None

    # path = 'E:\Research\word2vec\model\so_word_embedding_model'
    # path = 'model/so_word_embedding_model'

    def __init__(self):
        # file_dir = os.path.split(os.path.realpath(__file__))[0]
        # self.path = os.path.join(file_dir, self.path)
        # self.stackoverflow_word2vec_model = Word2Vec.load(self.path)
        self.graph_operator = DefaultGraphAccessor(GraphClient())
        print("done loading word2vec")

    def generate_answer_set(self, question, candidate_answer_set):
        """
        Generate the answer set from candidate_answer_set for one question.
        The answer set may contain several answers, ranked by the likelihood
        of being the right answer; the first answer is the most likely one.
        :param question: question object
        :param candidate_answer_set: candidate answer set
        :return: the answer set containing a set of answers
        """
        # todo: complete this answer generator
        answer_list = []
        id_node_dict = self.id_node_dict(candidate_answer_set)
        id_relation_dict = self.id_relation_dict(candidate_answer_set)
        print(question.get_keywords())
        # print(self.print_data(candidate_answer_set))
        question_keywords = question.get_keywords()
        keywords = []
        for each in question_keywords:
            each = each.lower()
            if each.strip().find(" ") != -1:
                keywords.extend(each.split())
            else:
                keywords.append(each)
        print(keywords)
        question_vec = self.generate_question_vector(keywords)
        nodes_from_relation = self.relation_similarity(candidate_answer_set, keywords,
                                                       question_vec, id_relation_dict)
        nodes_result_vecs = self.generate_candidate_vectors(nodes_from_relation)
        nodes_result = {
            key: self.question_answer_similarity(question_vec, nodes_result_vecs.get(key))
            for key in nodes_result_vecs.keys()
        }
        result, others = self.simple_match(keywords, candidate_answer_set.candidate_graph.nodes())
        match_result_vecs = self.generate_candidate_vectors(result)
        match_result = {
            key: self.question_answer_similarity(question_vec, match_result_vecs.get(key))
            for key in match_result_vecs.keys()
        }
        others_vecs = self.generate_candidate_vectors(others)
        # down-weight nodes that only matched by vector similarity
        similar_result = {
            key: self.question_answer_similarity(question_vec, others_vecs.get(key)) * 0.8
            for key in others_vecs.keys()
        }
        # print(similar_result)
        # sorted_similar_result = self.result_sort(similar_result)
        # sorted_match_result = self.result_sort(match_result)
        # print(sorted_similar_result)
        # print(sorted_match_result)
        # sorted_result = sorted_match_result + sorted_similar_result
        # print(sorted_result)
        # sorted_result = self.list_sort(sorted_result)
        # print(sorted_result)
        full_result = {**nodes_result, **match_result, **similar_result}
        sorted_result = self.result_sort(full_result)
        print(sorted_result)
        property_sililar_result = self.property_sililarity(sorted_result, id_node_dict, question_vec)
        sorted_property_result = self.result_sort(property_sililar_result)
        print(sorted_property_result)
        for i in range(0, 5):
            (id, score) = sorted_result[i]
            node = id_node_dict.get(id)
            node_list = []
            node_list.append(node)
            answer_text = self.get_entity_name(node)
            answer = Answer(answer_text, node_list, score)
            answer_list.append(answer)
        return AnswerSet(answer_list=answer_list)

    def property_sililarity(self, sorted_result, id_node_dict, question_vec):
        result = {}
        for i in range(0, 10):
            (id, score) = sorted_result[i]
            node = id_node_dict.get(id)
            properties = dict(node)
            cos_similarity = 0
            for key in properties:
                key_lists = self.extract_names(key)
                each_cos_similarity = 0
                for each in key_lists:
                    try:
                        vec = [value for value in self.stackoverflow_word2vec_model.wv[each]]
                        each_cos_similarity += self.question_answer_similarity(question_vec, vec)
                    except KeyError as ke:
                        print("KeyError:", ke)
                cos_similarity += (each_cos_similarity / len(key_lists))
            property_score = score * cos_similarity
            result.setdefault(id, property_score)
        return result

    def calculate_cosine_similarity(self, vec1, vec2):
        if not len(vec1) == len(vec2):
            return 0
        if vec1 == [0 for i in range(0, len(vec1))] or vec2 == [0 for i in range(0, len(vec2))]:
            return 0
        numerator = sum([vec1[i] * vec2[i] for i in range(0, len(vec1))])
        vec1_dis = math.sqrt(sum([vec1[i] * vec1[i] for i in range(0, len(vec1))]))
        vec2_dis = math.sqrt(sum([vec2[i] * vec2[i] for i in range(0, len(vec2))]))
        cos_sim = numerator / (vec1_dis * vec2_dis)
        return cos_sim

    def question_answer_similarity(self, question_vecs, answer_vec):
        similarity = 0
        for each_keyword in question_vecs:
            keyword_vec = question_vecs[each_keyword]
            similarity += self.calculate_cosine_similarity(keyword_vec, answer_vec)
        return similarity / len(question_vecs)

    def generate_question_vector(self, question_keywords):
        vec = [0 for i in range(0, 400)]
        question_word_vecs = {}
        for word in question_keywords:
            try:
                vec = [value for value in self.stackoverflow_word2vec_model.wv[word]]
                # question_vec = [(a + b) for a, b in zip(question_vec, vec)]
            except KeyError as ke:
                print("KeyError:", ke)
            question_word_vecs.setdefault(word, vec)
        return question_word_vecs

    def generate_candidate_vectors(self, others):
        candidate_answer_vecs = {}
        for node in others:
            node_id = self.graph_operator.get_id_for_node(node)
            name = self.get_entity_name(node)
            name_list = name.split()
            name_vec = [0 for i in range(0, 400)]
            for name in name_list:
                try:
                    temp_vec = self.stackoverflow_word2vec_model.wv[name]
                    # print(temp_vec)
                    name_vec = [a + b for a, b in zip(name_vec, temp_vec)]
                except KeyError as ke:
                    print("KeyError:", ke)
            candidate_answer_vecs.setdefault(node_id, name_vec)
        return candidate_answer_vecs

    def relation_similarity(self, candidate_answer_set, keywords, question_vec, id_relation_dict):
        node_result = []
        relation_id_score = {}
        for relation in candidate_answer_set.candidate_graph.relationships():
            if self.relation_simple_match(relation, keywords):
                node_result.append(relation.start_node())
                node_result.append(relation.end_node())
            else:
                relation_id = self.graph_operator.get_id_for_node(relation)
                score = self.relation_cos_similarity(relation, question_vec)
                relation_id_score.setdefault(relation_id, score)
        sorted_relation_id_score = self.result_sort(relation_id_score)
        for i in range(0, 3):
            id, score = sorted_relation_id_score[i]
            relation = id_relation_dict.get(id)
            node_result.append(relation.start_node())
            node_result.append(relation.end_node())
        return node_result

    def get_node_by_id(self, node_id_list, id_node_dict):
        result = []
        for each_id in node_id_list:
            node = id_node_dict.get(each_id)
            result.append(node)
        return result

    def relation_simple_match(self, relation, question_keywords):
        relation_names = self.extract_names(relation.type())
        for each_name in relation_names:
            if each_name != '' and each_name.lower() in question_keywords:
                return True
        return False

    def relation_cos_similarity(self, relation, question_vec):
        relation_names = self.extract_names(relation.type())
        each_cos_similarity = 0
        for each_name in relation_names:
            try:
                vec = [value for value in self.stackoverflow_word2vec_model.wv[each_name]]
                each_cos_similarity += self.question_answer_similarity(question_vec, vec)
            except KeyError as ke:
                print("KeyError:", ke)
        return each_cos_similarity / len(relation_names)
    def simple_match(self, question_keywords, node_list):
        result = []
        for node in node_list:
            name = self.get_entity_name(node)
            temp_name = name.lower()
            print(name)
            if temp_name != '' and temp_name in question_keywords:
                result.append(node)
        candidate_set = set(node_list)
        result_set = set(result)
        other_set = candidate_set - result_set
        others = list(other_set)
        # print("result nodes: ")
        # for node in result:
        #     name = self.get_entity_name(node)
        #     print(name)
        return result, others

    def result_sort(self, result_dict):
        return sorted(result_dict.items(), key=lambda asd: asd[1], reverse=True)

    def list_sort(self, result_list):
        return sorted(result_list, key=lambda asd: asd[1], reverse=True)

    def get_entity_name(self, node):
        if 'name' in dict(node):
            name = node.get('name')
        elif 'labels_en' in dict(node):
            name = node.get('labels_en')
        else:
            name = ''
        return name

    def extract_names(self, name):
        result = []
        if name.find("_") != -1:
            result.extend(name.split("_"))
        elif name.find(" ") != -1:
            result.extend(name.split(" "))
        else:
            result.append(name)
        return result

    def id_node_dict(self, candidate_answer_set):
        result = {}
        for node in candidate_answer_set.candidate_graph.nodes():
            id = self.graph_operator.get_id_for_node(node)
            result.setdefault(id, node)
        return result

    def id_relation_dict(self, candidate_answer_set):
        result = {}
        for relation in candidate_answer_set.candidate_graph.relationships():
            id = self.graph_operator.get_id_for_node(relation)
            result.setdefault(id, relation)
        return result

    def print_data(self, candidate_answer_set):
        print("candidate_graph data:")
        for node in candidate_answer_set.candidate_graph.nodes():
            print(self.get_entity_name(node))
        print("important_kernel_nodes_subgraph data:")
        for node in candidate_answer_set.important_kernel_nodes_subgraph.nodes():
            print(self.get_entity_name(node))
        print("normal_condition_nodes_subgraph data:")
        for node in candidate_answer_set.normal_condition_nodes_subgraph.nodes():
            print(self.get_entity_name(node))
        print("key_word_path_subgraph data:")
        for node in candidate_answer_set.key_word_path_subgraph.nodes():
            print(self.get_entity_name(node))
        print("important_kernel_nodes_expand_subgraph data:")
        for node in candidate_answer_set.important_kernel_nodes_expand_subgraph.nodes():
            print(self.get_entity_name(node))
        print("normal_condition_node_expand_subgraph data:")
        for node in candidate_answer_set.normal_condition_node_expand_subgraph.nodes():
            print(self.get_entity_name(node))
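# --- Illustrative check (hypothetical, not part of the pipeline) ------------
# AnswerGenerator.calculate_cosine_similarity works on plain Python lists:
# parallel vectors score 1.0 and orthogonal vectors score 0. __new__ is used
# here only to skip the graph/word2vec setup in __init__.
generator = AnswerGenerator.__new__(AnswerGenerator)
assert abs(generator.calculate_cosine_similarity([1, 2, 3], [2, 4, 6]) - 1.0) < 1e-9
assert generator.calculate_cosine_similarity([1, 0], [0, 1]) == 0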
class KnowledgeGraphFeafureModels:
    WORD2VEC_FILE_LIST = {
        "domain entity": "domain_entity.binary.txt",
        "api": "api.binary.txt",
        "wikidata": "wikipedia.binary.txt",
        "sentence": "sentence.binary.txt",
        "graph": "graph_vector.binary.txt",
        "word2vec": "word2vec_api_software_wiki.txt"
    }
    WORDVECTOR_KEY_MAP = {
        "domain entity": "domain_entity_id",
        "api": "api_id",
        "sentence": "sentence_id",
        "wikidata": "wd_kg_id"
    }

    def __init__(self):
        self._api_wv = None
        self._domain_entity_wv = None
        self._wiki_wv = None
        self._sentence_wv = None
        self._graph_wv = None
        self._entity_vector_compute_model = None
        self.NP_VECTOR_NOT_EXIST = None
        self.defaultAccessor = None

    def init(self, vector_dir_path="./model/"):
        time_start = time.time()
        print("start init the model=%d" % time_start)
        client = GraphClient(server_number=4)
        self.defaultAccessor = DefaultGraphAccessor(client)
        self._api_wv = EntityVectorModel.load(
            vector_dir_path + self.WORD2VEC_FILE_LIST["api"], binary=True)
        self._domain_entity_wv = EntityVectorModel.load(
            vector_dir_path + self.WORD2VEC_FILE_LIST["domain entity"], binary=True)
        self._wiki_wv = EntityVectorModel.load(
            vector_dir_path + self.WORD2VEC_FILE_LIST["wikidata"], binary=True)
        self._sentence_wv = EntityVectorModel.load(
            vector_dir_path + self.WORD2VEC_FILE_LIST["sentence"], binary=True)
        self._graph_wv = EntityVectorModel.load(
            vector_dir_path + self.WORD2VEC_FILE_LIST["graph"], binary=True)
        self._entity_vector_compute_model = EntityVectorComputeModel()
        self._entity_vector_compute_model.init_word2vec_model(
            vector_dir_path + self.WORD2VEC_FILE_LIST["word2vec"], binary=True)
        # near-zero placeholder returned when an entity has no vector
        self.NP_VECTOR_NOT_EXIST = np.zeros(128)
        self.NP_VECTOR_NOT_EXIST[1] = 1e-07
        time_end = time.time()
        print("init complete in %d" % (time_end - time_start))

    def get_entity_vec(self, entity_node):
        entity_type = ""
        wv_model = None
        if entity_node.has_label("wikidata"):
            entity_node["wd_kg_id"] = "kg#" + str(self.defaultAccessor.get_id_for_node(entity_node))
            entity_type = "wikidata"
            wv_model = self._wiki_wv
        if entity_node.has_label("sentence"):
            entity_type = "sentence"
            wv_model = self._sentence_wv
        if entity_node.has_label("domain entity"):
            entity_type = "domain entity"
            wv_model = self._domain_entity_wv
        if entity_node.has_label("api"):
            entity_type = "api"
            wv_model = self._api_wv
        if wv_model is None or entity_type == "":
            return self.NP_VECTOR_NOT_EXIST
        vector_key_name = self.WORDVECTOR_KEY_MAP[entity_type]
        word2vec_id_string = str(entity_node[vector_key_name])
        if word2vec_id_string not in wv_model.vocab:
            return self.NP_VECTOR_NOT_EXIST
        return wv_model[word2vec_id_string]

    def get_question_entity_vector(self, question):
        question_vec = self._entity_vector_compute_model.compute_mean_vector(question, need_process=True)
        return question_vec

    def get_question_graph_vector_by_average_all_entities(self, question, entity_graph_vec_list):
        question_graph_vec = np.mean(entity_graph_vec_list, axis=0)
        return question_graph_vec

    def get_question_graph_vector_by_semantic_weight_all_entities(self, question_context_vec,
                                                                  entity_context_vec_list,
                                                                  entity_graph_vec_list):
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(
            question_context_vec, entity_context_vec_list)
        qe_sim_np = qe_sim_np / qe_sim_np.sum()
        question_graph_vec = (qe_sim_np * np.matrix(entity_graph_vec_list)).getA()[0]
        return question_graph_vec

    def get_vectors_for_entity_list(self, entity_list):
        entity_graph_vec_list = []
        entity_vec_list = []
        for entity in entity_list:
            entity_vec_list.append(self.get_entity_vec(entity))
            entity_graph_vec_list.append(self.get_entity_graph_vec(entity))
        return entity_vec_list, entity_graph_vec_list

    def get_entity_graph_vec(self, entity):
        wv_model = self._graph_wv
        word2vec_id_string = str(self.defaultAccessor.get_id_for_node(entity))
        if word2vec_id_string not in wv_model.vocab:
            return self.NP_VECTOR_NOT_EXIST
        return wv_model[word2vec_id_string]
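# --- Illustrative sketch (toy data, not part of the original pipeline) ------
# get_question_graph_vector_by_semantic_weight_all_entities weights each
# entity's graph vector by its normalized context similarity to the question,
# so entities semantically closer to the question dominate the query graph
# vector. The vectors below are made up for illustration.
import numpy as np

entity_context_sims = np.array([0.9, 0.3, 0.1])            # question-to-entity cosine
entity_graph_vecs = np.array([[1.0, 0.0],
                              [0.0, 1.0],
                              [1.0, 1.0]])
weights = entity_context_sims / entity_context_sims.sum()  # normalize to sum to 1
question_graph_vec = weights @ entity_graph_vecs           # weighted average
print(question_graph_vec)                                  # -> [0.7692..., 0.3076...]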
class TestGraphClient(TestCase):
    def setUp(self):
        self.graphClient = DefaultGraphAccessor(GraphClient())

    def test_expand_nodes(self):
        test_case = [(55730, True, 50, 50), (15555, True, 4, 4), (93008, True, 10, 11), (1708, True, 8, 7)]
        for node_id, is_valid, node_num, relation_num in test_case:
            subgraph = self.graphClient.expand_node_for_adjacent_nodes_to_subgraph(node_id)
            if is_valid:
                self.assertIsNotNone(subgraph)
            else:
                self.assertIsNone(subgraph)
                continue
            print(subgraph)
            self.assertEqual(node_num, len(subgraph.nodes()))
            self.assertEqual(relation_num, len(subgraph.relationships()))
            for n in subgraph.nodes():
                print(n)
            for r in subgraph.relationships():
                print(r)
            for r in subgraph.relationships():
                is_contain = self.graphClient.get_id_for_node(r.start_node()) == node_id \
                             or self.graphClient.get_id_for_node(r.end_node()) == node_id
                self.assertTrue(is_contain)
            is_contain = False
            for n in subgraph.nodes():
                is_contain = self.graphClient.get_id_for_node(n) == node_id
                if is_contain:
                    break
            self.assertTrue(is_contain)

    def test_expand_nodes_with_filter_nodes(self):
        graphClient = DefaultGraphAccessor(GraphClient())
        # test_case = [(55730, True, 50, 50), (15555, True, 4, 4), (93008, True, 10, 11), (1708, True, 8, 7)]
        test_case = [
            (55730, True, 50, 50),
        ]
        graphJsonParser = GraphJsonParser()
        for node_id, is_valid, node_num, relation_num in test_case:
            print("test case=", node_id, is_valid, node_num, relation_num)
            subgraph = graphClient.expand_node_for_adjacent_nodes_to_subgraph(node_id)
            subgraph_json = graphJsonParser.parse_subgraph_to_public_json(subgraph)
            print(subgraph_json)
            if is_valid:
                self.assertNotEqual(subgraph_json, {"nodes": [], "relations": []})
            else:
                self.assertEqual(subgraph_json, {"nodes": [], "relations": []})
                continue
            self.assertEqual(node_num, len(subgraph_json["nodes"]))
            self.assertEqual(relation_num, len(subgraph_json["relations"]))
            for n in subgraph_json["nodes"]:
                print(n)
            for r in subgraph_json["relations"]:
                print(r)
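# These tests hit a live graph server, so the expected node/relation counts in
# test_case are environment-specific snapshots. A typical invocation (the
# module path here is an assumption, not taken from the source):
#
#     python -m unittest test_graph_client.TestGraphClient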