def train_model(pro_name, version, first_model_config, second_model_config):
    # Train a compound search model that combines two sub search models.
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    sub_search_model_config = [
        (PathUtil.sim_model(pro_name=pro_name, version=version, model_type=first_model_config[0]),
         first_model_config[1], first_model_config[2], False),
        (PathUtil.sim_model(pro_name=pro_name, version=version, model_type=second_model_config[0]),
         second_model_config[1], second_model_config[2], True),
    ]
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to train compound model for %r" % compound_model_name)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type=compound_model_name)
    model = CompoundSearchModel.train(model_dir_path=model_dir_path,
                                      doc_collection=doc_collection,
                                      sub_search_model_config=sub_search_model_config)
    return model_dir_path
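# A minimal usage sketch for train_model above. Each model config is a
# 3-tuple (model_type, x, y); the second and third elements are forwarded
# verbatim into sub_search_model_config, and their exact meaning depends on
# CompoundSearchModel.train, so the values below are hypothetical.
first_config = ("avg_w2v", 1.0, None)   # hypothetical second/third elements
second_config = ("avg_n2v", 1.0, None)  # hypothetical second/third elements
compound_model_dir = train_model("jdk8", "v3", first_config, second_config)
print("compound model saved under %s" % compound_model_dir)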
def build_jdk_all_graph_and_doc():
    # Build every graph version (v1, v2, v2.1, v3) for JDK8, then build the
    # raw and preprocessed document collections for each released version.
    pro_name = "jdk8"
    build_v1_jdk()
    build_v2_graph_for_pro(pro_name)
    build_v2_1_graph_for_pro(pro_name)
    build_v3_graph_for_pro(pro_name)
    version_list = ["v1", "v2", "v3"]
    # build_extra_model_and_doc(pro_name, version_list)
    for version in version_list:
        build_doc(pro_name, version)
        build_pre_doc(pro_name, version, CodeDocPreprocessor())
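# Script entry point sketch: run the whole JDK pipeline (graph construction
# plus document preprocessing) in one go.
if __name__ == "__main__":
    build_jdk_all_graph_and_doc()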
def build_doc(pro_name, version):
    # Preprocess the raw document collection for one project version and save it.
    input_doc_collection_path = PathUtil.doc(pro_name=pro_name, version=version)
    output_pre_doc_collection_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way="code-pre")
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=doc_collection)
    preprocess_doc_collection.save(output_pre_doc_collection_path)
def build_pre_doc(self, input_doc_collection_path, output_pre_doc_collection_path, preprocessor=None):
    if preprocessor is None:
        preprocessor = CodeDocPreprocessor()
    print("start preprocess doc - for %s %r" % (input_doc_collection_path, preprocessor))
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=preprocessor, doc_collection=doc_collection)
    preprocess_doc_collection.save(output_pre_doc_collection_path)
    print("end preprocess doc - %r %r" % (output_pre_doc_collection_path, preprocessor))
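# Usage sketch, assuming `builder` is an instance of the (not shown) class
# that owns build_pre_doc, and that the input/output paths are resolved via
# PathUtil as in the surrounding functions.
builder.build_pre_doc(
    input_doc_collection_path=PathUtil.doc(pro_name="jdk8", version="v1"),
    output_pre_doc_collection_path=PathUtil.pre_doc(pro_name="jdk8", version="v1", pre_way="code-pre"),
    preprocessor=None,  # None falls back to CodeDocPreprocessor()
)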
def train_avg_w2v_model(pro_name, version):
    # Build (and cache) the preprocessed document collection, then train an
    # average-word2vec similarity model on it.
    doc_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(doc_path))
    processor = CodeDocPreprocessor()
    pre_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    pre_doc_path = PathUtil.pre_doc(pro_name, version, pre_way="code-pre")
    pre_doc_collection.save(pre_doc_path)
    word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=pre_doc_collection)
    return word2vec_model_path
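# Usage sketch: train the avg_w2v model for one project version. The returned
# directory is the same one the compound model above resolves via
# PathUtil.sim_model(..., model_type="avg_w2v").
w2v_model_dir = train_avg_w2v_model(pro_name="jdk8", version="v3")
print("avg_w2v model saved under %s" % w2v_model_dir)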
def build_extra_model_and_doc(pro_name, version_list):
    # For every version: build documents, preprocess them with each
    # preprocessor, train the name searcher, then train the avg_w2v model.
    for version in version_list:
        preprocessors = [CodeDocPreprocessor()]
        pre_way = "code-pre"
        build_doc(pro_name=pro_name, version=version)
        for preprocessor in preprocessors:
            build_pre_doc(pro_name=pro_name, version=version, preprocessor=preprocessor)
        train_name_searcher(pro_name=pro_name, version=version)
        pre_doc_collection_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way=pre_way)
        preprocess_doc_collection: PreprocessMultiFieldDocumentCollection = PreprocessMultiFieldDocumentCollection.load(
            pre_doc_collection_path)
        word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_w2v")
        AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=preprocess_doc_collection)
def build_v3_graph_from_cache_simple(self, pro_name, input_graph_data_path, word2vec_model_path,
                                     output_graph_data_path, generic_title_search_cache_path,
                                     generic_wikidata_item_cache_path, fusion_temp_result_dir):
    # Fuse cached wikidata knowledge into the graph, train a w2v model over
    # the graph's documents to drive the fusion, and save the fused graph.
    print("start adding wikidata knowledge for %s" % pro_name)
    fusion = GenericKGFusion()
    fusion.init_graph_data(input_graph_data_path)
    fusion.init_wd_from_cache(title_save_path=generic_title_search_cache_path,
                              item_save_path=generic_wikidata_item_cache_path)
    fusion.add_all_wiki_nodes()
    builder = GraphNodeDocumentBuilder(graph_data=fusion.graph_data)
    doc_collection = builder.build_doc_for_kg()
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=doc_collection)
    AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=preprocess_doc_collection)
    fusion.load_w2v_model(word2vec_model_path)
    record = fusion.simple_fuse()
    fusion_temp_result_dir = Path(fusion_temp_result_dir)
    with (fusion_temp_result_dir / "record.json").open("w", encoding="utf-8") as f:
        json.dump(record, f, indent=4)
    fusion.graph_data.add_label_to_all(pro_name)
    fusion.save(output_graph_data_path)
    print("end adding wikidata knowledge for %s" % pro_name)
    return fusion.graph_data
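# Usage sketch for the wikidata fusion step, assuming `builder` is an instance
# of the (not shown) class that owns build_v3_graph_from_cache_simple. All
# paths below are hypothetical placeholders; the real pipeline derives them
# from PathUtil / the project's data directory.
graph_data = builder.build_v3_graph_from_cache_simple(
    pro_name="jdk8",
    input_graph_data_path="data/jdk8/v2.1.graph",            # hypothetical
    word2vec_model_path="output/jdk8/v3/avg_w2v",            # hypothetical
    output_graph_data_path="data/jdk8/v3.graph",             # hypothetical
    generic_title_search_cache_path="cache/wd_titles.json",  # hypothetical
    generic_wikidata_item_cache_path="cache/wd_items.json",  # hypothetical
    fusion_temp_result_dir="output/jdk8/v3/fusion_temp",     # hypothetical
)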
def train_model(pro_name, version, weight):
    # Train an average-node2vec similarity model backed by the knowledge
    # graph and a pretrained node2vec embedding.
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    embedding_size = 100
    kg_name_searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")
    model = AVGNode2VectorModel.train(model_dir_path=model_dir_path,
                                      doc_collection=doc_collection,
                                      embedding_size=embedding_size,
                                      pretrain_node2vec_path=pretrain_node2vec_path,
                                      graph_data_path=graph_data_path,
                                      kg_name_searcher_path=kg_name_searcher_path)
    return model_dir_path
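# Usage sketch: `weight` only selects which pretrained node2vec file
# PathUtil.node2vec resolves to; its valid values are project-specific, so
# the tag below is hypothetical.
n2v_model_dir = train_model(pro_name="jdk8", version="v3", weight="unweighted")  # hypothetical weight tag
print("avg_n2v model saved under %s" % n2v_model_dir)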
from pathlib import Path

from gensim.corpora import Dictionary

# Project-internal helpers (EntityReader, ConceptElementNameUtil,
# CodeDocPreprocessor, PreprocessMultiFieldDocumentCollection, and the
# module-level `domain_dir`) are defined/imported elsewhere in this module.


class ReduceDomainTerm:
    """Filter noisy domain terms using relation, linkage, name, and alias statistics."""

    def __init__(self, term_save_path, operation_save_path, term_relation_save_path,
                 linkage_save_path, aliase_save_path, pre_doc_collection_out_path):
        self.terms = EntityReader.read_line_data(term_save_path)
        # self.operations = EntityReader.read_line_data(operation_save_path)
        self.relations = EntityReader.read_json_data(term_relation_save_path)
        self.linkages = EntityReader.read_json_data(linkage_save_path)
        self.aliases_map = EntityReader.read_json_data(aliase_save_path)
        self.pre_doc_collection_out_path = pre_doc_collection_out_path
        self.uncamel_util = ConceptElementNameUtil()
        self.code_pre = CodeDocPreprocessor()
        self.all_words = {}
        self.uncamel_map = {}
        self.start_relation = {}
        self.end_relation = {}
        self.all_relation = {}
        self.start_record_for_linkage = {}
        self.end_record_for_linkage = {}
        self.start_with_r_record = {}
        self.end_with_r_record = {}
        self.mention_time = {}
        self.operation_time = {}
        self.represent_time = {}
        self.instance_of_time = {}
        self.end_related_relation_num = {}
        self.sum_mention_time = {}
        self.init_cal()
        self.count_all_word()

    def init_cal(self):
        # Count how often each term occurs as the start/end of a term-term relation.
        for start, r, end in self.relations:
            self.start_relation[start] = self.start_relation.get(start, 0) + 1
            self.end_relation[end] = self.end_relation.get(end, 0) + 1
            self.all_relation[start] = self.all_relation.get(start, 0) + 1
            self.all_relation[end] = self.all_relation.get(end, 0) + 1
        # Count linkage occurrences, overall and per relation type.
        for start, r, end in self.linkages:
            start = str(start)
            end = str(end)
            self.start_record_for_linkage[start] = self.start_record_for_linkage.get(start, 0) + 1
            self.end_record_for_linkage[end] = self.end_record_for_linkage.get(end, 0) + 1
            self.start_with_r_record[start + "_" + r] = self.start_with_r_record.get(start + "_" + r, 0) + 1
            self.end_with_r_record[end + "_" + r] = self.end_with_r_record.get(end + "_" + r, 0) + 1
            if r.startswith("mention"):
                self.mention_time[end] = self.mention_time.get(end, 0) + 1
            if r.startswith("operation"):
                self.operation_time[end] = self.operation_time.get(end, 0) + 1
            if r.startswith("instance of"):
                self.instance_of_time[end] = self.instance_of_time.get(end, 0) + 1
            if r.startswith("represent"):
                self.represent_time[end] = self.represent_time.get(end, 0) + 1
            self.end_related_relation_num[end] = self.end_related_relation_num.get(end, 0) + 1
        # For each mentioned term, sum the mention counts of every term whose
        # word set contains all of this term's words.
        for term, num in self.mention_time.items():
            term_words = set(term.lower().split())
            term_word_num = len(term_words)
            for other_term, other_num in self.mention_time.items():
                if len(set(other_term.lower().split()) & term_words) == term_word_num:
                    self.sum_mention_time[term] = self.mention_time.get(other_term) + self.sum_mention_time.get(term, 0)
        print("init cal finished!")

    def count_all_word(self):
        # Split every term into cleaned (uncamelized) words and count word frequencies.
        for item in self.terms:
            uncamel_str_list = self.code_pre.clean(item)
            self.uncamel_map[item] = uncamel_str_list
            for word in uncamel_str_list:
                self.all_words[word] = self.all_words.get(word, 0) + 1
        print("init count_all_word finished!")
        # print(self.uncamel_map)

    def two_hop_delete(self, threshold=2):
        # Candidate terms are those whose relations never touch a linkage;
        # of these, remove the ones with fewer than `threshold` relations.
        need_remove = set()
        for start, r, end in self.relations:
            if (start not in self.start_record_for_linkage
                    and start not in self.end_record_for_linkage
                    and end not in self.start_record_for_linkage
                    and end not in self.end_record_for_linkage):
                need_remove.add(start)
                need_remove.add(end)
            else:
                if start in need_remove:
                    need_remove.remove(start)
                if end in need_remove:
                    need_remove.remove(end)
        need_remove = [(key, self.all_relation[key]) for key in need_remove]
        need_remove = sorted(need_remove, key=lambda x: x[1], reverse=True)
        move = [key for key, num in need_remove if num < threshold]
        return move

    def delete_based_on_name(self, sim_threshold=0.5, tf_threshold=3, mention_threshold=2):
        # Remove terms whose words rarely co-occur with other terms' words,
        # unless the linkage statistics suggest the term is actually in use.
        move_sim = []
        move_term = set()
        for term in self.terms:
            uncamel_name_list = self.uncamel_map.get(
                term, self.uncamel_util.uncamelize_by_stemming(term).split(" "))
            sim = self.cal_sim(uncamel_name_list, self.all_words)
            move_sim.append((term, sim))
            if not uncamel_name_list:
                move_term.add(term)
        move_sim = sorted(move_sim, key=lambda x: x[1])
        move_sim = [item[0] for item in move_sim if item[1] < sim_threshold]
        for item in move_sim:
            if item not in self.end_record_for_linkage and item not in self.start_record_for_linkage:
                tf = self.all_relation.get(item, 0)
                if tf <= tf_threshold:
                    move_term.add(item)
            else:
                if item in self.represent_time or item in self.operation_time or item in self.instance_of_time:
                    continue
                if item in self.mention_time and self.mention_time[item] <= mention_threshold:
                    move_term.add(item)
        return list(move_term)

    def cal_sim(self, name_list, all_words):
        # Fraction of the term's words that occur more than once across all terms.
        if not name_list:
            return 0
        same_count = 0
        for item in name_list:
            if item in all_words and all_words[item] > 1:
                same_count += 1
        return float(same_count) / float(len(name_list))

    def delete_based_on_aliase_tf(self, sim_threshold=0.7):
        # Score each term by the average corpus frequency of the words in its
        # aliases; low-scoring terms are candidates for removal.
        preprocess_doc_collection: PreprocessMultiFieldDocumentCollection = PreprocessMultiFieldDocumentCollection.load(
            self.pre_doc_collection_out_path)
        preprocess_multi_field_doc_list = preprocess_doc_collection.get_all_preprocess_document_list()
        corpus_clean_text = []
        for docno, multi_field_doc in enumerate(preprocess_multi_field_doc_list):
            corpus_clean_text.append(multi_field_doc.get_document_text_words())
        dictionary = Dictionary(corpus_clean_text)
        alise_tf_map = {}
        alise_score_map = {}
        for item in self.terms:
            current_alise = self.aliases_map.get(item, "")
            current_alise = [x.lower() for x in current_alise]
            current_alise = set(current_alise)
            current_alise.add(item.lower())
            code_pre_set = set()
            for alise in current_alise:
                code_pre_set.update(set(self.code_pre.clean(alise)))
            word_tf = []
            tf_sum = 0
            for word in code_pre_set:
                if word in dictionary.token2id:
                    tf_value = dictionary.cfs[dictionary.token2id[word]]
                    tf_sum += tf_value
                    word_tf.append((word, tf_value))
                else:
                    word_tf.append((word, 0))
            alise_tf_map[item] = word_tf
            if tf_sum == 0:
                alise_score_map[item] = 0
            else:
                alise_score_map[item] = float(tf_sum) / float(len(code_pre_set))
        move_item = [key for key in alise_tf_map if alise_score_map[key] < sim_threshold]
        return move_item

    def delete_based_on_name_length(self, length_threshold=30, number_threshold=3):
        # Overly long, many-word names are usually extraction noise.
        move_item = []
        for item in self.terms:
            if len(item) > length_threshold and len(item.split(" ")) > number_threshold:
                move_item.append(item)
        return move_item

    def save(self):
        EntityReader.write_json_data(str(Path(domain_dir) / "start_record.json"),
                                     self.start_record_for_linkage)
        EntityReader.write_json_data(str(Path(domain_dir) / "start_record_relation.json"),
                                     self.start_with_r_record)
        EntityReader.write_line_data(str(Path(domain_dir) / "start_record_relation.txt"),
                                     [k + ":" + str(v) for k, v in self.start_with_r_record.items()])
        EntityReader.write_json_data(str(Path(domain_dir) / "end_record.json"),
                                     self.end_record_for_linkage)
        EntityReader.write_json_data(str(Path(domain_dir) / "end_record_relation.json"),
                                     self.end_with_r_record)
        EntityReader.write_line_data(str(Path(domain_dir) / "end_record_relation.txt"),
                                     [k + ":" + str(v) for k, v in self.end_with_r_record.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "mention_num.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.mention_time.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "sum_mention_time.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.sum_mention_time.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "end_related_relation_num.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.end_related_relation_num.items()])
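# Usage sketch for ReduceDomainTerm: build the reducer from the saved term
# data, union the individual deletion heuristics into one removal set, and
# dump the linkage statistics. All file names under `domain_dir` are
# hypothetical placeholders.
reducer = ReduceDomainTerm(
    term_save_path=str(Path(domain_dir) / "terms.txt"),                        # hypothetical
    operation_save_path=str(Path(domain_dir) / "operations.txt"),              # hypothetical
    term_relation_save_path=str(Path(domain_dir) / "term_relations.json"),     # hypothetical
    linkage_save_path=str(Path(domain_dir) / "linkages.json"),                 # hypothetical
    aliase_save_path=str(Path(domain_dir) / "aliases.json"),                   # hypothetical
    pre_doc_collection_out_path=str(Path(domain_dir) / "pre_doc.collection"),  # hypothetical
)
terms_to_remove = set(reducer.two_hop_delete(threshold=2))
terms_to_remove.update(reducer.delete_based_on_name(sim_threshold=0.5))
terms_to_remove.update(reducer.delete_based_on_aliase_tf(sim_threshold=0.7))
terms_to_remove.update(reducer.delete_based_on_name_length())
reducer.save()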