def train_model(pro_name, version, first_model_config, second_model_config):
    # Train a compound search model that combines two sub search models.
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    sub_search_model_config = [
        (PathUtil.sim_model(pro_name=pro_name, version=version, model_type=first_model_config[0]),
         first_model_config[1], first_model_config[2], False),
        (PathUtil.sim_model(pro_name=pro_name, version=version, model_type=second_model_config[0]),
         second_model_config[1], second_model_config[2], True),
    ]
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to train compound model for %r" % compound_model_name)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type=compound_model_name)
    model = CompoundSearchModel.train(model_dir_path=model_dir_path,
                                      doc_collection=doc_collection,
                                      sub_search_model_config=sub_search_model_config)
    return model_dir_path
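# A minimal usage sketch for train_model above. Each model config is a
# 3-tuple (model_type, x, y); the second and third elements are forwarded
# verbatim into sub_search_model_config, and their exact meaning depends on
# CompoundSearchModel.train, so the values below are hypothetical.
first_config = ("avg_w2v", 1.0, None)   # hypothetical second/third elements
second_config = ("avg_n2v", 1.0, None)  # hypothetical second/third elements
compound_model_dir = train_model("jdk8", "v3", first_config, second_config)
print("compound model saved under %s" % compound_model_dir)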
def build_jdk_all_graph_and_doc():
    # Build every graph version (v1, v2, v2.1, v3) for JDK8, then build the
    # raw and preprocessed document collections for each released version.
    pro_name = "jdk8"
    build_v1_jdk()
    build_v2_graph_for_pro(pro_name)
    build_v2_1_graph_for_pro(pro_name)
    build_v3_graph_for_pro(pro_name)
    version_list = ["v1", "v2", "v3"]
    # build_extra_model_and_doc(pro_name, version_list)
    for version in version_list:
        build_doc(pro_name, version)
        build_pre_doc(pro_name, version, CodeDocPreprocessor())
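# Script entry point sketch: run the whole JDK pipeline (graph construction
# plus document preprocessing) in one go.
if __name__ == "__main__":
    build_jdk_all_graph_and_doc()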
def build_doc(pro_name, version):
    # Preprocess the raw document collection for one project version and save it.
    input_doc_collection_path = PathUtil.doc(pro_name=pro_name, version=version)
    output_pre_doc_collection_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way="code-pre")
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=doc_collection)
    preprocess_doc_collection.save(output_pre_doc_collection_path)
def build_pre_doc(self, input_doc_collection_path, output_pre_doc_collection_path, preprocessor=None):
    if preprocessor is None:
        preprocessor = CodeDocPreprocessor()
    print("start preprocess doc - for %s %r" % (input_doc_collection_path, preprocessor))
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=preprocessor, doc_collection=doc_collection)
    preprocess_doc_collection.save(output_pre_doc_collection_path)
    print("end preprocess doc - %r %r" % (output_pre_doc_collection_path, preprocessor))
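# Usage sketch, assuming `builder` is an instance of the (not shown) class
# that owns build_pre_doc, and that the input/output paths are resolved via
# PathUtil as in the surrounding functions.
builder.build_pre_doc(
    input_doc_collection_path=PathUtil.doc(pro_name="jdk8", version="v1"),
    output_pre_doc_collection_path=PathUtil.pre_doc(pro_name="jdk8", version="v1", pre_way="code-pre"),
    preprocessor=None,  # None falls back to CodeDocPreprocessor()
)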
def train_avg_w2v_model(pro_name, version):
    # Build (and cache) the preprocessed document collection, then train an
    # average-word2vec similarity model on it.
    doc_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(doc_path))
    processor = CodeDocPreprocessor()
    pre_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    pre_doc_path = PathUtil.pre_doc(pro_name, version, pre_way="code-pre")
    pre_doc_collection.save(pre_doc_path)
    word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=pre_doc_collection)
    return word2vec_model_path
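# Usage sketch: train the avg_w2v model for one project version. The returned
# directory is the same one the compound model above resolves via
# PathUtil.sim_model(..., model_type="avg_w2v").
w2v_model_dir = train_avg_w2v_model(pro_name="jdk8", version="v3")
print("avg_w2v model saved under %s" % w2v_model_dir)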
def build_extra_model_and_doc(pro_name, version_list):
    # For every version: build documents, preprocess them with each
    # preprocessor, train the name searcher, then train the avg_w2v model.
    for version in version_list:
        preprocessors = [CodeDocPreprocessor()]
        pre_way = "code-pre"
        build_doc(pro_name=pro_name, version=version)
        for preprocessor in preprocessors:
            build_pre_doc(pro_name=pro_name, version=version, preprocessor=preprocessor)
        train_name_searcher(pro_name=pro_name, version=version)
        pre_doc_collection_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way=pre_way)
        preprocess_doc_collection: PreprocessMultiFieldDocumentCollection = PreprocessMultiFieldDocumentCollection.load(
            pre_doc_collection_path)
        word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_w2v")
        AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=preprocess_doc_collection)
def build_v3_graph_from_cache_simple(self, pro_name, input_graph_data_path, word2vec_model_path,
                                     output_graph_data_path, generic_title_search_cache_path,
                                     generic_wikidata_item_cache_path, fusion_temp_result_dir):
    # Fuse cached wikidata knowledge into the graph, train a w2v model over
    # the graph's documents to drive the fusion, and save the fused graph.
    print("start adding wikidata knowledge for %s" % pro_name)
    fusion = GenericKGFusion()
    fusion.init_graph_data(input_graph_data_path)
    fusion.init_wd_from_cache(title_save_path=generic_title_search_cache_path,
                              item_save_path=generic_wikidata_item_cache_path)
    fusion.add_all_wiki_nodes()
    builder = GraphNodeDocumentBuilder(graph_data=fusion.graph_data)
    doc_collection = builder.build_doc_for_kg()
    preprocess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=doc_collection)
    AVGW2VFLModel.train(model_dir_path=word2vec_model_path, doc_collection=preprocess_doc_collection)
    fusion.load_w2v_model(word2vec_model_path)
    record = fusion.simple_fuse()
    fusion_temp_result_dir = Path(fusion_temp_result_dir)
    with (fusion_temp_result_dir / "record.json").open("w", encoding="utf-8") as f:
        json.dump(record, f, indent=4)
    fusion.graph_data.add_label_to_all(pro_name)
    fusion.save(output_graph_data_path)
    print("end adding wikidata knowledge for %s" % pro_name)
    return fusion.graph_data
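# Usage sketch for the wikidata fusion step, assuming `builder` is an instance
# of the (not shown) class that owns build_v3_graph_from_cache_simple. All
# paths below are hypothetical placeholders; the real pipeline derives them
# from PathUtil / the project's data directory.
graph_data = builder.build_v3_graph_from_cache_simple(
    pro_name="jdk8",
    input_graph_data_path="data/jdk8/v2.1.graph",            # hypothetical
    word2vec_model_path="output/jdk8/v3/avg_w2v",            # hypothetical
    output_graph_data_path="data/jdk8/v3.graph",             # hypothetical
    generic_title_search_cache_path="cache/wd_titles.json",  # hypothetical
    generic_wikidata_item_cache_path="cache/wd_items.json",  # hypothetical
    fusion_temp_result_dir="output/jdk8/v3/fusion_temp",     # hypothetical
)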
def train_model(pro_name, version, weight):
    # Train an average-node2vec similarity model backed by the knowledge
    # graph and a pretrained node2vec embedding.
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    embedding_size = 100
    kg_name_searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")
    model = AVGNode2VectorModel.train(model_dir_path=model_dir_path,
                                      doc_collection=doc_collection,
                                      embedding_size=embedding_size,
                                      pretrain_node2vec_path=pretrain_node2vec_path,
                                      graph_data_path=graph_data_path,
                                      kg_name_searcher_path=kg_name_searcher_path)
    return model_dir_path
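# Usage sketch: `weight` only selects which pretrained node2vec file
# PathUtil.node2vec resolves to; its valid values are project-specific, so
# the tag below is hypothetical.
n2v_model_dir = train_model(pro_name="jdk8", version="v3", weight="unweighted")  # hypothetical weight tag
print("avg_n2v model saved under %s" % n2v_model_dir)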
from pathlib import Path

from gensim.corpora import Dictionary

# Project-internal helpers (EntityReader, ConceptElementNameUtil,
# CodeDocPreprocessor, PreprocessMultiFieldDocumentCollection, and the
# module-level `domain_dir`) are defined/imported elsewhere in this module.


class ReduceDomainTerm:
    """Filter noisy domain terms using relation, linkage, name, and alias statistics."""

    def __init__(self, term_save_path, operation_save_path, term_relation_save_path,
                 linkage_save_path, aliase_save_path, pre_doc_collection_out_path):
        self.terms = EntityReader.read_line_data(term_save_path)
        # self.operations = EntityReader.read_line_data(operation_save_path)
        self.relations = EntityReader.read_json_data(term_relation_save_path)
        self.linkages = EntityReader.read_json_data(linkage_save_path)
        self.aliases_map = EntityReader.read_json_data(aliase_save_path)
        self.pre_doc_collection_out_path = pre_doc_collection_out_path
        self.uncamel_util = ConceptElementNameUtil()
        self.code_pre = CodeDocPreprocessor()
        self.all_words = {}
        self.uncamel_map = {}
        self.start_relation = {}
        self.end_relation = {}
        self.all_relation = {}
        self.start_record_for_linkage = {}
        self.end_record_for_linkage = {}
        self.start_with_r_record = {}
        self.end_with_r_record = {}
        self.mention_time = {}
        self.operation_time = {}
        self.represent_time = {}
        self.instance_of_time = {}
        self.end_related_relation_num = {}
        self.sum_mention_time = {}
        self.init_cal()
        self.count_all_word()

    def init_cal(self):
        # Count how often each term occurs as the start/end of a term-term relation.
        for start, r, end in self.relations:
            self.start_relation[start] = self.start_relation.get(start, 0) + 1
            self.end_relation[end] = self.end_relation.get(end, 0) + 1
            self.all_relation[start] = self.all_relation.get(start, 0) + 1
            self.all_relation[end] = self.all_relation.get(end, 0) + 1
        # Count linkage occurrences, overall and per relation type.
        for start, r, end in self.linkages:
            start = str(start)
            end = str(end)
            self.start_record_for_linkage[start] = self.start_record_for_linkage.get(start, 0) + 1
            self.end_record_for_linkage[end] = self.end_record_for_linkage.get(end, 0) + 1
            self.start_with_r_record[start + "_" + r] = self.start_with_r_record.get(start + "_" + r, 0) + 1
            self.end_with_r_record[end + "_" + r] = self.end_with_r_record.get(end + "_" + r, 0) + 1
            if r.startswith("mention"):
                self.mention_time[end] = self.mention_time.get(end, 0) + 1
            if r.startswith("operation"):
                self.operation_time[end] = self.operation_time.get(end, 0) + 1
            if r.startswith("instance of"):
                self.instance_of_time[end] = self.instance_of_time.get(end, 0) + 1
            if r.startswith("represent"):
                self.represent_time[end] = self.represent_time.get(end, 0) + 1
            self.end_related_relation_num[end] = self.end_related_relation_num.get(end, 0) + 1
        # For each mentioned term, sum the mention counts of every term whose
        # word set contains all of this term's words.
        for term, num in self.mention_time.items():
            term_words = set(term.lower().split())
            term_word_num = len(term_words)
            for other_term, other_num in self.mention_time.items():
                if len(set(other_term.lower().split()) & term_words) == term_word_num:
                    self.sum_mention_time[term] = self.mention_time.get(other_term) + self.sum_mention_time.get(term, 0)
        print("init cal finished!")

    def count_all_word(self):
        # Split every term into cleaned (uncamelized) words and count word frequencies.
        for item in self.terms:
            uncamel_str_list = self.code_pre.clean(item)
            self.uncamel_map[item] = uncamel_str_list
            for word in uncamel_str_list:
                self.all_words[word] = self.all_words.get(word, 0) + 1
        print("init count_all_word finished!")
        # print(self.uncamel_map)

    def two_hop_delete(self, threshold=2):
        # Candidate terms are those whose relations never touch a linkage;
        # of these, remove the ones with fewer than `threshold` relations.
        need_remove = set()
        for start, r, end in self.relations:
            if (start not in self.start_record_for_linkage
                    and start not in self.end_record_for_linkage
                    and end not in self.start_record_for_linkage
                    and end not in self.end_record_for_linkage):
                need_remove.add(start)
                need_remove.add(end)
            else:
                if start in need_remove:
                    need_remove.remove(start)
                if end in need_remove:
                    need_remove.remove(end)
        need_remove = [(key, self.all_relation[key]) for key in need_remove]
        need_remove = sorted(need_remove, key=lambda x: x[1], reverse=True)
        move = [key for key, num in need_remove if num < threshold]
        return move

    def delete_based_on_name(self, sim_threshold=0.5, tf_threshold=3, mention_threshold=2):
        # Remove terms whose words rarely co-occur with other terms' words,
        # unless the linkage statistics suggest the term is actually in use.
        move_sim = []
        move_term = set()
        for term in self.terms:
            uncamel_name_list = self.uncamel_map.get(
                term, self.uncamel_util.uncamelize_by_stemming(term).split(" "))
            sim = self.cal_sim(uncamel_name_list, self.all_words)
            move_sim.append((term, sim))
            if not uncamel_name_list:
                move_term.add(term)
        move_sim = sorted(move_sim, key=lambda x: x[1])
        move_sim = [item[0] for item in move_sim if item[1] < sim_threshold]
        for item in move_sim:
            if item not in self.end_record_for_linkage and item not in self.start_record_for_linkage:
                tf = self.all_relation.get(item, 0)
                if tf <= tf_threshold:
                    move_term.add(item)
            else:
                if item in self.represent_time or item in self.operation_time or item in self.instance_of_time:
                    continue
                if item in self.mention_time and self.mention_time[item] <= mention_threshold:
                    move_term.add(item)
        return list(move_term)

    def cal_sim(self, name_list, all_words):
        # Fraction of the term's words that occur more than once across all terms.
        if not name_list:
            return 0
        same_count = 0
        for item in name_list:
            if item in all_words and all_words[item] > 1:
                same_count += 1
        return float(same_count) / float(len(name_list))

    def delete_based_on_aliase_tf(self, sim_threshold=0.7):
        # Score each term by the average corpus frequency of the words in its
        # aliases; low-scoring terms are candidates for removal.
        preprocess_doc_collection: PreprocessMultiFieldDocumentCollection = PreprocessMultiFieldDocumentCollection.load(
            self.pre_doc_collection_out_path)
        preprocess_multi_field_doc_list = preprocess_doc_collection.get_all_preprocess_document_list()
        corpus_clean_text = []
        for docno, multi_field_doc in enumerate(preprocess_multi_field_doc_list):
            corpus_clean_text.append(multi_field_doc.get_document_text_words())
        dictionary = Dictionary(corpus_clean_text)
        alise_tf_map = {}
        alise_score_map = {}
        for item in self.terms:
            current_alise = self.aliases_map.get(item, "")
            current_alise = [x.lower() for x in current_alise]
            current_alise = set(current_alise)
            current_alise.add(item.lower())
            code_pre_set = set()
            for alise in current_alise:
                code_pre_set.update(set(self.code_pre.clean(alise)))
            word_tf = []
            tf_sum = 0
            for word in code_pre_set:
                if word in dictionary.token2id:
                    tf_value = dictionary.cfs[dictionary.token2id[word]]
                    tf_sum += tf_value
                    word_tf.append((word, tf_value))
                else:
                    word_tf.append((word, 0))
            alise_tf_map[item] = word_tf
            if tf_sum == 0:
                alise_score_map[item] = 0
            else:
                alise_score_map[item] = float(tf_sum) / float(len(code_pre_set))
        move_item = [key for key in alise_tf_map if alise_score_map[key] < sim_threshold]
        return move_item

    def delete_based_on_name_length(self, length_threshold=30, number_threshold=3):
        # Overly long, many-word names are usually extraction noise.
        move_item = []
        for item in self.terms:
            if len(item) > length_threshold and len(item.split(" ")) > number_threshold:
                move_item.append(item)
        return move_item

    def save(self):
        EntityReader.write_json_data(str(Path(domain_dir) / "start_record.json"),
                                     self.start_record_for_linkage)
        EntityReader.write_json_data(str(Path(domain_dir) / "start_record_relation.json"),
                                     self.start_with_r_record)
        EntityReader.write_line_data(str(Path(domain_dir) / "start_record_relation.txt"),
                                     [k + ":" + str(v) for k, v in self.start_with_r_record.items()])
        EntityReader.write_json_data(str(Path(domain_dir) / "end_record.json"),
                                     self.end_record_for_linkage)
        EntityReader.write_json_data(str(Path(domain_dir) / "end_record_relation.json"),
                                     self.end_with_r_record)
        EntityReader.write_line_data(str(Path(domain_dir) / "end_record_relation.txt"),
                                     [k + ":" + str(v) for k, v in self.end_with_r_record.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "mention_num.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.mention_time.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "sum_mention_time.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.sum_mention_time.items()])
        EntityReader.write_line_data(str(Path(domain_dir) / "end_related_relation_num.txt"),
                                     [str(v) + ":" + str(k) for k, v in self.end_related_relation_num.items()])
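# Usage sketch for ReduceDomainTerm: build the reducer from the saved term
# data, union the individual deletion heuristics into one removal set, and
# dump the linkage statistics. All file names under `domain_dir` are
# hypothetical placeholders.
reducer = ReduceDomainTerm(
    term_save_path=str(Path(domain_dir) / "terms.txt"),                        # hypothetical
    operation_save_path=str(Path(domain_dir) / "operations.txt"),              # hypothetical
    term_relation_save_path=str(Path(domain_dir) / "term_relations.json"),     # hypothetical
    linkage_save_path=str(Path(domain_dir) / "linkages.json"),                 # hypothetical
    aliase_save_path=str(Path(domain_dir) / "aliases.json"),                   # hypothetical
    pre_doc_collection_out_path=str(Path(domain_dir) / "pre_doc.collection"),  # hypothetical
)
terms_to_remove = set(reducer.two_hop_delete(threshold=2))
terms_to_remove.update(reducer.delete_based_on_name(sim_threshold=0.5))
terms_to_remove.update(reducer.delete_based_on_aliase_tf(sim_threshold=0.7))
terms_to_remove.update(reducer.delete_based_on_name_length())
reducer.save()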