示例#1
0
def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a compound search model made of two sub-models.

    Both ``first_model_config`` and ``second_model_config`` are indexed at
    positions 0, 1 and 2.  Position 0 is the model type name used to resolve
    the sub-model directory through ``PathUtil.sim_model``; positions 1 and 2
    are copied unchanged into the sub-model entry.  The entry for the first
    config ends with ``False`` and the entry for the second ends with ``True``.

    :param pro_name: project name used for every path lookup.
    :param version: version used for every path lookup.
    :param first_model_config: indexable config of the base sub-model.
    :param second_model_config: indexable config of the extra sub-model.
    :return: the directory the compound model was trained into.
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    def sub_model_entry(config, flag):
        # One (directory, config[1], config[2], flag) tuple per sub-model.
        sub_model_dir = PathUtil.sim_model(pro_name=pro_name,
                                           version=version,
                                           model_type=config[0])
        return (sub_model_dir, config[1], config[2], flag)

    sub_search_model_config = [
        sub_model_entry(first_model_config, False),
        sub_model_entry(second_model_config, True),
    ]

    # The compound model's type name is derived from both sub-model type names.
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type=compound_model_name)

    # The trained model object is not needed here; only its directory is returned.
    CompoundSearchModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        sub_search_model_config=sub_search_model_config)
    return model_dir_path
 def create_search_model(pro_name, version, model_dir):
     """Rewrite the stored sub-model config with current paths, then load the model.

     Reads the pickled list in ``model_dir / "submodel.config"``, replaces
     element 0 of its first two entries with the directories resolved by
     ``PathUtil.sim_model`` for "avg_w2v" and "svm" (elements 1-3 are kept),
     writes the list back to the same file and finally loads the compound
     model from ``model_dir``.

     NOTE(review): the config file is unpickled, so it must come from a
     trusted source.

     :param pro_name: project name used for the path lookups.
     :param version: version used for the path lookups.
     :param model_dir: compound model directory; must support the ``/`` operator.
     :return: the object returned by ``CompoundSearchModel.load(model_dir)``.
     """
     config_file = model_dir / "submodel.config"
     with open(config_file, 'rb') as source:
         stored_config = pickle.load(source)

     avg_w2v_dir = PathUtil.sim_model(pro_name, version, "avg_w2v")
     svm_dir = PathUtil.sim_model(pro_name, version, "svm")

     # Only the directory slot changes; the remaining three slots are copied.
     refreshed_config = [
         (sub_model_dir, entry[1], entry[2], entry[3])
         for sub_model_dir, entry in (
             (avg_w2v_dir, stored_config[0]),
             (svm_dir, stored_config[1]),
         )
     ]

     with open(config_file, 'wb') as target:
         pickle.dump(refreshed_config, target)

     return CompoundSearchModel.load(model_dir)
示例#3
0
 def __init__(self, pro_name, version):
     """Set up the "svm" model instance and the resources it is trained from.

     Resolves every path through ``PathUtil``, constructs a
     ``FilterSemanticTFIDFNode2VectorModel`` named "svm", loads the document
     collection and preprocesses it with ``Preprocessor``.

     :param pro_name: project name used for every path lookup.
     :param version: version used for every path lookup.
     """
     svm_dir = PathUtil.sim_model(pro_name=pro_name,
                                  version=version,
                                  model_type="svm")
     self.model_dir_path = svm_dir
     self.model = FilterSemanticTFIDFNode2VectorModel(
         name="svm", model_dir_path=svm_dir)

     # Raw documents are loaded first and then run through the preprocessor.
     doc_path = PathUtil.doc(pro_name, version)
     self.document_collection_path = doc_path
     raw_collection = MultiFieldDocumentCollection.load(str(doc_path))
     self.collection = raw_collection
     preprocessor = Preprocessor()
     self.processor = preprocessor
     self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
         preprocessor, raw_collection)

     # Paths of the auxiliary resources; nothing is loaded from them here.
     self.pretrain_node2vec_path = PathUtil.node2vec(
         pro_name=pro_name, version=version, weight="unweight")
     self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
     self.doc_sim_model_path = PathUtil.sim_model(
         pro_name=pro_name, version=version, model_type="avg_w2v")
示例#4
0
def train_model(pro_name, version):
    """Train a BM25 model on the preprocessed document collection.

    :param pro_name: project name used for the path lookups.
    :param version: version used for the path lookups.
    :return: the "bm25" model directory resolved by ``PathUtil.sim_model``.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    preprocessed_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_docs)

    model_dir_path = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="bm25")
    BM25Model.train(model_dir_path, doc_collection=preprocessed_docs)
    return model_dir_path
示例#5
0
def train_avg_w2v_model(pro_name, version):
    """Preprocess the documents, save them, and train the "avg_w2v" model.

    The preprocessed collection is saved to the path returned by
    ``PathUtil.pre_doc`` with ``pre_way="code-pre"`` before training starts.

    :param pro_name: project name used for the path lookups.
    :param version: version used for the path lookups.
    :return: the "avg_w2v" model directory resolved by ``PathUtil.sim_model``.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    pre_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)

    # Persist the preprocessed collection before training on it.
    pre_doc_collection.save(
        PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    word2vec_model_path = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(
        model_dir_path=word2vec_model_path,
        doc_collection=pre_doc_collection)
    return word2vec_model_path
示例#6
0
def build_extra_model_and_doc(pro_name, version_list):
    """Build documents and train the "avg_w2v" model for each listed version.

    For every version, in order: build the document collection, build the
    preprocessed collection with a ``CodeDocPreprocessor``, train the name
    searcher, load the preprocessed collection saved under ``pre_way="code-pre"``
    and train an ``AVGW2VFLModel`` on it.

    :param pro_name: project name passed to every step.
    :param version_list: iterable of versions to process.
    """
    for version in version_list:
        code_preprocessor = CodeDocPreprocessor()

        build_doc(pro_name=pro_name, version=version)
        build_pre_doc(pro_name=pro_name,
                      version=version,
                      preprocessor=code_preprocessor)
        train_name_searcher(pro_name=pro_name, version=version)

        # Reload the preprocessed collection from disk rather than reusing an
        # in-memory object.
        stored_pre_doc_path = PathUtil.pre_doc(
            pro_name=pro_name, version=version, pre_way="code-pre")
        stored_pre_docs: PreprocessMultiFieldDocumentCollection = (
            PreprocessMultiFieldDocumentCollection.load(stored_pre_doc_path))

        w2v_dir = PathUtil.sim_model(
            pro_name=pro_name, version=version, model_type="avg_w2v")
        AVGW2VFLModel.train(model_dir_path=w2v_dir,
                            doc_collection=stored_pre_docs)
示例#7
0
def build_v3_graph_for_pro(pro_name):
    """Build the "v3" graph for a project and print its summary.

    Every path is resolved through ``PathUtil`` and handed to
    ``CodeGraphBuilder.build_v3_graph_from_cache_simple``.

    NOTE(review): the input and output graph paths are both resolved with
    ``version="v3"`` -- confirm that reading and writing the same version is
    intended.

    :param pro_name: project name used for the path lookups.
    """
    builder = CodeGraphBuilder()

    build_arguments = {
        "pro_name": pro_name,
        "input_graph_data_path": PathUtil.graph_data(pro_name=pro_name, version="v3"),
        "word2vec_model_path": PathUtil.sim_model(
            pro_name=pro_name, version="v3", model_type="avg_w2v"),
        "output_graph_data_path": PathUtil.graph_data(pro_name=pro_name, version="v3"),
        # No title-search cache is supplied.
        "generic_title_search_cache_path": None,
        "generic_wikidata_item_cache_path": PathUtil.generic_wikidata_item_cache(),
        "fusion_temp_result_dir": PathUtil.wikidata_fusion_temp_result_dir(pro_name),
    }

    graph_data = builder.build_v3_graph_from_cache_simple(**build_arguments)
    graph_data.print_graph_info()
示例#8
0
def train_model(pro_name, version, weight):
    """Train the "avg_n2v" model from documents, graph data and node2vec vectors.

    :param pro_name: project name used for every path lookup.
    :param version: version used for every path lookup.
    :param weight: forwarded to ``PathUtil.node2vec`` to pick the pretrained
        node2vec file.
    :return: the "avg_n2v" model directory resolved by ``PathUtil.sim_model``.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)

    # Resolve the remaining inputs before the model directory itself.
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_path = PathUtil.node2vec(
        pro_name=pro_name, version=version, weight=weight)
    name_searcher_path = PathUtil.name_searcher(
        pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_n2v")

    # The trained model object is not needed here; only its directory is returned.
    AVGNode2VectorModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        embedding_size=100,
        pretrain_node2vec_path=node2vec_path,
        graph_data_path=graph_path,
        kg_name_searcher_path=name_searcher_path,
    )
    return model_dir_path
示例#9
0
from script.summary.generate_summary import Summary
from util.path_util import PathUtil

if __name__ == '__main__':
    # Interactive loop: build a Summary for the jdk8 / v3_1 compound model and
    # print the summaries returned for each (query, class name) pair typed in.
    pro_name, version = "jdk8", "v3_1"
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model="avg_w2v", extra_model="svm")
    model_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type=compound_model_name)
    summary = Summary(pro_name, version, model_dir)

    # Runs until the process is interrupted; there is no exit command.
    while True:
        query = input("please input query:")
        class_name = input("please input qualified class name")
        for index, item in summary.get_summary(query, class_name).items():
            print(index, item)
示例#10
0
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel

from util.path_util import PathUtil

# Interactive search console for the "avg_n2v" model of jdk8 / v3.
# NOTE(review): this excerpt ends at the `elif select == 2:` header; the
# remaining branches are not visible here.
if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3"
    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type="avg_n2v")
    model = AVGNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    graph_data: GraphData = GraphData.load(graph_data_path)
    # Candidate ids per result kind. Nodes labelled "class type" are removed
    # from the class candidates (presumably a set difference -- confirm the
    # return type of get_node_ids_by_label).
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    # Method candidates also include nodes labelled "base override method".
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    # Prompt forever: query text, result kind (1/2/3) and number of results.
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        result = []
        if select == 1:
            # Restrict the search to the class candidates.
            result = model.search(query=query,
                                  top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)
        elif select == 2:
示例#11
0
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.filter_semantic_tfidf_n2v import FilterSemanticTFIDFNode2VectorModel

from util.path_util import PathUtil

# Interactive search console for the "svm" model of jdk8 / v3.
# NOTE(review): this excerpt is cut off in the middle of the second
# `model.search(...)` call; the rest of the loop is not visible here.
if __name__ == '__main__':
    model_dir_path = PathUtil.sim_model(pro_name="jdk8",
                                        version="v3",
                                        model_type="svm")
    model = FilterSemanticTFIDFNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name="jdk8", version="v3")
    graph_data: GraphData = GraphData.load(graph_data_path)
    # Candidate ids per result kind. Nodes labelled "class type" are removed
    # from the class candidates (presumably a set difference -- confirm the
    # return type of get_node_ids_by_label).
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    # Method candidates also include nodes labelled "base override method".
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    # Prompt forever: query text, result kind (1/2/3) and number of results.
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        result = []
        if select == 1:
            # Restrict the search to the class candidates.
            result = model.search(query=query,
                                  top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)
        elif select == 2:
            result = model.search(query=query,
                                  top_num=top_num,
示例#12
0
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.avg_w2v import AVGW2VFLModel
from sekg.ir.models.bm25 import BM25Model

from util.path_util import PathUtil

# Interactive search console for the "bm25" model of jdk8 / v3_1.
# NOTE(review): only the `select == 1` branch is visible in this excerpt;
# the loop likely continues past the last line shown.
if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3_1"
    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type="bm25")
    model = BM25Model.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    graph_data: GraphData = GraphData.load(graph_data_path)
    # Candidate ids per result kind. Nodes labelled "class type" are removed
    # from the class candidates (presumably a set difference -- confirm the
    # return type of get_node_ids_by_label).
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    # Method candidates also include nodes labelled "base override method".
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    # Prompt forever: query text, result kind (1/2/3) and number of results.
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        result = []
        if select == 1:
            # Restrict the search to the class candidates.
            result = model.search(query=query,
                                  top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)