Пример #1
0
def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a compound search model from two sub-model configurations.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        first_model_config: Indexable config of the base model. Item 0 is
            used as the model type; items 1 and 2 are handed through
            unchanged to ``CompoundSearchModel.train``.
        second_model_config: Same layout as ``first_model_config``, for the
            extra model.

    Returns:
        The directory path computed for the compound model.
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # One 4-tuple per sub-model: (model path, config[1], config[2], flag).
    # The base model gets False as its trailing flag, the extra model True.
    sub_search_model_config = []
    for config, trailing_flag in ((first_model_config, False),
                                  (second_model_config, True)):
        sub_model_path = PathUtil.sim_model(pro_name=pro_name,
                                            version=version,
                                            model_type=config[0])
        sub_search_model_config.append(
            (sub_model_path, config[1], config[2], trailing_flag))

    base_type = first_model_config[0]
    extra_type = second_model_config[0]
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=base_type, extra_model=extra_type)

    print("try to model compound model for %r" % compound_model_name)

    compound_dir = PathUtil.sim_model(pro_name=pro_name,
                                      version=version,
                                      model_type=compound_model_name)

    # The object returned by train() is not needed here; callers only
    # receive the directory path.
    CompoundSearchModel.train(
        model_dir_path=compound_dir,
        doc_collection=doc_collection,
        sub_search_model_config=sub_search_model_config)

    return compound_dir
def build_doc(pro_name, version):
    """Preprocess the project's document collection and save the result.

    The collection is loaded from the ``PathUtil.doc`` location, run through
    ``CodeDocPreprocessor`` and saved at the ``PathUtil.pre_doc`` location
    for the "code-pre" way.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name, version=version, pre_way="code-pre")

    raw_collection: MultiFieldDocumentCollection = (
        MultiFieldDocumentCollection.load(source_path))
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=raw_collection)
    preprocessed.save(target_path)
Пример #3
0
def train_model(pro_name, version):
    """Train a BM25 model on the preprocessed document collection.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.

    Returns:
        The directory path computed for the "bm25" model.
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_collection)

    bm25_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="bm25")
    BM25Model.train(bm25_dir, doc_collection=doc_collection)
    return bm25_dir
Пример #4
0
def train_avg_w2v_model(pro_name, version):
    """Preprocess the document collection, save it, and train "avg_w2v".

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.

    Returns:
        The directory path computed for the "avg_w2v" model.
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    pre_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # Save the preprocessed collection before training on it.
    pre_doc_collection.save(
        PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    w2v_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(
        model_dir_path=w2v_dir, doc_collection=pre_doc_collection)
    return w2v_dir
Пример #5
0
def build_pre_doc(pro_name, version, preprocessor):
    """Build the preprocessed document collection for a given preprocessor.

    The output location is selected by a ``pre_way`` label derived from the
    preprocessor's type; an unrecognised type falls back to "unknown-pre".

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        preprocessor: Preprocessor instance handed to
            ``CodeGraphBuilder.build_pre_doc``.
    """
    labels_by_type = (
        (SimplePreprocessor, "sim-pre"),
        (SpacyTextPreprocessor, "spacy-pre"),
        (CodeDocPreprocessor, "code-pre"),
        (PureCodePreprocessor, "pure-pre"),
    )

    pre_way = "unknown-pre"
    for preprocessor_type, label in labels_by_type:
        # Deliberately no `break`: every type is checked, so when several
        # match, the label of the last matching entry is the one kept.
        if isinstance(preprocessor, preprocessor_type):
            pre_way = label

    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name, version=version, pre_way=pre_way)

    CodeGraphBuilder().build_pre_doc(source_path, target_path, preprocessor)
Пример #6
0
 def __init__(self, pro_name, version):
     """Set up the "svm" model, the preprocessed documents and related paths.

     Args:
         pro_name: Project name forwarded to the ``PathUtil`` helpers.
         version: Project version forwarded to the ``PathUtil`` helpers.
     """
     project = {"pro_name": pro_name, "version": version}

     # Model directory and the model object that lives in it.
     self.model_dir_path = PathUtil.sim_model(model_type="svm", **project)
     self.model = FilterSemanticTFIDFNode2VectorModel(
         name="svm",
         model_dir_path=self.model_dir_path,
     )

     # Raw document collection and its preprocessed counterpart.
     self.document_collection_path = PathUtil.doc(pro_name, version)
     self.collection = MultiFieldDocumentCollection.load(
         str(self.document_collection_path)
     )
     self.processor = Preprocessor()
     self.doc_collection = (
         PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
             self.processor, self.collection
         )
     )

     # Paths of auxiliary resources; nothing is loaded from them here.
     self.pretrain_node2vec_path = PathUtil.node2vec(
         weight="unweight", **project
     )
     self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
     self.doc_sim_model_path = PathUtil.sim_model(
         model_type="avg_w2v", **project
     )
Пример #7
0
def train_model(pro_name, version, weight):
    """Train the "avg_n2v" model on the preprocessed document collection.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        weight: Passed to ``PathUtil.node2vec`` to select the pretrained
            node2vec path.

    Returns:
        The directory path computed for the "avg_n2v" model.
    """
    project = {"pro_name": pro_name, "version": version}

    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # Resolve every input path up front, in the order train() lists them.
    graph_data_path = PathUtil.graph_data(**project)
    pretrain_node2vec_path = PathUtil.node2vec(weight=weight, **project)
    kg_name_searcher_path = PathUtil.name_searcher(**project)
    n2v_dir = PathUtil.sim_model(model_type="avg_n2v", **project)

    # The object returned by train() is not needed here; callers only
    # receive the directory path.
    AVGNode2VectorModel.train(
        model_dir_path=n2v_dir,
        doc_collection=doc_collection,
        embedding_size=100,
        pretrain_node2vec_path=pretrain_node2vec_path,
        graph_data_path=graph_data_path,
        kg_name_searcher_path=kg_name_searcher_path,
    )
    return n2v_dir
Пример #8
0
def build_doc(pro_name, version):
    """Build the document collection from the project's graph data.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
    """
    project = {"pro_name": pro_name, "version": version}
    graph_source = PathUtil.graph_data(**project)
    doc_target = PathUtil.doc(**project)

    CodeGraphBuilder().build_doc(
        graph_data_path=graph_source,
        output_doc_collection_path=doc_target,
    )