def build_doc(pro_name, version):
    """Preprocess a project's document collection and save the result.

    Loads the ``MultiFieldDocumentCollection`` found at
    ``PathUtil.doc(...)``, wraps it with a ``CodeDocPreprocessor`` through
    ``PreprocessMultiFieldDocumentCollection.create_from_doc_collection``
    and saves it to the ``"code-pre"`` location given by
    ``PathUtil.pre_doc(...)``.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.
    """
    # Resolve both locations first, before any loading happens.
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name, version=version, pre_way="code-pre")

    raw_collection: MultiFieldDocumentCollection = (
        MultiFieldDocumentCollection.load(source_path))

    code_preprocessor = CodeDocPreprocessor()
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=code_preprocessor, doc_collection=raw_collection)
    preprocessed.save(target_path)
# Example #2
# 0
def train_avg_w2v_model(pro_name, version):
    """Preprocess a project's documents and train an ``AVGW2VFLModel`` on them.

    The document collection at ``PathUtil.doc(...)`` is loaded, wrapped with
    a ``CodeDocPreprocessor``, saved to the ``"code-pre"`` pre-doc location,
    and then passed to ``AVGW2VFLModel.train``.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.

    Returns:
        The ``"avg_w2v"`` model path obtained from ``PathUtil.sim_model``,
        which is also the directory handed to ``AVGW2VFLModel.train``.
    """
    # The doc path is converted to ``str`` before loading.
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))

    preprocessed_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)
    # Persist the preprocessed collection before training starts.
    preprocessed_docs.save(
        PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(
        model_dir_path=model_dir, doc_collection=preprocessed_docs)
    return model_dir
# Example #3
# 0
def build_pre_doc(pro_name, version, preprocessor):
    """Build the preprocessed document collection for a given preprocessor.

    The ``pre_way`` label used for the output path is chosen from the type
    of ``preprocessor``; an unrecognised type yields ``"unknown-pre"``. The
    actual work is delegated to ``CodeGraphBuilder.build_pre_doc``.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.
        preprocessor: Preprocessor instance passed on to the builder.
    """
    # Every entry is tested in order with no early exit, so when an instance
    # matches more than one class the last matching entry decides the label.
    labels_by_type = (
        (SimplePreprocessor, "sim-pre"),
        (SpacyTextPreprocessor, "spacy-pre"),
        (CodeDocPreprocessor, "code-pre"),
        (PureCodePreprocessor, "pure-pre"),
    )
    pre_way = "unknown-pre"
    for preprocessor_type, label in labels_by_type:
        if isinstance(preprocessor, preprocessor_type):
            pre_way = label

    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name, version=version, pre_way=pre_way)

    graph_builder = CodeGraphBuilder()
    graph_builder.build_pre_doc(source_path, target_path, preprocessor)
# Example #4
# 0
def build_extra_model_and_doc(pro_name, version_list):
    """Build preprocessed docs and train models for each listed version.

    For every version this runs, in order: ``build_doc``, ``build_pre_doc``
    with a fresh ``CodeDocPreprocessor``, ``train_name_searcher``, and then
    ``AVGW2VFLModel.train`` on the ``"code-pre"`` collection loaded back
    from ``PathUtil.pre_doc(...)``.

    Args:
        pro_name: Project name forwarded to the helpers and ``PathUtil``.
        version_list: Iterable of version identifiers to process.
    """
    # Label of the preprocessed collection that is loaded back for training.
    code_pre_way = "code-pre"

    for version in version_list:
        # A new preprocessor instance is created for every version.
        code_preprocessors = [CodeDocPreprocessor()]

        build_doc(pro_name=pro_name, version=version)
        for code_preprocessor in code_preprocessors:
            build_pre_doc(
                pro_name=pro_name,
                version=version,
                preprocessor=code_preprocessor,
            )

        train_name_searcher(pro_name=pro_name, version=version)

        saved_pre_doc_path = PathUtil.pre_doc(
            pro_name=pro_name, version=version, pre_way=code_pre_way)
        loaded_pre_docs: PreprocessMultiFieldDocumentCollection = (
            PreprocessMultiFieldDocumentCollection.load(saved_pre_doc_path))

        model_dir = PathUtil.sim_model(
            pro_name=pro_name, version=version, model_type="avg_w2v")
        AVGW2VFLModel.train(
            model_dir_path=model_dir, doc_collection=loaded_pre_docs)
# Example #5
# 0
                str(v) + ":" + str(k)
                for k, v in self.end_related_relation_num.items()
            ])


if __name__ == "__main__":
    # Script entry point: builds a ReduceDomainTerm over the JabRef-2.6
    # domain-concept files and prints the result of each deletion pass
    # followed by its length.

    def _print_with_count(result):
        """Print ``result`` and then ``len(result)``."""
        print(result)
        print(len(result))

    domain_dir = Path(PathUtil.domain_concept_dir("JabRef-2.6", version="v1"))

    # All five inputs live directly inside the domain-concept directory.
    (term_save_path,
     operation_save_path,
     term_relation_save_path,
     linkage_save_path,
     aliase_save_path) = (
        str(domain_dir / file_name)
        for file_name in ("terms.txt", "operations.txt", "relations.json",
                          "linkages.json", "aliases.json")
    )

    # NOTE(review): the domain directory uses version "v1" while the
    # preprocessed collection uses "v2" - confirm this is intentional.
    pre_doc_collection_out_path = PathUtil.pre_doc(
        pro_name="JabRef-2.6", version="v2", pre_way="code-pre")

    reduce = ReduceDomainTerm(
        term_save_path,
        operation_save_path,
        term_relation_save_path,
        linkage_save_path,
        aliase_save_path,
        pre_doc_collection_out_path,
    )

    # The passes run in this fixed order; each one is reported before the
    # next one starts.
    _print_with_count(reduce.delete_based_on_name())
    _print_with_count(reduce.delete_based_on_aliase_tf())
    _print_with_count(reduce.delete_based_on_name_length())