def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a compound search model from two sub-model configurations.

    Each ``*_model_config`` is indexed at positions 0, 1 and 2. Element 0 is
    the model type: it is used to resolve the sub-model path through
    ``PathUtil.sim_model`` and to build the compound model's name. Elements 1
    and 2 are forwarded unchanged inside the sub-model tuple.

    NOTE(review): the meaning of elements 1 and 2, and of the trailing
    ``False``/``True`` flag attached to the first/second sub-model, is defined
    by ``CompoundSearchModel.train`` and is not visible here -- confirm
    against that API.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        first_model_config: Indexable configuration of the first sub-model.
        second_model_config: Indexable configuration of the second sub-model.

    Returns:
        The value returned by ``PathUtil.sim_model`` for the compound model.
    """
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        processor, collection)

    # One tuple per sub-model: (sub-model path, config[1], config[2], flag).
    # The first sub-model carries False and the second carries True.
    sub_search_model_config = [
        (
            PathUtil.sim_model(pro_name=pro_name, version=version,
                               model_type=config[0]),
            config[1],
            config[2],
            flag,
        )
        for config, flag in ((first_model_config, False),
                             (second_model_config, True))
    ]

    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type=compound_model_name)
    # The value returned by train() is not needed here; callers receive the
    # model directory path instead.
    CompoundSearchModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        sub_search_model_config=sub_search_model_config)
    return model_dir_path
def build_doc(pro_name, version):
    """Preprocess a project's document collection and save the result.

    Loads the collection from the path given by ``PathUtil.doc``, wraps it
    with ``CodeDocPreprocessor`` through
    ``PreprocessMultiFieldDocumentCollection.create_from_doc_collection`` and
    saves it to the path given by ``PathUtil.pre_doc`` with
    ``pre_way="code-pre"``.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(pro_name=pro_name, version=version,
                                   pre_way="code-pre")

    raw_docs = MultiFieldDocumentCollection.load(source_path)
    PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(),
        doc_collection=raw_docs,
    ).save(target_path)
def train_model(pro_name, version):
    """Train a BM25 model on a project's preprocessed document collection.

    The collection is loaded from ``PathUtil.doc`` and wrapped with a
    ``Preprocessor``. The model is trained into the directory given by
    ``PathUtil.sim_model`` with ``model_type="bm25"``.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.

    Returns:
        The model directory path that was passed to ``BM25Model.train``.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    prepared_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_docs)

    bm25_dir = PathUtil.sim_model(pro_name=pro_name, version=version,
                                  model_type="bm25")
    BM25Model.train(bm25_dir, doc_collection=prepared_docs)
    return bm25_dir
def train_avg_w2v_model(pro_name, version):
    """Save the code-preprocessed collection, then train the "avg_w2v" model.

    Steps, in order:
      1. Load the document collection from ``PathUtil.doc``.
      2. Wrap it with ``CodeDocPreprocessor`` and save the result to the
         ``PathUtil.pre_doc`` path for ``pre_way="code-pre"``.
      3. Train ``AVGW2VFLModel`` into the ``PathUtil.sim_model`` directory for
         ``model_type="avg_w2v"``.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.

    Returns:
        The model directory path that was passed to ``AVGW2VFLModel.train``.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    prepared_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)

    # Save the preprocessed collection before training.
    prepared_docs.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(pro_name=pro_name, version=version,
                                   model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=model_dir, doc_collection=prepared_docs)
    return model_dir
def build_pre_doc(pro_name, version, preprocessor):
    """Build the preprocessed document collection for a given preprocessor.

    The ``pre_way`` label used for the output path is derived from the type
    of ``preprocessor``. It falls back to ``"unknown-pre"`` when none of the
    known preprocessor classes match.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        preprocessor: Preprocessor instance handed to
            ``CodeGraphBuilder.build_pre_doc``.
    """
    # Order matters: every entry is checked and a later match overrides an
    # earlier one, so an instance matching several classes gets the label of
    # the last matching entry.
    labelled_types = (
        (SimplePreprocessor, "sim-pre"),
        (SpacyTextPreprocessor, "spacy-pre"),
        (CodeDocPreprocessor, "code-pre"),
        (PureCodePreprocessor, "pure-pre"),
    )
    pre_way = "unknown-pre"
    for preprocessor_type, label in labelled_types:
        if isinstance(preprocessor, preprocessor_type):
            pre_way = label

    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(pro_name=pro_name, version=version,
                                   pre_way=pre_way)
    CodeGraphBuilder().build_pre_doc(source_path, target_path, preprocessor)
def __init__(self, pro_name, version):
    """Set up the "svm" model, its document collection and related paths.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
    """
    # Model directory and the model object created on top of it.
    svm_dir = PathUtil.sim_model(pro_name=pro_name, version=version,
                                 model_type="svm")
    self.model_dir_path = svm_dir
    self.model = FilterSemanticTFIDFNode2VectorModel(
        name="svm", model_dir_path=svm_dir)

    # Raw collection loaded from disk, then wrapped with a Preprocessor.
    doc_path = PathUtil.doc(pro_name, version)
    self.document_collection_path = doc_path
    raw_docs = MultiFieldDocumentCollection.load(str(doc_path))
    self.collection = raw_docs
    preprocessor = Preprocessor()
    self.processor = preprocessor
    self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor, raw_docs)

    # Paths to companion resources; they are only stored here, not opened.
    self.pretrain_node2vec_path = PathUtil.node2vec(
        pro_name=pro_name, version=version, weight="unweight")
    self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
    self.doc_sim_model_path = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_w2v")
def train_model(pro_name, version, weight, embedding_size=100):
    """Train the "avg_n2v" model for a project.

    The document collection is loaded from ``PathUtil.doc`` and wrapped with
    ``CodeDocPreprocessor``. It is passed to ``AVGNode2VectorModel.train``
    together with the graph data path, the pretrained node2vec path for the
    given ``weight`` and the name-searcher path.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
        weight: Forwarded to ``PathUtil.node2vec`` to select the pretrained
            node2vec path.
        embedding_size: Forwarded to ``AVGNode2VectorModel.train``. Defaults
            to 100, the value that was previously hard-coded.
            NOTE(review): presumably this must match the dimensionality of
            the pretrained node2vec vectors -- confirm before overriding.

    Returns:
        The model directory path that was passed to
        ``AVGNode2VectorModel.train``.
    """
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        processor, collection)

    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version,
                                               weight=weight)
    kg_name_searcher_path = PathUtil.name_searcher(pro_name=pro_name,
                                                   version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="avg_n2v")

    # The value returned by train() is not needed here; callers receive the
    # model directory path instead.
    AVGNode2VectorModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        embedding_size=embedding_size,
        pretrain_node2vec_path=pretrain_node2vec_path,
        graph_data_path=graph_data_path,
        kg_name_searcher_path=kg_name_searcher_path,
    )
    return model_dir_path
def build_doc(pro_name, version):
    """Build a project's document collection from its graph data.

    Resolves the graph data path and the output document collection path
    through ``PathUtil`` and hands both to ``CodeGraphBuilder.build_doc``.

    Args:
        pro_name: Project name forwarded to the ``PathUtil`` helpers.
        version: Project version forwarded to the ``PathUtil`` helpers.
    """
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    doc_path = PathUtil.doc(pro_name=pro_name, version=version)
    CodeGraphBuilder().build_doc(
        graph_data_path=graph_path,
        output_doc_collection_path=doc_path,
    )