def __init__(self, graph_data):
    """
    set up the builder from graph data given as an object or as a location on disk
    :param graph_data: a GraphData obj, or a Path / str pointing to a saved GraphData.
                       any other value leaves self.graph_data as None.
    """
    if isinstance(graph_data, GraphData):
        resolved_graph = graph_data
    elif isinstance(graph_data, (Path, str)):
        # str() leaves a str untouched and turns a Path into the str handed to GraphData.load
        resolved_graph = GraphData.load(str(graph_data))
    else:
        resolved_graph = None
    self.graph_data = resolved_graph
    self.graph_data_reader = GraphDataReader(graph_data=resolved_graph,
                                             node_info_factory=ProjectKGNodeInfoFactory())
    # starts empty; it is replaced by init() or filled by the build methods
    self.doc_collection = MultiFieldDocumentCollection()
def train_model(pro_name, version, first_model_config, second_model_config):
    """
    train a CompoundSearchModel that combines two sub search models
    :param pro_name: project name used to resolve every path
    :param version: version used to resolve every path
    :param first_model_config: indexable config; [0] is the model type, [1] and [2] are
                               forwarded unchanged into the sub model config
    :param second_model_config: same layout as first_model_config
    :return: the directory path the compound model is trained into
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # one tuple per sub model; the trailing flag is False for the first model and True for the second
    sub_search_model_config = [
        (PathUtil.sim_model(pro_name=pro_name, version=version, model_type=model_config[0]),
         model_config[1], model_config[2], trailing_flag)
        for model_config, trailing_flag in ((first_model_config, False), (second_model_config, True))
    ]

    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type=compound_model_name)
    CompoundSearchModel.train(model_dir_path=model_dir_path,
                              doc_collection=doc_collection,
                              sub_search_model_config=sub_search_model_config)
    return model_dir_path
def load_doc(project_name="android27", version="v1"):
    """
    load the saved MultiFieldDocumentCollection of one project
    :param project_name: name of the project, e.g. jdk8 or android27
    :param version: version of the doc collection
    :return: the result of MultiFieldDocumentCollection.load for the resolved path
    """
    doc_path = PathUtil.doc(pro_name=project_name, version=version)
    loaded_collection = MultiFieldDocumentCollection.load(doc_path)
    return loaded_collection
def init(self, doc_collection):
    """
    start from an already existing doc collection
    :param doc_collection: a MultiFieldDocumentCollection obj, or a Path / str pointing to a
                           saved one. any other non-None value sets self.doc_collection to None.
    :return:
    :raises Exception: when doc_collection is None
    """
    if doc_collection is None:
        raise Exception("init from None")

    if isinstance(doc_collection, MultiFieldDocumentCollection):
        resolved_collection = doc_collection
    elif isinstance(doc_collection, (Path, str)):
        # str() leaves a str untouched and turns a Path into the str handed to load()
        resolved_collection = MultiFieldDocumentCollection.load(str(doc_collection))
    else:
        resolved_collection = None

    self.doc_collection = resolved_collection
    print("init complete")
def __init__(self, graph_data_path, dc_file_location, concepts_path, relations_path):
    """
    load the graph, the doc collection and the two json files this object works on
    :param graph_data_path: path given to GraphData.load
    :param dc_file_location: path given to MultiFieldDocumentCollection.load
    :param concepts_path: json file whose parsed content becomes self.concepts_list
    :param relations_path: json file whose parsed content becomes self.relations_list
    """

    def read_json(json_path):
        # the with block closes the file as soon as it has been parsed
        with open(json_path) as json_file:
            return json.load(json_file)

    self.graph: GraphData = GraphData.load(graph_data_path)
    self.doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    self.concepts_list = read_json(concepts_path)
    self.relations_list = read_json(relations_path)
    # starts empty; nothing in this constructor fills it
    self.concept_2_node_id = {}
def build_doc(pro_name, version):
    """
    preprocess the doc collection of a project with CodeDocPreprocessor and save the result
    :param pro_name: project name used to resolve the input and output paths
    :param version: version used to resolve the input and output paths
    :return:
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way="code-pre")

    raw_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(source_path)
    preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=raw_collection)
    preprocessed_collection.save(target_path)
def build_doc_for_kg(self, output_path=None):
    """
    rebuild the full doc collection, then derive from it a collection for the kg in which
    every document carries a single "doc" field
    :param output_path: where the derived collection is saved; nothing is saved when None
    :return: the derived MultiFieldDocumentCollection
    """
    self.clear()
    self.build_doc()

    kg_collection = MultiFieldDocumentCollection()
    reader = GraphDataReader(graph_data=self.graph_data,
                             node_info_factory=ProjectKGNodeInfoFactory())
    # these four node types all contribute the same texts
    entity_node_types = (CodeElementNodeInfo, DomainEntityNodeInfo,
                         OperationEntityNodeInfo, WikidataEntityNodeInfo)
    skipped_count = 0
    for node_id in self.graph_data.get_node_ids():
        node_info = reader.get_node_info(node_id)
        source_doc = self.doc_collection.get_by_id(node_id)
        if source_doc is None:
            skipped_count += 1
            continue

        kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)
        if isinstance(node_info, entity_node_types):
            texts = [node_info.get_main_name()]
            field_names = ("aliases", "out_relations", "description")
        else:
            texts = []
            field_names = ("short_description_sentences", "out_relations")
        for field_name in field_names:
            texts.append(source_doc.get_doc_text_by_field(field_name))

        # falsy texts (None, "") are dropped before joining
        joined_text = "\n".join(text for text in texts if text)
        kg_doc.add_field("doc", joined_text)
        if node_id % 2000 == 0:
            print("doc:%r" % joined_text)
        kg_collection.add_document(kg_doc)

    if output_path is not None:
        kg_collection.save(output_path)
    print("collection len{}".format(kg_collection.get_num()))
    return kg_collection
def train_model(pro_name, version):
    """
    train a BM25Model on the preprocessed doc collection of a project
    :param pro_name: project name used to resolve every path
    :param version: version used to resolve every path
    :return: the directory path the bm25 model is trained into
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_collection)

    bm25_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="bm25")
    BM25Model.train(bm25_dir_path, doc_collection=preprocessed_collection)
    return bm25_dir_path
def build_pre_doc(self, input_doc_collection_path, output_pre_doc_collection_path, preprocessor=None):
    """
    preprocess a saved doc collection and save the preprocessed result
    :param input_doc_collection_path: path the MultiFieldDocumentCollection is loaded from
    :param output_pre_doc_collection_path: path the preprocessed collection is saved to
    :param preprocessor: preprocessor to apply; a CodeDocPreprocessor is created when None
    :return:
    """
    # identity test: `== None` would go through a custom __eq__ defined by the preprocessor
    if preprocessor is None:
        preprocessor = CodeDocPreprocessor()

    print("stat preprocess doc - for %s %r " % (input_doc_collection_path, preprocessor))
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        input_doc_collection_path)
    precess_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=preprocessor, doc_collection=doc_collection)
    precess_doc_collection.save(output_pre_doc_collection_path)
    print("end preprocess doc - %r %r " % (output_pre_doc_collection_path, preprocessor))
def train_avg_w2v_model(pro_name, version):
    """
    preprocess the doc collection of a project, save it, and train an AVGW2VFLModel on it
    :param pro_name: project name used to resolve every path
    :param version: version used to resolve every path
    :return: the directory path the avg_w2v model is trained into
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # the preprocessed collection is saved before the model is trained
    preprocessed_collection.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=model_dir, doc_collection=preprocessed_collection)
    return model_dir
def __init__(self, pro_name, version):
    """
    resolve the paths for one project version, create the "svm" model object and load the
    preprocessed doc collection
    :param pro_name: project name used to resolve every path
    :param version: version used to resolve every path
    """
    svm_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="svm")
    self.model_dir_path = svm_dir_path
    self.model = FilterSemanticTFIDFNode2VectorModel(name="svm", model_dir_path=svm_dir_path)

    doc_path = PathUtil.doc(pro_name, version)
    self.document_collection_path = doc_path
    raw_collection = MultiFieldDocumentCollection.load(str(doc_path))
    self.collection = raw_collection

    preprocessor = Preprocessor()
    self.processor = preprocessor
    self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor, raw_collection)

    # the remaining attributes are paths only; nothing is loaded from them here
    self.pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version,
                                                    weight="unweight")
    self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
    self.doc_sim_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                                 model_type="avg_w2v")
def extract_kg_doc_collection(self, output_path):
    """
    derive a new collection whose documents carry a single "doc" field and save it
    :param output_path: where the derived collection is saved
    :return:
    """
    kg_collection = MultiFieldDocumentCollection()
    reader = GraphDataReader(graph_data=self.graph_data,
                             node_info_factory=ProjectKGNodeInfoFactory())
    entity_node_types = (CodeElementNodeInfo, DomainEntityNodeInfo,
                         OperationEntityNodeInfo, WikidataEntityNodeInfo)
    text_fields = ("aliases", "out_relations", "description")
    skipped_count = 0
    for node_id in self.graph_data.get_node_ids():
        node_info = reader.get_node_info(node_id)
        source_doc = self.doc_collection.get_by_id(node_id)
        if source_doc is None:
            skipped_count += 1
            continue

        kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)
        texts = []
        # every type is checked on its own: a node matching several types adds its texts
        # once per match, a node matching none ends up with an empty "doc" field
        for node_type in entity_node_types:
            if not isinstance(node_info, node_type):
                continue
            texts.append(node_info.get_main_name())
            for field_name in text_fields:
                texts.append(source_doc.get_doc_text_by_field(field_name))

        joined_text = "\n".join(text for text in texts if text)
        kg_doc.add_field("doc", joined_text)
        if node_id % 2000 == 0:
            print("doc:%r" % joined_text)
        kg_collection.add_document(kg_doc)

    kg_collection.save(output_path)
def train_model(pro_name, version, weight):
    """
    train an AVGNode2VectorModel on the preprocessed doc collection of a project
    :param pro_name: project name used to resolve every path
    :param version: version used to resolve every path
    :param weight: passed to PathUtil.node2vec to pick the pretrained node2vec file
    :return: the directory path the avg_n2v model is trained into
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    name_searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    avg_n2v_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                          model_type="avg_n2v")

    AVGNode2VectorModel.train(
        model_dir_path=avg_n2v_dir_path,
        doc_collection=preprocessed_collection,
        embedding_size=100,
        pretrain_node2vec_path=node2vec_path,
        graph_data_path=graph_path,
        kg_name_searcher_path=name_searcher_path,
    )
    return avg_n2v_dir_path
def extract_kg_doc_collection_with_method(self, output_path, **config):
    """
    derive a one-field ("doc") collection, optionally restricted by node label, and save it
    :param output_path: where the derived collection is saved
    :param config: optional switches, each defaulting to False:
                   no_out_relation / no_in_relation are forwarded to add_description,
                   no_jdk leaves the text empty for nodes labelled "jdk8",
                   with_method leaves the text empty for nodes not labelled "method"
    :return:
    """
    print("start building doc")
    no_out_relation = config.get("no_out_relation", False)
    no_in_relation = config.get("no_in_relation", False)
    no_jdk = config.get("no_jdk", False)
    method = config.get("with_method", False)
    print("*" * 20)
    print("no_in_relation %r, no_out_relation %r, no_jdk %r, with_method %r" % (
        no_in_relation, no_out_relation, no_jdk, method))

    kg_collection = MultiFieldDocumentCollection()
    reader = GraphDataReader(graph_data=self.graph_data,
                             node_info_factory=ProjectKGNodeInfoFactory())
    skipped_count = 0
    for node_id in self.graph_data.get_node_ids():
        node_info = reader.get_node_info(node_id)
        source_doc = self.doc_collection.get_by_id(node_id)
        if source_doc is None:
            skipped_count += 1
            continue

        kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)
        # a filter that is switched off never looks at the labels
        if (not method or "method" in node_info.labels) and \
                (not no_jdk or "jdk8" not in node_info.labels):
            texts = self.add_description(node_info, source_doc, no_out_relation, no_in_relation)
        else:
            # filtered-out nodes still get a document, with an empty "doc" field
            texts = []

        joined_text = "\n".join(text for text in texts if text)
        kg_doc.add_field("doc", joined_text)
        if node_id % 2000 == 0:
            print("doc:%r" % joined_text)
        kg_collection.add_document(kg_doc)

    kg_collection.save(output_path)
from sekg.graph.exporter.graph_data import NodeInfo
import json
from definitions import OUTPUT_DIR
from pathlib import Path

# Script fragment: loads the jabref doc collection and graph data plus two line-delimited
# JSON files, then iterates over the qualified-name records.
pro_name = 'jabref'
dc_file_location = PathUtil.doc(pro_name=pro_name, version='v1')
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name, version='v1.8')
dc_file_destination = PathUtil.doc(pro_name=pro_name, version='v1.1')
comment_json_file = Path(OUTPUT_DIR) / "json" / "mid_2_dp_comment.json"
qualified_name_json_file = Path(
    OUTPUT_DIR) / "json" / "mid_2_qualified_name.json"

if __name__ == '__main__':
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)
    # each line of the comment file is parsed as one JSON value
    comment_list = []
    # NOTE(review): the file object returned by open() is never closed explicitly
    comments = open(comment_json_file, 'r').readlines()
    for line in comments:
        comment_list.append(json.loads(line))
    # each line of the qualified-name file is parsed as one JSON value
    qualified_name_list = []
    # NOTE(review): the file object returned by open() is never closed explicitly
    names = open(qualified_name_json_file, 'r').readlines()
    for line in names:
        qualified_name_list.append(json.loads(line))
    missing_count = 0
    # Use the qualified name to find the api_id of the matching graph data node, then use
    # that api_id to find the corresponding doc in doc_collection and insert the field
    # together with its information.
    for item in qualified_name_list:
        # NOTE(review): the loop body is not visible in this view
from flask_cors import CORS
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.knowledge_service import KnowledgeService
from project.doc_service import DocService
from project.json_service import JsonService
from project.utils.path_util import PathUtil

# Flask application with CORS enabled for every route and every origin.
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# The graph data and the doc collection are loaded once, at import time.
pro_name = "jabref"
data_dir = PathUtil.doc(pro_name=pro_name, version="v1.2")
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.8")
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    data_dir)
knowledge_service = KnowledgeService(doc_collection)
doc_service = DocService()
json_service = JsonService()


@app.route('/')
def hello():
    """Return the fixed string 'success'."""
    return 'success'


# search doc info according to method name
# The request JSON must contain "qualified_name"; otherwise a plain message is returned.
# NOTE(review): the rest of this handler is not visible in this view.
@app.route('/get_doc/', methods=["GET", "POST"])
def doc_info():
    if "qualified_name" not in request.json:
        return "qualified name need"
'''
Cluster the sample code into groups and output the result.
'''
pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.9")
doc_collection_path = PathUtil.doc(pro_name=pro_name, version="v3.2")
doc_collection_save_path = PathUtil.doc(pro_name=pro_name, version="v3.3")
api_to_example_json_path = Path(
    definitions.ROOT_DIR) / "output" / "json" / "api_2_example_sorted.json"
mid_to_method_info_json_path = Path(
    definitions.ROOT_DIR
) / "output" / "json" / "mid_2_method_info_without_comment.json"
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_collection_path)
# Read the sample code files.
# api_to_mid: the mids of the sample code belonging to each API.
# methods_info: the code belonging to each mid.
with open(api_to_example_json_path, 'r') as f:
    api_to_mid = json.load(f)
# NOTE(review): redundant, the with block above has already closed f
f.close()
methods_info = list()
# each line is parsed as one JSON object and only its 'method' entry is kept
# NOTE(review): the file object returned by open() is never closed explicitly
methods = open(mid_to_method_info_json_path, 'r').readlines()
for method in methods:
    methods_info.append(json.loads(method)['method'])


# Look up the doc by qualified name.
# NOTE(review): the rest of this function is not visible in this view.
def find_doc(qualified_name):
    node: NodeInfo = graph_data.find_one_node_by_property(
        property_name='qualified_name', property_value=qualified_name)
def __init__(self):
    """
    load the jabref doc collection (version v3.3) into self.doc_collection
    """
    doc_path = PathUtil.doc(pro_name="jabref", version="v3.3")
    self.doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(doc_path)
class GraphNodeDocumentBuilder:
    """
    build a MultiFieldDocumentCollection from the nodes of a GraphData.

    build_doc() creates one MultiFieldDocument per graph node and fills fields such as
    "description", "aliases", "out_relations" and "in_relations". extract_kg_doc_collection*,
    and build_doc_for_kg derive a second collection whose documents carry one "doc" field.
    """

    def __init__(self, graph_data):
        """
        :param graph_data: a GraphData obj, or a Path / str pointing to a saved GraphData.
                           any other value leaves self.graph_data as None.
        """
        if isinstance(graph_data, GraphData):
            self.graph_data = graph_data
        elif isinstance(graph_data, (Path, str)):
            self.graph_data = GraphData.load(str(graph_data))
        else:
            self.graph_data = None
        self.graph_data_reader = GraphDataReader(graph_data=self.graph_data,
                                                 node_info_factory=ProjectKGNodeInfoFactory())
        self.doc_collection = MultiFieldDocumentCollection()

    def init(self, doc_collection):
        """
        init from a exist doc collection
        :param doc_collection: could be a str / Path pointing the path to a
                               MultiFieldDocumentCollection, or a exist
                               MultiFieldDocumentCollection obj. any other non-None value
                               sets self.doc_collection to None.
        :return:
        :raises Exception: when doc_collection is None
        """
        if doc_collection is None:
            raise Exception("init from None")
        if isinstance(doc_collection, MultiFieldDocumentCollection):
            self.doc_collection = doc_collection
        elif isinstance(doc_collection, (Path, str)):
            self.doc_collection = MultiFieldDocumentCollection.load(str(doc_collection))
        else:
            self.doc_collection = None
        print("init complete")

    def save(self, output_path):
        """
        save the current doc collection
        :param output_path: passed to the collection's save()
        """
        self.doc_collection.save(output_path)

    def build_doc_for_code_element(self, node_info: "CodeElementNodeInfo"):
        """
        fill the fields of the document of a code element node.
        "comment" and "declare" each get a field of their own; every text property is also
        collected into "description". aliases and relation texts are added as well.
        :param node_info: node whose properties, names and node_id are read
        """
        all_texts = []
        properties = ["short_description", "string_literal_expr", "comment", "declare", "inside_comment"]
        for property_name in properties:
            if property_name not in node_info.properties:
                continue
            property_value = node_info.properties[property_name]
            if isinstance(property_value, str):
                cleaned_texts = [self.clean_comment(property_value)]
            elif isinstance(property_value, list):
                cleaned_texts = [self.clean_comment(value) for value in property_value]
            else:
                cleaned_texts = []

            if property_name in ["comment", "declare"]:
                # a list value is concatenated; the former `None += str` raised TypeError here,
                # and build_doc swallowed it, so such nodes silently lost all their fields.
                # an empty list or an unsupported type still yields None.
                str_added = "".join(cleaned_texts) if cleaned_texts else None
                self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name=property_name,
                                                     value=str_added)
            all_texts.extend(cleaned_texts)

        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="description",
                                             value=" . \n".join(all_texts))
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value=" \n ".join(node_info.get_all_names()))
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def add_text_for_out_relation(self, node_id):
        """
        store the main names of all end nodes of the out relations as field "out_relations"
        :param node_id: id of the node, which is also the id of its document
        """
        out_relation_infos = self.graph_data_reader.get_all_out_relation_infos(node_id=node_id)
        related_sentences_text = []
        for relation_info in out_relation_infos:
            relation_text = relation_info.end_node_info.get_main_name()
            if relation_text is not None:
                related_sentences_text.append(relation_text)
        self.doc_collection.add_field_to_doc(doc_id=node_id, field_name="out_relations",
                                             value=" .\n ".join(related_sentences_text))

    def add_text_for_in_relation(self, node_id):
        """
        store the main names of all start nodes of the in relations as field "in_relations".
        a relation that fails is reported and skipped.
        :param node_id: id of the node, which is also the id of its document
        """
        relation_infos = self.graph_data_reader.get_all_in_relation_infos(node_id=node_id)
        related_sentences_text = []
        for relation_info in relation_infos:
            # bound before the try so that the error report below can always print it
            start_node_info = None
            try:
                start_node_info = relation_info.start_node_info
                relation_text = start_node_info.get_main_name()
            except Exception:
                print("add text error:%r" % (start_node_info,))
                traceback.print_exc()
                continue
            if relation_text is not None:
                related_sentences_text.append(relation_text)
        self.doc_collection.add_field_to_doc(doc_id=node_id, field_name="in_relations",
                                             value=" .\n ".join(related_sentences_text))

    def clean_comment(self, description):
        """
        strip markup and comment markers from a text
        :param description: raw text, parsed with BeautifulSoup's "lxml" parser
        :return: the cleaned text, or "" when cleaning fails
        """
        try:
            sent = BeautifulSoup(description, "lxml").get_text()
            # strip("/*") removes every leading/trailing '/' and '*', so no further
            # strip("//") / strip("*") pass is needed
            return sent.strip().strip("/*").strip()
        except Exception:
            traceback.print_exc()
            return ""

    def build_doc_for_domain_entity(self, node_info: "DomainEntityNodeInfo"):
        """
        fill the fields of the document of a domain entity node.
        "description" is the node's "descriptions_en" followed by the texts in which the
        entity is mentioned.
        :param node_info: node whose properties, names and node_id are read
        """
        # relation type -> property of the start node that holds the mentioning text
        mention_property_by_relation = {
            "mention in comment": "comment",
            "mention in inside comment": "inside_comment",
            "mention in short description": "short_description",
            "mention in string literal": "string_literal_expr",
        }
        all_mention_relations = set()
        for relation_type in mention_property_by_relation:
            all_mention_relations.update(set(self.graph_data.get_relations(start_id=None,
                                                                           relation_type=relation_type,
                                                                           end_id=node_info.node_id)))
        all_mention_info = []
        for start_id, relation_type, _ in all_mention_relations:
            property_name = mention_property_by_relation.get(relation_type)
            if property_name is None:
                continue
            start_properties = self.graph_data_reader.get_node_info(start_id).properties
            mention = start_properties.get(property_name, "")
            if property_name == "inside_comment" and not mention:
                # the key used to be spelled "inside comment" here, which disagrees with
                # build_doc_for_code_element; the old spelling is kept as a fallback
                mention = start_properties.get("inside comment", "")
            if isinstance(mention, list):
                all_mention_info.extend(mention)
            else:
                all_mention_info.append(mention)

        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value="\n".join(node_info.get_all_names()))
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="description",
                                             value=node_info.properties.get("descriptions_en", "") + " ".join(
                                                 all_mention_info))
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def clear(self):
        """
        replace the doc collection with a new empty one
        """
        self.doc_collection = MultiFieldDocumentCollection()

    def build_doc(self):
        """
        rebuild self.doc_collection from scratch: one document per graph node, filled by the
        build_doc_for_* method matching the node type. a node that fails is reported and
        skipped.
        """
        self.clear()
        print("start building doc")
        for node_id in self.graph_data.get_node_ids():
            node_info = self.graph_data_reader.get_node_info(node_id)
            try:
                if node_info is None:
                    continue
                doc = MultiFieldDocument(id=node_info.node_id, name=node_info.get_main_name())
                self.doc_collection.add_document(doc)
                if "code" in node_info.properties:
                    self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="doc",
                                                         value=node_info.properties["code"])
                if isinstance(node_info, CodeElementNodeInfo):
                    self.build_doc_for_code_element(node_info)
                elif isinstance(node_info, DomainEntityNodeInfo):
                    self.build_doc_for_domain_entity(node_info)
                elif isinstance(node_info, OperationEntityNodeInfo):
                    self.build_doc_for_operation_entity(node_info)
                elif isinstance(node_info, WikidataEntityNodeInfo):
                    self.build_doc_for_wikidata_entity(node_info)
                else:
                    self.build_doc_for_sentence(node_info)
            except Exception:
                # best effort: one broken node must not stop the whole build
                traceback.print_exc()
                print("build doc error %r" % (node_info,))
        print("end building doc")

    def build_doc_with_pure_code(self):
        """
        add one document with a single "doc" field for every named node that has a "code"
        property. the existing collection is not cleared first.
        """
        print("start building doc")
        for node_id in self.graph_data.get_node_ids():
            node_info = self.graph_data_reader.get_node_info(node_id)
            try:
                if node_info is None or node_info.get_main_name() is None:
                    continue
                if "code" in node_info.properties:
                    doc = MultiFieldDocument(id=node_info.node_id, name=node_info.get_main_name())
                    self.doc_collection.add_document(doc)
                    self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="doc",
                                                         value=node_info.properties["code"])
            except Exception:
                traceback.print_exc()
                print("build doc error %r" % (node_info,))
        print("end building doc")

    @staticmethod
    def _entity_node_types():
        """the node info classes described by main name, aliases, relations and description"""
        return CodeElementNodeInfo, DomainEntityNodeInfo, OperationEntityNodeInfo, WikidataEntityNodeInfo

    @staticmethod
    def _entity_kg_texts(node_info, doc):
        """the texts an entity node contributes to the kg "doc" field, in their fixed order"""
        return [node_info.get_main_name(),
                doc.get_doc_text_by_field("aliases"),
                doc.get_doc_text_by_field("out_relations"),
                doc.get_doc_text_by_field("description")]

    def _derive_doc_collection(self, describe):
        """
        build a new collection with one single-field ("doc") document per existing document
        :param describe: callable (node_info, doc) -> list of texts. falsy texts are dropped,
                         the rest is joined by newlines.
        :return: the new MultiFieldDocumentCollection. graph nodes without a document in
                 self.doc_collection are skipped.
        """
        sub_doc_collection = MultiFieldDocumentCollection()
        graph_data_reader = GraphDataReader(graph_data=self.graph_data,
                                            node_info_factory=ProjectKGNodeInfoFactory())
        for node_id in self.graph_data.get_node_ids():
            node_info = graph_data_reader.get_node_info(node_id)
            doc = self.doc_collection.get_by_id(node_id)
            if doc is None:
                continue
            new_doc = MultiFieldDocument(id=doc.id, name=doc.name)
            description = "\n".join([text for text in describe(node_info, doc) if text])
            new_doc.add_field("doc", description)
            if node_id % 2000 == 0:
                # prints the text of every node whose id is a multiple of 2000
                print("doc:%r" % description)
            sub_doc_collection.add_document(new_doc)
        return sub_doc_collection

    def extract_kg_doc_collection_with_method(self, output_path, **config):
        """
        derive a one-field ("doc") collection, optionally restricted by node label, and save it
        :param output_path: where the derived collection is saved
        :param config: optional switches, each defaulting to False:
                       no_out_relation / no_in_relation are forwarded to add_description,
                       no_jdk leaves the text empty for nodes labelled "jdk8",
                       with_method leaves the text empty for nodes not labelled "method"
        :return:
        """
        print("start building doc")
        no_out_relation = config.get("no_out_relation", False)
        no_in_relation = config.get("no_in_relation", False)
        no_jdk = config.get("no_jdk", False)
        method = config.get("with_method", False)
        print("*" * 20)
        print("no_in_relation %r, no_out_relation %r, no_jdk %r, with_method %r" % (
            no_in_relation, no_out_relation, no_jdk, method))

        def describe(node_info, doc):
            # a filter that is switched off never looks at the labels
            if method and "method" not in node_info.labels:
                return []
            if no_jdk and "jdk8" in node_info.labels:
                return []
            return self.add_description(node_info, doc, no_out_relation, no_in_relation)

        self._derive_doc_collection(describe).save(output_path)

    def add_description(self, node_info, doc, no_out_relation, no_in_relation=False):
        """
        collect the texts describing a node
        :param node_info: node to describe
        :param doc: document of the node, the fields are read from it
        :param no_out_relation: leave out the "out_relations" field when truthy
        :param no_in_relation: leave out the "in_relations" field when truthy
        :return: list of texts; empty when the node is none of the four entity types
        """
        if isinstance(node_info, self._entity_node_types()):
            return self.add_one_description(node_info, doc, no_out_relation, no_in_relation)
        return []

    def add_one_description(self, node_info, doc, no_out_relation, no_in_relation):
        """
        collect main name, aliases, description and, unless switched off, the relation texts
        :param node_info: node whose main name is read
        :param doc: document the fields are read from
        :param no_out_relation: leave out the "out_relations" field when truthy
        :param no_in_relation: leave out the "in_relations" field when truthy
        :return: list of texts, entries may be None or empty
        """
        descriptions = [node_info.get_main_name(),
                        doc.get_doc_text_by_field("aliases"),
                        doc.get_doc_text_by_field("description")]
        if not no_out_relation:
            descriptions.append(doc.get_doc_text_by_field("out_relations"))
        if not no_in_relation:
            descriptions.append(doc.get_doc_text_by_field("in_relations"))
        return descriptions

    def extract_kg_doc_collection(self, output_path):
        """
        extract the necessary field of text as a new doc
        :param output_path: where the derived collection is saved
        :return:
        """

        def describe(node_info, doc):
            # one combined type check, so a node matching several entity types gets its
            # texts once instead of once per matching type
            if isinstance(node_info, self._entity_node_types()):
                return self._entity_kg_texts(node_info, doc)
            return []

        self._derive_doc_collection(describe).save(output_path)

    def build_doc_for_kg(self, output_path=None):
        """
        build the doc for kg, only include aliases, out relation, description
        :param output_path: where the derived collection is saved; nothing is saved when None
        :return: the derived MultiFieldDocumentCollection
        """
        self.clear()
        self.build_doc()

        def describe(node_info, doc):
            if isinstance(node_info, self._entity_node_types()):
                return self._entity_kg_texts(node_info, doc)
            # every other node is treated like a sentence node
            return [doc.get_doc_text_by_field("short_description_sentences"),
                    doc.get_doc_text_by_field("out_relations")]

        sub_doc_collection = self._derive_doc_collection(describe)
        if output_path is not None:
            sub_doc_collection.save(output_path)
        print("collection len{}".format(sub_doc_collection.get_num()))
        return sub_doc_collection

    def build_doc_for_sentence(self, node_info):
        """
        fill the fields of the document of a sentence node
        :param node_info: node whose "sentence_name" property is required
        """
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="short_description_sentences",
                                             value=node_info.properties["sentence_name"])
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def build_doc_for_operation_entity(self, node_info):
        """
        fill aliases and relation texts of the document of an operation entity node
        :param node_info: node whose names and node_id are read
        """
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value="\n".join(node_info.get_all_names()))
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def build_doc_for_wikidata_entity(self, node_info):
        """
        fill aliases, relation texts and description of the document of a wikidata entity node
        :param node_info: node whose names, node_id and "descriptions_en" property are read
        """
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value="\n".join(node_info.get_all_names()))
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="description",
                                             value=node_info.properties.get("descriptions_en", ""))
def clear(self):
    """
    drop everything built so far by swapping in a new empty MultiFieldDocumentCollection
    """
    empty_collection = MultiFieldDocumentCollection()
    self.doc_collection = empty_collection