def load_graph_data(is_jdk=True, version="v1"):
    """Load the knowledge graph for the JDK corpus (default) or Android.

    :param is_jdk: True selects the JDK graph, False the Android graph.
    :param version: graph data version tag, e.g. "v1".
    :return: the loaded GraphData instance.
    """
    path = (PathUtil.jdk_graph_data(version)
            if is_jdk else PathUtil.android_graph_data(version))
    return GraphData.load(path)
def load_doc(project_name="android27", version="v1"):
    """Load the document collection for a project.

    :param project_name: corpus name, e.g. "jdk8" or "android27".
    :param version: document collection version tag.
    :return: the loaded MultiFieldDocumentCollection.
    """
    collection_path = PathUtil.doc(pro_name=project_name, version=version)
    return MultiFieldDocumentCollection.load(collection_path)
def __init__(self, doc_collection):
    """Load the jabref v3.7 graph and keep the given document collection.

    :param doc_collection: a pre-loaded MultiFieldDocumentCollection.
    """
    self.graph_data = GraphData.load(
        PathUtil.graph_data(pro_name="jabref", version="v3.7"))
    self.doc_collection = doc_collection
    # Entity vocabulary and per-entity scores; empty until populated.
    self.entity_words = set()
    self.entity_2_score = {}
    self.counter = 0
    # JSON file the entity vocabulary is read from.
    self.entity_path = str(Path(OUTPUT_DIR) / "entity.json")
def __init__(self, doc_collection, graph_data_path=None):
    """Set up the classifier service around a graph and a doc collection.

    :param doc_collection: a pre-loaded MultiFieldDocumentCollection.
    :param graph_data_path: either a path to a saved graph or an
        already-loaded GraphData; defaults to the jabref v3.10 graph.
    """
    # FIX: the original computed the default via
    # PathUtil.graph_data(...) directly in the signature, which runs at
    # import time of this module. Use a None sentinel and compute the
    # same default lazily at call time (behavior-compatible, assuming
    # PathUtil.graph_data is a pure path builder — TODO confirm).
    if graph_data_path is None:
        graph_data_path = PathUtil.graph_data(pro_name="jabref",
                                              version="v3.10")
    # Accept either a live GraphData or a path to load one from.
    if isinstance(graph_data_path, GraphData):
        self.graph_data: GraphData = graph_data_path
    else:
        self.graph_data: GraphData = GraphData.load(graph_data_path)
    self.doc_collection = doc_collection
    self.functionClassifier = FastTextClassifier()
    # Undirected networkx view of the underlying graph.
    self.G = nx.Graph(self.graph_data.graph)
def __init__(self, input_graph_version):
    """Load the jabref graph at *input_graph_version*, a cached
    api-id -> record-text pickle, and build the relation/category
    lookup sets used by this builder.

    :param input_graph_version: graph version tag, e.g. "v1.4".
    """
    # Output pickle for expanded prefix/suffix relations.
    self.save_expand_res_path = str(
        Path(OUTPUT_DIR) / "prefix_suffix_relations.pickle")
    # Cache mapping api id -> record text, loaded from a pickle.
    self.api_id_2_record_text_path = str(
        Path(OUTPUT_DIR) / "api_id_2_record.pickle")
    self.api_id_2_record_text = Tool.load_pickle(
        self.api_id_2_record_text_path)
    graph_data_path = PathUtil.graph_data(pro_name="jabref",
                                          version=input_graph_version)
    self.graph_data = GraphData.load(graph_data_path)
    # Relation-name groups: functionality/behavior relations ...
    self.func_relation_set = {
        RelationNameConstant.has_Functionality_Relation,
        RelationNameConstant.Functionality_Compare_Relation,
        RelationNameConstant.has_Behavior_Relation,
    }
    # ... concept classification (is-a) ...
    self.concept_classification = {
        RelationNameConstant.Ontology_IS_A_Relation,
    }
    # ... membership (derive) ...
    self.membership = {
        RelationNameConstant.Ontology_Derive_Relation,
    }
    # ... and characteristics (feature/constraint).
    self.characteristic = {
        RelationNameConstant.has_Feature_Relation,
        RelationNameConstant.has_Constraint_Relation,
    }
    # Filled lazily: category name -> node id.
    self.category_name_2_id = dict()
    # Entity categories treated as class-like.
    self.type_of_class = {
        CodeEntityCategory.CATEGORY_CLASS,
        CodeEntityCategory.CATEGORY_INTERFACE,
        CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
        CodeEntityCategory.CATEGORY_ERROR_CLASS,
        CodeEntityCategory.CATEGORY_ENUM_CLASS,
        CodeEntityCategory.CATEGORY_ANNOTATION_CLASS
    }
    # Entity categories treated as method-like.
    self.type_of_method = {
        CodeEntityCategory.CATEGORY_METHOD,
        CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD,
        CodeEntityCategory.CATEGORY_BASE_OVERRIDE_METHOD,
    }
    self.CODE_NAME_UTIL = CodeElementNameUtil()
from sekg.graph.exporter.graph_data import NodeInfo, GraphData
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from project.utils.path_util import PathUtil

# Demo: look up one API node in the jabref v1 graph and print its docs.
pro_name = "jabref"
doc_path = PathUtil.doc(pro_name=pro_name, version="v1")
graph_data_path = PathUtil.graph_data(pro_name="jabref", version="v1")
graph_data = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_path)

# e.g. org.jabref.model.metadata.event.MetaDataChangedEvent
api_name = "org.jabref.model.metadata.event.MetaDataChangedEvent"
node = graph_data.find_one_node_by_property(property_name='qualified_name',
                                            property_value=api_name)
doc: MultiFieldDocument = doc_collection.get_by_id(node["id"])

# Same key insertion order as before: 'doc_info' first, then 'api_name'.
return_data = {'doc_info': {}, 'api_name': api_name}
for field in ('full_html_description', 'full_description',
              'sentence_description'):
    return_data['doc_info'][field] = doc.get_doc_text_by_field(field)
print(return_data)
def __init__(self):
    """Load the jabref v3.3 document collection."""
    doc_dir = PathUtil.doc(pro_name="jabref", version="v3.3")
    self.doc_collection: MultiFieldDocumentCollection = (
        MultiFieldDocumentCollection.load(doc_dir))
from project.utils.path_util import PathUtil
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from sekg.ir.doc.wrapper import MultiFieldDocument, MultiFieldDocumentCollection
import json
import definitions
from pathlib import Path

# Input graph (v3.9) and doc collection (v3.1); output saved as v3.2.
pro_name = 'jabref'
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version='v3.9')
doc_collection_path = PathUtil.doc(pro_name=pro_name, version='v3.1')
doc_collection_save_path = PathUtil.doc(pro_name=pro_name, version='v3.2')
api_to_example_json_path = Path(
    definitions.ROOT_DIR) / "output" / "json" / "api_2_example_sorted.json"
mid_to_method_info_json_path = Path(
    definitions.ROOT_DIR
) / "output" / "json" / "mid_2_method_info_without_comment.json"
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_collection_path)
''' doc文件抽取样例代码 '''


def find_doc(qualified_name):
    # Exact match on qualified_name first; fall back to a prefix match
    # when no node has the exact name.
    node: NodeInfo = graph_data.find_one_node_by_property(
        property_name='qualified_name', property_value=qualified_name)
    if node is None:
        node: NodeInfo = graph_data.find_one_node_by_property_value_starts_with(
            property_name='qualified_name',
            property_value_starter=qualified_name)
    # NOTE(review): this chunk is truncated here — the rest of find_doc
    # is not visible in this view.
from project.utils.path_util import PathUtil
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from sekg.graph.exporter.graph_data import GraphData
from sekg.graph.exporter.graph_data import NodeInfo
import json
from definitions import OUTPUT_DIR
from pathlib import Path

# Input doc collection (v1) and graph (v1.8); output saved as v1.1.
pro_name = 'jabref'
dc_file_location = PathUtil.doc(pro_name=pro_name, version='v1')
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name,
                                               version='v1.8')
dc_file_destination = PathUtil.doc(pro_name=pro_name, version='v1.1')
comment_json_file = Path(OUTPUT_DIR) / "json" / "mid_2_dp_comment.json"
qualified_name_json_file = Path(
    OUTPUT_DIR) / "json" / "mid_2_qualified_name.json"

if __name__ == '__main__':
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)
    # FIX: the original called open() without ever closing the handles,
    # leaking two file descriptors. Use context managers, and build each
    # list (one parsed JSON object per line) with a comprehension.
    with open(comment_json_file, 'r') as f:
        comment_list = [json.loads(line) for line in f]
    with open(qualified_name_json_file, 'r') as f:
        qualified_name_list = [json.loads(line) for line in f]
from flask import Flask, request, jsonify
from flask_cors import CORS
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.knowledge_service import KnowledgeService
from project.doc_service import DocService
from project.json_service import JsonService
from project.utils.path_util import PathUtil

app = Flask(__name__)
# Allow cross-origin requests from any origin on every route.
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# Load the jabref v1.2 doc collection and v1.8 graph at import time;
# the services below share them for the lifetime of the process.
pro_name = "jabref"
data_dir = PathUtil.doc(pro_name=pro_name, version="v1.2")
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.8")
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    data_dir)
knowledge_service = KnowledgeService(doc_collection)
doc_service = DocService()
json_service = JsonService()


@app.route('/')
def hello():
    # Health-check endpoint.
    return 'success'


# search doc info according to method name
@app.route('/get_doc/', methods=["GET", "POST"])
def doc_info():
    # NOTE(review): chunk is truncated here — the body of doc_info is
    # not visible in this view.
# NOTE(review): this chunk begins mid-method — the first line below is
# the tail of a call taking node_id; the enclosing def and loop are not
# visible, so the indentation is a best-effort reconstruction.
            node_id)
            if node_doc:
                full_description = node_doc.get_doc_text_by_field(
                    'full_description')
                # For each concept group, resolve the first concept that
                # already has a graph node ...
                for concept_list_item in self.concepts_list:
                    concept_node_id = -1
                    for concept in concept_list_item:
                        if concept in self.concept_2_node_id:
                            concept_node_id = self.concept_2_node_id[concept]
                            break
                    # ... then link this node to it if any concept of the
                    # group is mentioned in the description text.
                    if concept_node_id >= 0:
                        for concept in concept_list_item:
                            if concept in full_description:
                                self.graph.add_relation(
                                    node_id, "has concept", concept_node_id)
                                break
        print("relation添加完毕")


if __name__ == "__main__":
    concept_and_relation_path = Path(DATA_DIR) / "concept_and_relation"
    # Build from the v3.8 graph + v3.3 docs, save the result as v3.9.
    concept_2_graph = Concept2Graph(
        PathUtil.graph_data("jabref", "v3.8"),
        PathUtil.doc(pro_name="jabref", version='v3.3'),
        str(concept_and_relation_path / "concepts.json"),
        str(concept_and_relation_path / "relations.json"))
    concept_2_graph.add_concept_2_graph()
    concept_2_graph.add_relation_2_graph()
    concept_2_graph.graph.save(PathUtil.graph_data("jabref", "v3.9"))
    print("图导入完成")
from pathlib import Path
from sekg.pipeline.base import KGBuildPipeline
from definitions import OUTPUT_DIR
from project.extractor_module.structure_extractor.characteristic_structure_extractor import CharacteristicStructureExtractor
from project.utils.path_util import PathUtil

if __name__ == '__main__':
    # Run the characteristic extractor over the jabref v1 graph and
    # save the augmented graph as v1.1.
    project = "jabref"
    kg_pipeline = KGBuildPipeline()
    kg_pipeline.load_graph(PathUtil.graph_data(pro_name=project, version="v1"))
    extractor = CharacteristicStructureExtractor()
    extractor.set_json_save_path(
        Path(OUTPUT_DIR) / "json" / "name_characteristic.json")
    extractor.set_save_path(
        PathUtil.graph_data(pro_name=project, version="v1.1"))
    kg_pipeline.add_component("从名称和结构中抽取特征", extractor)
    kg_pipeline.run()
# NOTE(review): this chunk begins mid-method — the enclosing def is not
# visible; the statements below build a statement node and add it to
# the graph.
        label_info = {"entity"}
        type_class = self.get_record_entity_type_by_relation(statement.r_name)
        label_info.add(str(type_class.LABEL))
        label_info.add(str("statement"))
        # Primary property carries the entity name for this node type.
        node_properties = {
            type_class.PRIMARY_PROPERTY_NAME: statement.e_name,
        }
        # Copy extractor-provided extra info onto the node verbatim.
        for extra_info_key in statement.extra_info:
            node_properties[extra_info_key] = statement.extra_info[
                extra_info_key]
        node_properties["which_extractor"] = statement.which_extractor
        node_properties["e_type"] = statement.e_type
        node_properties["s_name"] = statement.s_name
        node_properties["r_name"] = statement.r_name
        graph_id = self.graph_data.add_node(
            label_info,
            node_properties,
            primary_property_name=type_class.PRIMARY_PROPERTY_NAME)
        return graph_id


if __name__ == '__main__':
    start_time = time.asctime(time.localtime(time.time()))
    print(start_time)
    # Build the diff graph from v1.4 and persist it as v1.5.
    api_diff_graph_builder = APIDiffGraphBuilder(input_graph_version="v1.4")
    api_diff_graph_builder.build_simple_graph()
    api_diff_graph_builder.graph_data.save(
        PathUtil.graph_data(pro_name="jabref", version="v1.5"))
    end_time = time.asctime(time.localtime(time.time()))
    print(end_time)
return "No sample code available." sample_code = doc.get_doc_text_by_field('sample_code') if len(sample_code) == 0 or sample_code is None: return "No sample code available." else: return sample_code[0][2:] # 返回相关api def get_related_api(self, qualified_name): result = dict() api_id = self.get_api_id_by_name(qualified_name) node: NodeInfo = self.graph_data.find_nodes_by_ids(api_id)[0] related_api = list() related_api_simplified = list() related_api = node['properties']['simrank'] for i in related_api: related_api_simplified.append(i[i.rfind('.') + 1:]) result['related_api'] = related_api result['related_api_simplified'] = related_api_simplified return result if __name__ == '__main__': pro_name = "jabref" data_dir = PathUtil.doc(pro_name=pro_name, version="v3.3") doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load( data_dir) knowledge_service = KnowledgeService(doc_collection) print(knowledge_service.get_related_api("org.jabref.model.entry.BibEntry"))
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.knowledge_service import KnowledgeService
from project.doc_service import DocService
from project.json_service import JsonService
from project.utils.path_util import PathUtil
from pathlib import Path
import definitions
import json

app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})

# Shared, process-lifetime data: jabref v3.3 docs and v3.10 graph.
pro_name = "jabref"
doc_dir = PathUtil.doc(pro_name=pro_name, version="v3.3")
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.10")
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_dir)
simple_qualified_name_map_path = Path(
    definitions.ROOT_DIR) / "output" / "simple_qualified_name_map.json"
knowledge_service = KnowledgeService(doc_collection, graph_data)
doc_service = DocService()
json_service = JsonService()

# Map simple names -> qualified names, read once at startup.
with open(simple_qualified_name_map_path, 'r') as f:
    simple_qualified_name_map = json.load(f)
print("load complete")
from project.utils.path_util import PathUtil
from pathlib import Path
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from sekg.ir.doc.wrapper import MultiFieldDocument, MultiFieldDocumentCollection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
import definitions
import json
'''
将样例代码进行聚类划分并输出
'''
# Input graph (v3.9) and doc collection (v3.2); output saved as v3.3.
pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.9")
doc_collection_path = PathUtil.doc(pro_name=pro_name, version="v3.2")
doc_collection_save_path = PathUtil.doc(pro_name=pro_name, version="v3.3")
api_to_example_json_path = Path(
    definitions.ROOT_DIR) / "output" / "json" / "api_2_example_sorted.json"
mid_to_method_info_json_path = Path(
    definitions.ROOT_DIR
) / "output" / "json" / "mid_2_method_info_without_comment.json"
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_collection_path)
# Load sample-code files. api_to_mid: mid of each API's sample code;
# methods_info: the code for each mid.
with open(api_to_example_json_path, 'r') as f:
    api_to_mid = json.load(f)
# FIX: removed the redundant f.close() that sat inside the with block —
# the context manager already closes the file.
from project.utils.path_util import PathUtil
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.classification_module import method_classification
from nltk.corpus import wordnet as wn
from project.classification_module.method_classification import split

if __name__ == '__main__':
    # 1. collect all method nodes in the graph
    # 2. pass each qualified_name to the classifier
    pro_name = "jabref"
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.6")
    graph_data: GraphData = GraphData.load(graph_data_path)
    graph_data_output_path = PathUtil.graph_data(pro_name=pro_name,
                                                 version='v1.7')
    methods_id: set = graph_data.get_node_ids_by_label("method")
    # WordNet noun/verb lemma sets used as a fallback classifier.
    nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
    verbs = {x.name().split('.', 1)[0] for x in wn.all_synsets('v')}
    count = [0, 0, 0, 0, 0]
    for i in iter(methods_id):
        node: NodeInfo = graph_data.find_nodes_by_ids(i)[0]
        qualified_name = node['properties']['qualified_name']
        label = method_classification.basic_classification(qualified_name)
        # FIX: compare strings with ==, not `is`. Identity comparison
        # against a string literal relies on CPython interning and is a
        # SyntaxWarning on Python 3.8+.
        if label == "undefined":
            # Fall back: classify by the first camel-case word.
            first_word = split(camel_case=qualified_name)
            if first_word in verbs:
                label = "mutator"
            if first_word in nouns:
                label = "accessor"
        if label == "accessor":
            count[0] += 1
import networkx as nx
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.utils.path_util import PathUtil

if __name__ == '__main__':
    # Compute PageRank over the v1.3 graph and save the annotated
    # graph as v1.4.
    pro_name = "jabref"
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.3")
    output_path = PathUtil.graph_data(pro_name=pro_name, version="v1.4")
    graph_data: GraphData = GraphData.load(graph_data_path)
    nx_graph = nx.Graph(graph_data.graph)
    # result is a dict {node_id: pr_value}; store each value on the
    # corresponding node's properties.
    result = nx.pagerank(nx_graph)
    # FIX: iterate the pagerank result itself instead of the hard-coded
    # id range 1..41167, so the script works for any graph size and
    # any node-id numbering.
    for node_id, pr_value in result.items():
        node: NodeInfo = graph_data.find_nodes_by_ids(node_id)[0]
        node["properties"]["pr_value"] = pr_value
    graph_data.save(output_path)
"""
Classify methods into five categories:
accessor, mutator, creational, constructor, undefined.
"""
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.utils.path_util import PathUtil
from nltk.corpus import wordnet as wn

pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.4")
graph_data: GraphData = GraphData.load(graph_data_path)
# Leading method-name words that signal each category.
accessor_key_word = ("get", "toString", "find", "search", "test", "contains",
                     "is", "has", "show")
mutator_key_word = ("set", "add", "delete", "move", "remove", "parse",
                    "insert", "extract", "open")
creational_key_word = ("copy", "construct", "create")
# WordNet noun/verb lemma sets used as a fallback classifier.
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
verbs = {x.name().split('.', 1)[0] for x in wn.all_synsets('v')}


def get_pure_method_name_without_parameter(qualified_name=None):
    """Strip the parameter list and the package/class prefix from a
    method's qualified name, e.g. "a.b.C.foo(int)" -> "foo".

    :raises ValueError: when qualified_name is None or empty.
    """
    # FIX: compare strings with ==, not `is`. Identity comparison
    # against a string literal relies on interning and is a
    # SyntaxWarning on Python 3.8+.
    if qualified_name is None or qualified_name == "":
        raise ValueError("qualified name needed")
    # NOTE(review): assumes a "(" is present; without one, find()
    # returns -1 and the last character would be dropped — confirm all
    # callers pass a full method signature.
    qualified_name = qualified_name[:qualified_name.find("(")]
    result = qualified_name[qualified_name.rfind(".") + 1:]
    return result


# Do the most basic classification based on the keyword tuples above.
from project.utils.path_util import PathUtil
from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.classification_module import method_classification
from nltk.corpus import wordnet as wn
from project.classification_module.method_classification import split
'''
对method进行分类
'''
if __name__ == '__main__':
    # 1. collect all method nodes in the graph
    # 2. pass each qualified_name to the classifier
    pro_name = "jabref"
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.4")
    graph_data: GraphData = GraphData.load(graph_data_path)
    graph_data_output_path = PathUtil.graph_data(pro_name=pro_name,
                                                 version='v3.5')
    methods_id: set = graph_data.get_node_ids_by_label("method")
    # WordNet noun/verb lemma sets used as a fallback classifier.
    nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
    verbs = {x.name().split('.', 1)[0] for x in wn.all_synsets('v')}
    count = [0, 0, 0, 0, 0]
    for i in iter(methods_id):
        node: NodeInfo = graph_data.find_nodes_by_ids(i)[0]
        qualified_name = node['properties']['qualified_name']
        label = method_classification.basic_classification(qualified_name)
        # FIX: compare strings with ==, not `is`. Identity comparison
        # against a string literal relies on CPython interning and is a
        # SyntaxWarning on Python 3.8+.
        if label == "undefined":
            # Fall back: classify by the first camel-case word.
            first_word = split(camel_case=qualified_name)
            if first_word in verbs:
                label = "mutator"
            if first_word in nouns:
                label = "accessor"
from pathlib import Path
from sekg.pipeline.base import KGBuildPipeline
from definitions import OUTPUT_DIR
from project.extractor_module.structure_extractor.category_structure_extractor import CategoryStructureExtractor
from project.extractor_module.structure_extractor.characteristic_structure_extractor import CharacteristicStructureExtractor
from project.extractor_module.structure_extractor.func_name_extractor import FuncNameExtractor
from project.utils.path_util import PathUtil

if __name__ == '__main__':
    # Stage 1: extract characteristics from names/structure, v1 -> v1.1.
    pipeline = KGBuildPipeline()
    pro_name = "jabref"
    graph_data_v1_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    pipeline.load_graph(graph_data_v1_path)
    component1 = CharacteristicStructureExtractor()
    component1.set_json_save_path(
        Path(OUTPUT_DIR) / "json" / "name_characteristic.json")
    component1.set_save_path(
        PathUtil.graph_data(pro_name=pro_name, version="v1.1"))
    pipeline.add_component("从名称和结构中抽取特征", component1)
    pipeline.run()

    # Stage 2: extract functionality from names, starting from v1.1.
    pipeline = KGBuildPipeline()
    pro_name = "jabref"
    graph_data_v1_path = PathUtil.graph_data(pro_name=pro_name, version="v1.1")
    pipeline.load_graph(graph_data_v1_path)
    component1 = FuncNameExtractor()
    component1.set_json_save_path(
        Path(OUTPUT_DIR) / "json" / "name_functionality.json")
    # NOTE(review): chunk is truncated here — the set_save_path call is
    # cut off mid-argument in this view.
    component1.set_save_path(
# NOTE(review): this chunk begins mid-method, inside a try block whose
# opening lines are not visible; the indentation below is a best-effort
# reconstruction.
                print(self.counter)
                self.graph_data.add_relation(start_id, relation_str, end_id)
        except Exception as e:
            # Broad catch: log and continue rather than aborting the run.
            print(e)

    def load_entity_words(self):
        # Populate entity_words and entity_2_score from the entity JSON
        # (each record carries "entity_name" and "tf_idf").
        load_dict = self.load_json(self.entity_path)
        for each in load_dict:
            self.entity_words.add(each["entity_name"])
            self.entity_2_score[each["entity_name"]] = each["tf_idf"]

    def load_json(self, path):
        # Read and parse one JSON file.
        with open(path, "r") as load_f:
            load_dict = json.load(load_f)
        return load_dict

    def save_graph(self, output_path):
        # Persist the (modified) graph to output_path.
        self.graph_data.save(output_path)


if __name__ == '__main__':
    pro_name = "jabref"
    data_dir = PathUtil.doc(pro_name=pro_name, version="v3.3")
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        data_dir)
    # Link entities across all APIs, then save the graph as v3.8.
    entity_service = EntityService(doc_collection)
    entity_service.link_all_api_entity()
    entity_service.save_graph(
        str(PathUtil.graph_data(pro_name="jabref", version="v3.8")))
    print("counter:" + str(entity_service.counter))