Example #1
 def __init__(self,
              filter_score=DEFAULT_FILTER_CONTEXT_SCORE,
              proxy_server=DEFAULT_PROXY_SERVER):
     self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
     self.wikipedia_cache = {}
     self.fetcher = AsyncWikiSearcher(proxy_server)
     self.graph_data = GraphData()
     self.wikidata_property_table = WikiDataPropertyTable.get_instance()
     self.embedding = {}
     self.filter_score = filter_score
     self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline()
     self.all_domain_vector = {}
Example #2
    def __init__(self, graph_data):
        if isinstance(graph_data, GraphData):
            self.graph_data = graph_data
        elif isinstance(graph_data, Path):
            self.graph_data = GraphData.load(str(graph_data))
        elif isinstance(graph_data, str):
            self.graph_data = GraphData.load(graph_data)
        else:
            self.graph_data = None

        self.graph_data_reader = GraphDataReader(graph_data=self.graph_data,
                                                 node_info_factory=ProjectKGNodeInfoFactory())

        self.doc_collection = MultiFieldDocumentCollection()
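The constructor above accepts a pre-loaded GraphData instance, a pathlib.Path, or a plain string path. A minimal call sketch of the three equivalent styles (the owning class name `Searcher` is hypothetical, standing in for whichever class defines this __init__, and the path is a placeholder):

from pathlib import Path

searcher_a = Searcher(GraphData.load("jabref.v1.graph"))  # pre-loaded GraphData
searcher_b = Searcher(Path("jabref.v1.graph"))            # pathlib.Path
searcher_c = Searcher("jabref.v1.graph")                  # plain string path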
Example #3
    def load_graph_data(is_jdk=True, version="v1"):
        if is_jdk:
            graph_data_path = PathUtil.jdk_graph_data(version)
        else:
            graph_data_path = PathUtil.android_graph_data(version)

        return GraphData.load(graph_data_path)
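A quick call sketch for Example #3, assuming the function is exposed as a static helper; PathUtil resolves the actual file locations:

jdk_graph = load_graph_data(is_jdk=True, version="v1")       # JDK graph
android_graph = load_graph_data(is_jdk=False, version="v1")  # Android graph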
Example #4
def kg_importer(path):
    graph_client = GRAPH_FACTORY.create_py2neo_graph_by_server_name(
        server_name="SOSampleCodeKG")
    accessor = GraphAccessor(graph_client)
    importer = Neo4jImporter(accessor)

    # graph_data_path = str(Path(GRAPH_DATA_DIR) / 'jdk8.v5.graph')
    graph_data: GraphData = GraphData.load(path)
    importer.import_all_graph_data(graph_data)
Example #5
 def __init__(self, doc_collection):
     graph_data_path = PathUtil.graph_data(pro_name="jabref",
                                           version="v3.7")
     self.graph_data = GraphData.load(graph_data_path)
     self.doc_collection = doc_collection
     self.entity_words = set()
     self.entity_2_score = dict()
     self.counter = 0
     self.entity_path = str(Path(OUTPUT_DIR) / "entity.json")
Example #6
 def __init__(self, graph_data_path, dc_file_location, concepts_path,
              relations_path):
     self.graph: GraphData = GraphData.load(graph_data_path)
     self.doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
         dc_file_location)
     with open(concepts_path) as f:
         self.concepts_list = json.load(f)
     with open(relations_path) as f:
         self.relations_list = json.load(f)
     self.concept_2_node_id = {}
Example #7
 def __init__(self,
              doc_collection,
              graph_data_path=PathUtil.graph_data(pro_name="jabref",
                                                  version="v3.10")):
     if isinstance(graph_data_path, GraphData):
         self.graph_data: GraphData = graph_data_path
     else:
         self.graph_data: GraphData = GraphData.load(graph_data_path)
     self.doc_collection = doc_collection
     self.functionClassifier = FastTextClassifier()
     self.G = nx.Graph(self.graph_data.graph)
Example #8
def train_weight_graph_data(graph_data_output_dir, node2vec_output_dir,
                            pro_name, version):
    graph_random_walk_path = str(
        node2vec_output_dir /
        "{pro}.{version}.weight.rwp".format(pro=pro_name, version=version))
    trainer = GraphNode2VecTrainer(
        GraphData.load(
            str(graph_data_output_dir / ("{pro}.{version}.graph".format(
                pro=pro_name, version=version)))))
    trainer.init_weight_graph(weight=True)
    trainer.generate_random_path(rw_path_store_path=graph_random_walk_path)
    graph2vec_model_path = str(node2vec_output_dir /
                               "{pro}.{version}.weight.node2vec".format(
                                   pro=pro_name, version=version))
    GraphNode2VecTrainer.train(rw_path_store_path=graph_random_walk_path,
                               model_path=graph2vec_model_path,
                               dimensions=100)
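A call sketch for train_weight_graph_data, borrowing the directory layout that Example #9 builds; OUTPUT_DIR and the .graph file are assumed to exist:

from pathlib import Path

pro_name, version = "jabref", "v3.10"
graph_data_output_dir = Path(OUTPUT_DIR) / "graph" / pro_name
node2vec_output_dir = graph_data_output_dir / "GraphEmbedding"
node2vec_output_dir.mkdir(exist_ok=True, parents=True)
train_weight_graph_data(graph_data_output_dir, node2vec_output_dir, pro_name, version)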
Example #9
def train_node2vec(pro_name, version):
    print("train node2vec for %s at version %s" % (pro_name, version))
    graph_data_output_dir = Path(OUTPUT_DIR) / "graph" / pro_name
    graph_data_output_dir.mkdir(exist_ok=True, parents=True)
    node2vec_output_dir = graph_data_output_dir / "GraphEmbedding"
    node2vec_output_dir.mkdir(exist_ok=True, parents=True)
    graph_random_walk_path = str(
        node2vec_output_dir /
        "{pro}.{version}.unweight.rwp".format(pro=pro_name, version=version))
    trainer = GraphNode2VecTrainer(
        GraphData.load(
            str(graph_data_output_dir / ("{pro}.{version}.graph".format(
                pro=pro_name, version=version)))))
    trainer.init_unweight_graph()
    trainer.generate_random_path(rw_path_store_path=graph_random_walk_path)
    graph2vec_model_path = str(node2vec_output_dir /
                               "{pro}.{version}.unweight.node2vec".format(
                                   pro=pro_name, version=version))
    GraphNode2VecTrainer.train(rw_path_store_path=graph_random_walk_path,
                               model_path=graph2vec_model_path,
                               dimensions=100)
Example #10
 def __init__(self, input_graph_version):
     self.save_expand_res_path = str(
         Path(OUTPUT_DIR) / "prefix_suffix_relations.pickle")
     self.api_id_2_record_text_path = str(
         Path(OUTPUT_DIR) / "api_id_2_record.pickle")
     self.api_id_2_record_text = Tool.load_pickle(
         self.api_id_2_record_text_path)
     graph_data_path = PathUtil.graph_data(pro_name="jabref",
                                           version=input_graph_version)
     self.graph_data = GraphData.load(graph_data_path)
     self.func_relation_set = {
         RelationNameConstant.has_Functionality_Relation,
         RelationNameConstant.Functionality_Compare_Relation,
         RelationNameConstant.has_Behavior_Relation,
     }
     self.concept_classification = {
         RelationNameConstant.Ontology_IS_A_Relation,
     }
     self.membership = {
         RelationNameConstant.Ontology_Derive_Relation,
     }
     self.characteristic = {
         RelationNameConstant.has_Feature_Relation,
         RelationNameConstant.has_Constraint_Relation,
     }
     self.category_name_2_id = dict()
     self.type_of_class = {
         CodeEntityCategory.CATEGORY_CLASS,
         CodeEntityCategory.CATEGORY_INTERFACE,
         CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
         CodeEntityCategory.CATEGORY_ERROR_CLASS,
         CodeEntityCategory.CATEGORY_ENUM_CLASS,
         CodeEntityCategory.CATEGORY_ANNOTATION_CLASS
     }
     self.type_of_method = {
         CodeEntityCategory.CATEGORY_METHOD,
         CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD,
         CodeEntityCategory.CATEGORY_BASE_OVERRIDE_METHOD,
     }
     self.CODE_NAME_UTIL = CodeElementNameUtil()
Example #11
def build_v2_graph_for_pro(pro_name):

    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    graph_data: GraphData = GraphData.load(graph_data_path)
    new_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    res = ExtractResultImport(graph_data, new_graph_data_path, 2)

    data_dir = Path(OUTPUT_DIR) / "graph" / "jdk8" / "filter_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    filter_sentence_path = str(data_dir / "filter_sentence.txt")

    pat = re.compile('<[^>]+>', re.S)

    print("start to add sentences...")
    for node_id in graph_data.get_node_ids():
        node_info = graph_data.get_node_info_dict(node_id)
        short_description = node_info["properties"].get(
            "short_description", "")
        if not short_description:
            continue

        short_description = pat.sub('', short_description)
        short_descs = sent_tokenize(short_description)

        for short_desc in short_descs:
            short_desc = " ".join(short_desc.split())
            str_rm_sign = classifier.preprocessor.remove_sign(short_desc)
            text = classifier.preprocessor.remove_stop_words(str_rm_sign)
            label = list(classifier.predict(text))[0]
            if label == "0":
                print(short_desc)
                with open(filter_sentence_path, "a", encoding='utf-8') as f:
                    f.write(short_desc)
                    f.write("\n")
                continue
            else:
                res.add_sentence_relation(short_desc, id, int(label))
    res.save_new_graph_data()
Example #12
 def init_graph_data(self, graph_data_path):
     self.graph_data = GraphData.load(graph_data_path)
     self.code_element_kg_builder = CodeElementGraphDataBuilder(
         self.graph_data)
Example #13
"""

Classify methods.
Methods are divided into five categories: accessor, mutator, creational, constructor, undefined.

"""

from sekg.graph.exporter.graph_data import GraphData, NodeInfo
from project.utils.path_util import PathUtil
from nltk.corpus import wordnet as wn

pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.4")
graph_data: GraphData = GraphData.load(graph_data_path)

accessor_key_word = ("get", "toString", "find", "search", "test", "contains", "is", "has", "show")
mutator_key_word = ("set", "add", "delete", "move", "remove", "parse", "insert", "extract", "open")
creational_key_word = ("copy", "construct", "create")
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
verbs = {x.name().split('.', 1)[0] for x in wn.all_synsets('v')}


def get_pure_method_name_without_parameter(qualified_name=None):
    if qualified_name is None or qualified_name == "":
        raise ValueError("qualified name needed")
    paren_index = qualified_name.find("(")
    if paren_index != -1:
        qualified_name = qualified_name[:paren_index]
    return qualified_name[qualified_name.rfind(".") + 1:]


# Perform the most basic split based on a series of keywords
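The example is truncated here; a sketch of what that keyword split might look like, using only the tuples and helper defined above (`classify_method` is an illustrative name, not from the original file):

def classify_method(qualified_name):
    # Illustrative sketch: map a method onto the five categories
    # named in the module docstring.
    name = get_pure_method_name_without_parameter(qualified_name).lower()
    paren_index = qualified_name.find("(")
    parts = (qualified_name[:paren_index] if paren_index != -1 else qualified_name).split(".")
    if len(parts) >= 2 and parts[-1] == parts[-2]:
        return "constructor"  # Java constructors repeat the enclosing class name
    if name.startswith(tuple(k.lower() for k in accessor_key_word)):
        return "accessor"
    if name.startswith(tuple(k.lower() for k in mutator_key_word)):
        return "mutator"
    if name.startswith(tuple(k.lower() for k in creational_key_word)):
        return "creational"
    # the WordNet noun/verb sets above could further refine what is left
    return "undefined"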
Example #14
class JDKKGBuilder:
    """
    Build the skeleton KG from the JavaParser analysis result for the project source code.
    It will include packages, classes, interfaces, and methods.
    """
    def __init__(self):
        self.graph_data = GraphData()
        self.code_element_kg_builder = CodeElementGraphDataBuilder(
            self.graph_data)

    def init_graph_data(self, graph_data_path):
        self.graph_data = GraphData.load(graph_data_path)
        self.code_element_kg_builder = CodeElementGraphDataBuilder(
            self.graph_data)

    def import_primary_type(self):

        type_list = CodeEntityCategory.java_primary_types()

        for item in type_list:
            code_element = {
                "qualified_name": item["name"],
                "api_type": CodeEntityCategory.CATEGORY_PRIMARY_TYPE,
                "short_description": item["description"]
            }
            self.add_primary_type(item["name"], **code_element)

        print(self.graph_data)
        self.graph_data.print_label_count()

    def add_primary_type(self, primary_type_name, **properties):
        properties["qualified_name"] = primary_type_name

        cate_labels = CodeEntityCategory.to_str_list(
            CodeEntityCategory.CATEGORY_PRIMARY_TYPE)
        builder = NodeBuilder()
        builder = builder.add_property(
            **properties).add_entity_label().add_labels(
                "code_element", *cate_labels)
        node_id = self.graph_data.add_node(
            node_id=GraphData.UNASSIGNED_NODE_ID,
            node_labels=builder.get_labels(),
            node_properties=builder.get_properties(),
            primary_property_name="qualified_name")
        return node_id

    def build_aliases(self):
        self.code_element_kg_builder.build_aliases_for_code_element()

    def infer_extra_relation(self):
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_abstract_overloading_relation()
        # self.code_element_kg_builder.build_value_subclass_relation()
        self.code_element_kg_builder.build_override_relation()

    def save(self, graph_data_path):
        self.graph_data.save(graph_data_path)

    def import_normal_entity(self, api_entity_json):

        format_qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])

        if not format_qualified_name:
            return
        api_entity_json.pop("qualified_name")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            format_qualified_name, api_entity_json["api_type"],
            **api_entity_json)
        return node_id

    def import_parameter_entity(self, api_entity_json):
        extra_properties = {}

        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]

        value_type = qualified_name.split(" ")[0].strip()
        value_name = qualified_name.split(" ")[1].strip()
        # TODO: add all class nodes first, to avoid adding a parameter node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name=value_name,
            short_description=short_description,
            entity_category=CodeEntityCategory.CATEGORY_PARAMETER,
            **extra_properties)

        if node_id == GraphData.UNASSIGNED_NODE_ID:
            print("fail to add parameter node %r" % (api_entity_json))

        return node_id

    def import_return_value_entity(self, api_entity_json):
        extra_properties = {}

        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]

        value_type = qualified_name.split(" ")[0].strip()
        # TODO: add all class nodes first, to avoid adding a return value node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name="<R>",
            short_description=short_description,
            entity_category=CodeEntityCategory.CATEGORY_RETURN_VALUE,
            **extra_properties)

        if node_id == GraphData.UNASSIGNED_NODE_ID:
            print("fail to add parameter node %r" % (api_entity_json))

        return node_id

    def import_exception_condition_entity(self, api_entity_json):
        extra_properties = {}

        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]

        value_type = qualified_name.split(" ")[0].strip()
        # TODO: add all class nodes first, to avoid adding an exception condition node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name="<E>",
            short_description=short_description,
            entity_category=CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION,
            **extra_properties)

        if node_id == GraphData.UNASSIGNED_NODE_ID:
            print("fail to add exception condition node %r" % (api_entity_json))

        return node_id

    def import_construct_method_entity(self, api_entity_json):

        format_qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])

        method_name = self.code_element_kg_builder.parse_construct_to_javaparser_style(
            format_qualified_name)

        if not method_name:
            return GraphData.UNASSIGNED_NODE_ID

        api_entity_json.pop("qualified_name")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            method_name, api_entity_json["api_type"], **api_entity_json)

        return node_id

    def import_qualified_field_entity(self, api_entity_json):
        # print("import_qualified_field_entity %r %r" % (api_entity_json["qualified_name"], api_entity_json))

        qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        if not qualified_name:
            print("import_qualified_field_entity %r %r" %
                  (api_entity_json["qualified_name"], api_entity_json))
            return GraphData.UNASSIGNED_NODE_ID

        api_entity_json.pop("qualified_name")
        api_entity_json.pop("api_type")

        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            qualified_name, CodeEntityCategory.CATEGORY_FIELD_OF_CLASS,
            **api_entity_json)

        return node_id

    def import_qualified_enum_constants_entity(self, api_entity_json):
        # print("import_qualified_field_entity %r %r" % (api_entity_json["qualified_name"], api_entity_json))

        qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        if not qualified_name:
            print("import_qualified_field_entity %r %r" %
                  (api_entity_json["qualified_name"], api_entity_json))
            return GraphData.UNASSIGNED_NODE_ID

        api_entity_json.pop("qualified_name")
        api_entity_json.pop("api_type")

        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            qualified_name, CodeEntityCategory.CATEGORY_ENUM_CONSTANTS,
            **api_entity_json)

        return node_id

    def import_jdk_from_api_table(self, session):
        print("start import_jdk_from_api_table ")
        # api_entity_list = session.query(APIEntity).filter(APIEntity.id > 85000).limit(1000).all()
        api_entity_list = session.query(APIEntity).all()

        api_id_to_node_id_map = {}
        for entity_info_row in api_entity_list:

            api_entity_json = dict(entity_info_row.__dict__)
            api_entity_json.pop('_sa_instance_state', None)
            api_id = api_entity_json["id"]
            qualified_name = api_entity_json["qualified_name"]
            api_type = api_entity_json["api_type"]

            if not self.is_jdk_api(qualified_name):
                if self.is_android_support(qualified_name):
                    continue
                if self.is_android_core_api(qualified_name):
                    continue

                # if self.is_android_core_api(qualified_name) == False:
                print("Not jdk %d %s %r " %
                      (api_id, qualified_name,
                       CodeEntityCategory.to_str(api_type)))
                continue

            normal_entity_types = {
                CodeEntityCategory.CATEGORY_CLASS,
                CodeEntityCategory.CATEGORY_PACKAGE,
                CodeEntityCategory.CATEGORY_METHOD,
                CodeEntityCategory.CATEGORY_INTERFACE,
                CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
                CodeEntityCategory.CATEGORY_ENUM_CLASS,
                CodeEntityCategory.CATEGORY_ERROR_CLASS,
                CodeEntityCategory.CATEGORY_ANNOTATION_CLASS,
            }
            node_id = GraphData.UNASSIGNED_NODE_ID
            if api_type in normal_entity_types:
                node_id = self.import_normal_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD:
                node_id = self.import_construct_method_entity(api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_FIELD_OF_CLASS:
                node_id = self.import_qualified_field_entity(api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_ENUM_CONSTANTS:
                node_id = self.import_qualified_enum_constants_entity(
                    api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_PRIMARY_TYPE:
                node_id = self.add_primary_type(
                    primary_type_name=qualified_name, **api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_PARAMETER:
                node_id = self.import_parameter_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_RETURN_VALUE:
                node_id = self.import_return_value_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION:
                node_id = self.import_exception_condition_entity(
                    api_entity_json)
            if node_id == GraphData.UNASSIGNED_NODE_ID:
                print("Adding fail %d %s %r " %
                      (api_id, qualified_name,
                       CodeEntityCategory.to_str(api_type)))
                continue
            api_id_to_node_id_map[api_id] = node_id

        self.graph_data.print_graph_info()
        print("end import_jdk_from_api_table ")

        return api_id_to_node_id_map

    def import_android_from_api_table(self, session):
        print("start import android api from jdk table")
        # api_entity_list = session.query(APIEntity).filter(APIEntity.id > 85000).limit(1000).all()
        api_entity_list = session.query(APIEntity).all()

        api_id_to_node_id_map = {}
        for entity_info_row in api_entity_list:

            api_entity_json = dict(entity_info_row.__dict__)
            api_entity_json.pop('_sa_instance_state', None)
            api_id = api_entity_json["id"]
            qualified_name = api_entity_json["qualified_name"]
            api_type = api_entity_json["api_type"]

            if self.is_android_support(qualified_name):
                continue

            if not self.is_jdk_api(qualified_name) and not self.is_android_core_api(qualified_name):
                # if self.is_android_core_api(qualified_name) == False:
                print("Not android or JDK API %d %s %r " %
                      (api_id, qualified_name,
                       CodeEntityCategory.to_str(api_type)))
                continue
            normal_entity_types = {
                CodeEntityCategory.CATEGORY_CLASS,
                CodeEntityCategory.CATEGORY_PACKAGE,
                CodeEntityCategory.CATEGORY_METHOD,
                CodeEntityCategory.CATEGORY_INTERFACE,
                CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
                CodeEntityCategory.CATEGORY_ENUM_CLASS,
                CodeEntityCategory.CATEGORY_ERROR_CLASS,
                CodeEntityCategory.CATEGORY_ANNOTATION_CLASS,
            }
            node_id = GraphData.UNASSIGNED_NODE_ID
            if api_type in normal_entity_types:
                node_id = self.import_normal_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD:
                node_id = self.import_construct_method_entity(api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_FIELD_OF_CLASS:
                node_id = self.import_qualified_field_entity(api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_ENUM_CONSTANTS:
                node_id = self.import_qualified_enum_constants_entity(
                    api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_PRIMARY_TYPE:
                node_id = self.add_primary_type(
                    primary_type_name=qualified_name, **api_entity_json)

            if api_type == CodeEntityCategory.CATEGORY_PARAMETER:
                node_id = self.import_parameter_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_RETURN_VALUE:
                node_id = self.import_return_value_entity(api_entity_json)
            if api_type == CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION:
                node_id = self.import_exception_condition_entity(
                    api_entity_json)
            if node_id == GraphData.UNASSIGNED_NODE_ID:
                print("Adding fail %d %s %r " %
                      (api_id, qualified_name,
                       CodeEntityCategory.to_str(api_type)))
                continue
            api_id_to_node_id_map[api_id] = node_id

        self.graph_data.print_graph_info()
        print("end import_jdk_from_api_table ")

        return api_id_to_node_id_map

    def import_relation_from_jdk_table(self, session, api_id_to_node_id_map):
        print("start import jdk relation")
        self.graph_data.print_graph_info()

        valid_api_types = CodeEntityRelationCategory.relation_set()

        for relation_type in valid_api_types:
            relation_str = CodeEntityRelationCategory.to_str(relation_type)
            print("start import relation %s" % (relation_str))
            api_relation_list = session.query(APIRelation).filter(
                APIRelation.relation_type == relation_type).all()
            for relation in api_relation_list:
                if relation.start_api_id not in api_id_to_node_id_map:
                    print("start_id %d can't found its node id" %
                          (relation.start_api_id))
                    continue
                if relation.end_api_id not in api_id_to_node_id_map:
                    print("end_id %d can't found its node id" %
                          (relation.end_api_id))
                    continue
                self.graph_data.add_relation(
                    startId=api_id_to_node_id_map[relation.start_api_id],
                    endId=api_id_to_node_id_map[relation.end_api_id],
                    relationType=relation_str)
        print("end import jdk relation")
        self.graph_data.print_graph_info()

    def is_jdk_api(self, qualified_name):
        if qualified_name.startswith("java."):
            return True
        if qualified_name.startswith("javax."):
            return True
        if qualified_name.startswith("org.w3c.dom"):
            return True
        if qualified_name.startswith("org.xml.sax"):
            return True
        if qualified_name.startswith("org.ietf"):
            return True
        if qualified_name.startswith("org.omg"):
            return True
        for primary in CodeEntityCategory.JAVA_PRIMARY_TYPE_SET:
            if qualified_name.startswith(primary):
                return True
        # the generic type parameter, e.g. "T element", "T[]"
        if len(qualified_name.strip("[]").split(" ")[0]) == 1:
            return True

        return False

    def is_android_support(self, qualified_name):
        if qualified_name.startswith("androidx"):
            return True
        if qualified_name.startswith("android.support"):
            return True
        return False

    def is_android_core_api(self, qualified_name):

        if self.is_android_support(qualified_name):
            return False
        if qualified_name.startswith("android"):
            return True
        if qualified_name.startswith("com.android.internal.util"):
            return True
        if qualified_name.startswith("dalvik."):
            return True
        if qualified_name.startswith("junit."):
            return True
        if qualified_name.startswith("org.xmlpull"):
            return True
        if qualified_name.startswith("org.json"):
            return True
        if qualified_name.startswith("org.apache"):
            return True
        return False

    def add_source_label(self, source_label):
        self.code_element_kg_builder.add_source_label(source_label)
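A hedged end-to-end sketch of how the JDKKGBuilder methods above compose; `session` is assumed to be a SQLAlchemy session bound to the APIEntity/APIRelation tables, and the label text and save path are illustrative, not from the original:

builder = JDKKGBuilder()
builder.import_primary_type()
api_id_to_node_id_map = builder.import_jdk_from_api_table(session)  # session: assumed SQLAlchemy session
builder.import_relation_from_jdk_table(session, api_id_to_node_id_map)
builder.build_aliases()
builder.infer_extra_relation()
builder.add_source_label("jdk api table")                           # label text is illustrative
builder.save(PathUtil.graph_data(pro_name="jdk8", version="v1"))    # path is an assumption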
Example #15
 def __init__(self):
     self.graph_data = GraphData()
     self.code_element_kg_builder = CodeElementGraphDataBuilder(
         self.graph_data)
Example #16
 def build_doc(self, graph_data_path, output_doc_collection_path=None):
     graph_data_instance = GraphData.load(str(graph_data_path))
     builder = GraphNodeDocumentBuilder(graph_data=graph_data_instance)
     return builder.build_doc_for_kg(output_doc_collection_path)
Example #17
                    "Description": api_sample_code["Description"]
                },
                primary_property_name="Description")
            # api_node_id = graph_data.find_one_node_by_property(property_name="qualified_name",
            #                                                    property_value=api_sample_code["API"])["properties"][
            #     "id"]
            # code_node_id = graph_data.find_one_node_by_property(property_name="Code", property_value=api_sample_code["Code"])["_node_id"]
            # description_node_id = graph_data.find_one_node_by_property(property_name="Description", property_value=api_sample_code["Description"])["_node_id"]
            graph_data.add_relation(startId=api_node_id,
                                    relationType="has sample code",
                                    endId=code_node_id)
            graph_data.add_relation(startId=code_node_id,
                                    relationType="has description",
                                    endId=description_node_id)
        else:
            print(api_sample_code["Id"])

    graph_data.save(output_graph_data_path)


if __name__ == "__main__":
    # graph_data_path = str(Path(GRAPH_DATA_DIR) / 'jdk8_sample_code.v1.graph')
    # kg_importer(graph_data_path)
    graph_data = GraphData.load(
        str(Path(GRAPH_DATA_DIR) / 'jdk8_sample_code.v1.graph'))
    # # ids = (489,)
    # # results = graph_data.find_one_node_by_property(property_name="id", property_value=1869)
    # # print(results["id"])
    graph_data.print_graph_info()
    # create_sample_code_kg(graph_data)
Example #18
class GenericKGFusion:
    INVALID_TEXTS = {
        "scientific article", "wikimedia template", "wikimedia list article",
        "wikipedia template", "wikibase wikis", "wikimedia", "wikibase",
        "wikidata"
    }
    INVALID_SUBCLASS_ITEM_ID = set([
        "Q11424",  # film
        "Q15138389",  # wiki
        "Q7187",  # gene
    ])

    DEFAULT_FILTER_CONTEXT_SCORE = 0.8
    DEFAULT_FILTER_TOPIC_SCORE = 0.9

    DEFAULT_ACCEPTABLE_TOPIC_SCORE = 0.95
    DEFAULT_ACCEPTABLE_CONTEXT_SCORE = 0.85

    DEFAULT_PROXY_SERVER = "http://127.0.0.1:1080"

    def __init__(self,
                 filter_score=DEFAULT_FILTER_CONTEXT_SCORE,
                 proxy_server=DEFAULT_PROXY_SERVER):
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.wikipedia_cache = {}
        self.fetcher = AsyncWikiSearcher(proxy_server)
        self.graph_data = GraphData()
        self.wikidata_property_table = WikiDataPropertyTable.get_instance()
        self.embedding = {}
        self.filter_score = filter_score
        self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline()
        self.all_domain_vector = {}

    def init_wd_from_cache(self, title_save_path=None, item_save_path=None):
        self.fetcher.init_from_cache(title_save_path=title_save_path,
                                     item_save_path=item_save_path)
        print("Init from cache...")

    def init_wikipedia_contex(self, wikipedia_context_path=None):
        # TODO: move the wikipedia content into the wiki searcher class so it no longer has to be loaded in GenericKGFusion
        if wikipedia_context_path is not None and Path(
                wikipedia_context_path).exists():
            with open(wikipedia_context_path, "rb") as f:
                self.wikipedia_cache = pickle.load(f)
        else:
            print('no such wikipedia_context_path {}'.format(
                wikipedia_context_path))

    def export_wd_cache(self, title_save_path, item_save_path):
        self.fetcher.save(item_save_path=item_save_path,
                          title_save_path=title_save_path)

    def load_word_embedding(self, emb_path):
        wv = KeyedVectors.load(emb_path)
        self.embedding = {k: wv[k] for k in wv.vocab.keys()}

    def load_w2v_model(self, w2v_path):
        self.w2v_model = AVGW2VFLModel.load(w2v_path)

    def init_graph_data(self, graph_data_path):
        self.graph_data = GraphData.load(graph_data_path)

    def fetch_wikidata_by_name(self,
                               terms,
                               title_save_path=None,
                               item_save_path=None):
        """
                search with some terms and find the candidate wikidata item list for the term,
                 and cache all the possible wikidata item for the item.
                 eg. for term: "apple", we will search it in wikidata.org by API and get the returned
                 search result list(maybe 10 result). the search result for keywords will be cached.
                 And we we retrieve all 10 candidate wikidata item info.

                :param item_save_path: the wikidata item info cache path
                :param title_save_path:  the search result by title saving path
                :param terms: a list of str or a set of str standing for concepts.
                :return:
                """
        self.fetcher.init_from_cache(title_save_path=title_save_path,
                                     item_save_path=item_save_path)
        terms = {self.lemmatizer.noun(term)[0].lower() for term in terms}
        print(
            "need to fetch %r term wiki titles, %r are already cached, %r actually need fetching"
            % (len(terms), len(self.fetcher.title_cache.keys() & terms),
               len(terms) - len(self.fetcher.title_cache.keys() & terms)))

        term_titles = self.fetcher.search_title(terms)
        if title_save_path is not None:
            self.fetcher.save(title_save_path=title_save_path)

        ids = self.get_valid_wikidata_item(term_titles)
        term_wikiitems = self.fetch_wikidata_by_id(ids, item_save_path)
        return term_titles, term_wikiitems

    @staticmethod
    def is_need_to_fetch_wikidata_item(item):
        INVALID_TEXTS = [
            "scientific article", "wikimedia template",
            "wikimedia list article", "wikipedia template", "wikibase wikis",
            "wikimedia"
        ]

        snippet = item["snippet"].lower()
        for invalid_text in INVALID_TEXTS:
            if invalid_text in snippet:
                return False

        return True

    @staticmethod
    def get_valid_wikidata_item(term_titles):
        """
        some search results for wikidata are not need to search, for example, the item has "scientific article" in description.
        :param term_titles:
        :return:
        """
        valid_wikidata_ids = set([])

        for v in term_titles.values():
            for item in v:
                if not GenericKGFusion.is_need_to_fetch_wikidata_item(item):
                    continue
                valid_wikidata_ids.add(item["title"])

        return valid_wikidata_ids

    def fetch_wikidata_by_id(self, ids, item_save_path=None):

        print(
            "need to fetch wikidata items num=%r, %r are already cached, %r actually need fetching"
            % (len(ids), len(self.fetcher.item_cache.keys() & ids),
               len(ids) - len(self.fetcher.item_cache.keys() & ids)))

        term_wikiitems = self.fetcher.fetch_item(ids)
        if item_save_path is not None:
            self.fetcher.save(item_save_path=item_save_path)
        return term_wikiitems

    # def compute_topic_vector(self):
    #     topic_words = []
    #     for node_id in self.graph_data.get_node_ids_by_label(DomainConstant.LABEL_DOMAIN_TERM):
    #         try:
    #             node_json = self.graph_data.get_node_info_dict(node_id=node_id)
    #             if not node_json:
    #                 continue
    #             node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
    #             lemma = node_properties[PropertyConstant.LEMMA]
    #             aliases = node_properties.get(PropertyConstant.ALIAS, [])
    #             aliases_en = node_properties.get("aliases_en", [])
    #             description_en = node_properties.get("descriptions_en", "")
    #             name = node_properties.get("name", "")
    #             topic_words.append(lemma)
    #             topic_words.extend(aliases)
    #             topic_words.extend(aliases_en)
    #             topic_words.append(description_en)
    #             topic_words.append(name)
    #         except:
    #             traceback.print_exc()
    #     topic_text = " ".join(topic_words).lower()
    #
    #     if len(topic_text) == 0:
    #         return None
    #     words = [w for w in topic_text.split() if w]
    #     if len(words) == 0:
    #         return None
    #     vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words)
    #
    #     return vec_des
    #
    # def compute_wikidata_vector(self, wikidata_item, term_wikiitems, node_json):
    #     relation_text = self.generate_relations_text(wikidata_item, term_wikiitems)
    #     description = wikidata_item.get_en_description()
    #     en_name = wikidata_item.get_en_name()
    #     en_aliases = wikidata_item.get_en_aliases()
    #
    #     description = " ".join([en_name, " ".join(en_aliases), description, relation_text])
    #
    #     words = [token.lemma_.lower() for token in self.NLP(description) if
    #              token.is_digit == False and token.is_stop == False and token.is_punct == False]
    #
    #     domain_term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.LEMMA]
    #
    #     removal_words = set(domain_term_name.lower().split())
    #     words = [w for w in words if w not in removal_words]
    #
    #     if len(words) == 0:
    #         return None
    #     # todo: the size of vector should be adjust
    #     vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words)
    #
    #     return vec_des
    #
    # def __score_topic(self, topic_vector, wikidata_item, term_wikiitems, node_json):
    #
    #     wikidata_vector = self.compute_wikidata_vector(wikidata_item, term_wikiitems, node_json)
    #     return self.compute_sim_for_two_vectors(wikidata_vector, topic_vector)
    #
    # def __score_context(self, node_json, wikidata_item, term_wikiitems):
    #     relation_text = self.generate_relations_text(wikidata_item, term_wikiitems)
    #     description = wikidata_item.get_en_description()
    #     en_name = wikidata_item.get_en_name()
    #     en_aliases = wikidata_item.get_en_aliases()
    #
    #     description = " ".join([en_name, " ".join(en_aliases), description, relation_text])
    #
    #     domain_term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.LEMMA]
    #
    #     name = self.get_compare_name_for_domain_term(node_json)
    #
    #     removal_words = set(domain_term_name.lower().split())
    #
    #     if len(description) == 0 or len(name) == 0:
    #         return 0
    #     # words = list(set(
    #     #     [token.lemma_.lower() for token in self.NLP(description) if
    #     #      token.is_digit == False and token.is_stop == False]))
    #     words = [token.lemma_.lower() for token in self.NLP(description) if
    #              token.is_digit == False and token.is_stop == False and token.is_punct == False]
    #     words = [w for w in words if w not in removal_words]
    #
    #     if len(words) == 0:
    #         return 0
    #     vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words)
    #     # name_words = list(
    #     #     set([token.lemma_.lower() for token in self.NLP(name) if
    #     #          token.is_digit == False and token.is_stop == False]))
    #     name_words = [token.lemma_.lower() for token in self.NLP(name) if
    #                   token.is_digit == False and token.is_stop == False]
    #
    #     if len(name_words) == 0:
    #         return 0
    #     vec_term = sum([self.embedding.get(w, np.zeros([100])) for w in name_words]) / len(name_words)
    #
    #     return self.compute_sim_for_two_vectors(vec_des, vec_term)
    #
    # def compute_sim_for_two_vectors(self, vec_des, vec_term):
    #     norm_des = np.linalg.norm(vec_des)
    #     norm_term = np.linalg.norm(vec_term)
    #     if norm_des == 0 or norm_term == 0:
    #         return 0
    #     return 0.5 + vec_des.dot(vec_term) / (norm_des * norm_term) / 2
    #
    # def get_compare_name_for_domain_term(self, node_json):
    #     domain_term_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
    #
    #     name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.LEMMA, "")
    #     aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, [])
    #     aliases_en = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get("aliases_en", [])
    #
    #     other_names = [name]
    #     other_names.extend(aliases)
    #     other_names.extend(aliases_en)
    #
    #     out_relations = self.graph_data.get_all_out_relations(node_id=domain_term_id)
    #     in_relations = self.graph_data.get_all_in_relations(node_id=domain_term_id)
    #     domain_term_node_ids = self.graph_data.label_to_ids_map[DomainConstant.LABEL_DOMAIN_TERM]
    #     id_set = set([])
    #     for (start_id, r, end_id) in out_relations:
    #         if end_id in domain_term_node_ids:
    #             id_set.add(end_id)
    #     for (start_id, r, end_id) in in_relations:
    #         if start_id in domain_term_node_ids:
    #             id_set.add(start_id)
    #     id_set.add(domain_term_id)
    #     for id in id_set:
    #         temp_node_json = self.graph_data.get_node_info_dict(node_id=id)
    #         other_names.append(temp_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.LEMMA, ""))
    #     name = " ".join(other_names)
    #     return name

    def add_wikidata_item(self, item: WikiDataItem):
        """
        Add a wikidata node to the graph; no relations are added.
        :param item: the WikiDataItem to add to the GraphData
        :return: the node_id of the added item node
        """
        ori_node_json = self.graph_data.find_one_node_by_property(
            WikiDataConstance.PRIMARY_PROPERTY_NAME, item.wd_item_id)
        if ori_node_json:
            # print(ori_node_json)
            # print('no new wiki node!! node %d has fused wiki_node %s' % (ori_node_json["id"], item.wd_item_id))
            return ori_node_json["id"]
        # print("add new wikinode %s" % (item.wd_item_id))
        node_labels = [WikiDataConstance.LABEL_WIKIDATA]
        node_properties = {
            WikiDataConstance.PRIMARY_PROPERTY_NAME: item.wd_item_id,
            WikiDataConstance.NAME: item.get_en_name(),
            PropertyConstant.ALIAS: set(item.get_en_aliases()),
        }
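        # presumably populates item.relation_property_name_list as a side effect
        # (the return value is not used)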
        item.get_relation_property_name_list()
        relation_property_set = set(item.relation_property_name_list)
        pure_property_set = set(item.get_non_relation_property_name_list())

        valid_property_dict = {}
        for p, v in item.data_dict.items():
            if p in relation_property_set:
                continue
            if p in pure_property_set:
                p = self.wikidata_property_table.property_id_2_name(p)
                if p is None:
                    continue
            valid_property_dict[p] = v
        wikidata_node_id = self.graph_data.add_node(
            node_labels=node_labels,
            node_properties=dict(valid_property_dict, **node_properties),
            primary_property_name=WikiDataConstance.PRIMARY_PROPERTY_NAME)
        return wikidata_node_id

    def add_all_wiki_nodes(self):
        print("start add all wiki nodes.......")
        self.graph_data.create_index_on_property(
            WikiDataConstance.PRIMARY_PROPERTY_NAME)
        term_wikiitems = self.fetcher.item_cache
        wikiiterms_ids = term_wikiitems.keys()
        self.add_wikidata_items(wikiiterms_ids)
        self.graph_data.refresh_indexer()

    def simple_fuse(self):
        """
        Simple fuse of the wiki data: the graph already holds all wikidata nodes, so we
        compute similarities to filter them and link them to domain terms.
        :return:
        """
        record = []
        valid_domain_id_set = self.graph_data.get_node_ids_by_label(
            DomainConstant.LABEL_DOMAIN_TERM)
        i = 0
        valid_wiki_id_set = self.graph_data.get_node_ids_by_label("wikidata")
        valid_wiki_index = np.array(
            list(
                self.w2v_model.preprocess_doc_collection.
                doc_id_set_2_doc_index_set(valid_wiki_id_set)))
        print("valid_wiki_index size: ", valid_wiki_index.size)

        doc_model = self.w2v_model.avg_w2v_model_field_map["doc"]

        for node_id in valid_domain_id_set:
            try:
                node_json = self.graph_data.get_node_info_dict(node_id=node_id)
                if not node_json:
                    continue
                node_properties = node_json[
                    GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                lemma = node_properties[PropertyConstant.LEMMA]
                alias_set = node_properties[PropertyConstant.ALIAS]
                term_name = node_properties["term_name"]
                alias_set.add(lemma)
                alias_set.add(term_name)
                text = " ".join(list(alias_set))
                domain_words = self.w2v_model.preprocessor.clean(text)
                domain_vec = self.w2v_model.get_avg_w2v_vec(domain_words)
                score_vector = (
                    doc_model.similar_by_vector(domain_vec, topn=None) + 1) / 2

                # score_vector stays aligned with doc indices, so the
                # intersection with valid_wiki_index below is meaningful
                over_threshold = np.where(score_vector > 0.8)
                top_wiki_valid = np.intersect1d(over_threshold, valid_wiki_index)

                if top_wiki_valid.size:
                    print("number {}: {}, Done!".format(i, node_id))

                # pair each valid doc index with its score, ranked by descending score
                sorted_index_scores = np.array(
                    (top_wiki_valid, score_vector[top_wiki_valid])).T
                sorted_index_scores = sorted_index_scores[np.argsort(
                    -sorted_index_scores[:, 1])]
                retrieval_results = []
                rank = 0
                for (doc_index, score) in sorted_index_scores:
                    entity_document = self.w2v_model.doc_index2doc(doc_index)
                    if rank >= 5:
                        break
                    if entity_document is None:
                        continue
                    wiki_id = entity_document.get_document_id()
                    rank += 1
                    retrieval_results.append((wiki_id, score))

                for wiki_id, score in retrieval_results:
                    wiki_node_json = self.graph_data.get_node_info_dict(
                        wiki_id)
                    record.append({
                        "name":
                        wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                        ["wikidata_name"],
                        "alias":
                        wiki_node_json[
                            GraphData.DEFAULT_KEY_NODE_PROPERTIES]["alias_en"],
                        "description":
                        wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                        ["description_en"],
                        "domain term":
                        node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                        ["qualified_name"],
                        "score":
                        score,
                        "link":
                        True,
                        "domain_id":
                        node_id,
                        "wd_item_id":
                        wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                        ["wd_item_id"]
                    })
                    self.graph_data.add_relation(startId=node_id,
                                                 endId=wiki_id,
                                                 relationType="related to")
            except Exception:
                traceback.print_exc()
        self.delete_isolated_nodes_by_label(WikiDataConstance.LABEL_WIKIDATA)
        self.graph_data.refresh_indexer()
        return record

    @staticmethod
    def get_wikidata_item_ids_by_relation(wikidata_item: WikiDataItem, r):
        id_set = set()
        end = wikidata_item.data_dict.get(r, [])
        if isinstance(end, list):
            id_set.update(end)
        else:
            id_set.add(end)
        return id_set

    def generate_relations_text(self, wikidata_item, term_wikiitems):
        text = []
        for r in wikidata_item.relation_property_name_list:

            relation_name = self.wikidata_property_table.property_id_2_name(r)
            if relation_name is None:
                relation_name = r
            end = wikidata_item.data_dict[r]

            if isinstance(end, list):
                for e_wd_item_id in end:
                    if self.is_valid_wikidata_item_id(e_wd_item_id):
                        neighbour_item = term_wikiitems.get(e_wd_item_id, None)
                        if neighbour_item is not None:
                            text.append(neighbour_item.get_en_name())
                            # if relation_name in {"subclass of", "instance of", "part of"}:
                            #     text.append(neighbour_item.get_en_description())
                            text.append(neighbour_item.get_en_description())

                    else:
                        text.append(e_wd_item_id)
                text.append(relation_name)

            else:
                if self.is_valid_wikidata_item_id(end):
                    neighbour_item = term_wikiitems.get(end, None)
                    if neighbour_item is not None:
                        text.append(neighbour_item.get_en_name())
                        # if relation_name in {"subclass of", "instance of", "part of"}:
                        #     text.append(neighbour_item.get_en_description())
                        text.append(neighbour_item.get_en_description())
                else:
                    text.append(end)
                text.append(relation_name)

        return " ".join(text)

    def is_valid_wikidata_item_id(self, wd_item_id):
        try:
            if wd_item_id.startswith("Q") and wd_item_id[1:].isdigit():
                return True
            return False
        except AttributeError:
            # wd_item_id may not be a string (e.g. a literal value)
            return False

    def get_all_neighbours_id(self, item):
        neighbours = set()
        for r in item.relation_property_name_list:
            end = item.data_dict[r]
            if not isinstance(end, list):
                end = [end]
            for e in end:
                if e and (e[0] == "Q" or e[0] == "P"):
                    neighbours.add(e)

        return neighbours

    def get_all_neighbours_id_by_item_id(self, item_id):
        item = self.fetcher.item_cache.get(item_id, None)
        if item is None:
            return set()
        return self.get_all_neighbours_id(item)

    def fetch_valid_wikidata_item_neibours_from_all_term_titles(
            self, item_save_path):
        """
        some search results for wikidata are not need to search, for example, the item has "scientific article" in description.
        :param term_titles:
        :return:
        """
        term_titles = self.fetcher.title_cache
        valid_wikidata_ids = GenericKGFusion.get_valid_wikidata_item(
            term_titles)
        nerbours = set([])

        for valid_id in valid_wikidata_ids:
            nerbours.update(self.get_all_neighbours_id_by_item_id(valid_id))
        return self.fetch_wikidata_by_id(nerbours, item_save_path)

    def add_wikidata_items(self, wd_item_ids):
        term_wikiitems = self.fetcher.item_cache
        self.graph_data.refresh_indexer()
        i = 0
        for wd_item_id in wd_item_ids:
            i += 1
            print(i, ": ", wd_item_id)
            self.add_wikidata_item(term_wikiitems[wd_item_id])
        self.build_relation_between_wikidata_node_in_graph(term_wikiitems)

    def build_relation_between_wikidata_node_in_graph(self, term_wikiitems):
        wikidata_node_ids = self.graph_data.get_node_ids_by_label(
            WikiDataConstance.LABEL_WIKIDATA)
        wd_item_id_2_node_id_map = {}
        node_id_2_wd_item_id_map = {}
        for node_id in wikidata_node_ids:
            wikidata_node = self.graph_data.get_node_info_dict(node_id)
            wd_item_id = wikidata_node[GraphData.DEFAULT_KEY_NODE_PROPERTIES][
                WikiDataConstance.PRIMARY_PROPERTY_NAME]
            wd_item_id_2_node_id_map[wd_item_id] = node_id
            node_id_2_wd_item_id_map[node_id] = wd_item_id
        for start_wd_item_id, start_node_id in wd_item_id_2_node_id_map.items():
            start_wikidata_item = term_wikiitems.get(start_wd_item_id, None)
            if start_wikidata_item is None:
                continue
            for r_id in start_wikidata_item.relation_property_name_list:
                end_wd_ids = self.get_wikidata_item_ids_by_relation(
                    start_wikidata_item, r_id)
                relation_name = self.wikidata_property_table.property_id_2_name(
                    r_id)
                if relation_name is None:
                    continue

                for end_wd_id in end_wd_ids:
                    end_node_id = wd_item_id_2_node_id_map.get(end_wd_id, None)
                    if end_node_id is None:
                        continue
                    if start_node_id == end_node_id:
                        continue
                    self.graph_data.add_relation(start_node_id, relation_name,
                                                 end_node_id)

    def save(self, graph_data_path):
        self.graph_data.save(graph_data_path)
        print("save ", type(self.graph_data))

    def is_valid_wikidata_item(self, item):
        en_name = item.get_en_name().lower()
        for text in self.INVALID_TEXTS:
            if text in en_name:
                return False

        end_wd_ids = self.get_wikidata_item_ids_by_relation(item, "P31")

        for end_wd in end_wd_ids:
            if end_wd in self.INVALID_SUBCLASS_ITEM_ID:
                return False

        return True

    def fetch_wikidata_by_name_and_cache_neibours(self, terms, title_save_path,
                                                  item_save_path):
        self.fetch_wikidata_by_name(terms,
                                    item_save_path=item_save_path,
                                    title_save_path=title_save_path)
        self.fetch_valid_wikidata_item_neibours_from_all_term_titles(
            item_save_path=item_save_path)

    def delete_isolated_nodes_by_label(self, label):
        label_ids = self.graph_data.get_node_ids_by_label(label)
        remove_ids = set()
        for node_id in label_ids:
            in_relations = self.graph_data.get_all_in_relations(node_id)
            out_relations = self.graph_data.get_all_out_relations(node_id)
            if not in_relations and not out_relations:
                remove_ids.add(node_id)
                print("remove {}: {}".format(label, node_id))
        for node_id in remove_ids:
            self.graph_data.remove_node(node_id)
        print("remove {} isolated {} nodes".format(len(remove_ids), label))
Example #19
 def init_graph_data(self, graph_data_path):
     self.graph_data = GraphData.load(graph_data_path)
Example #20
class SkeletonKGBuilder:
    """
    Build the skeleton KG from the JavaParser analysis result for the project source code.
    It will include packages, classes, interfaces, and methods.
    """

    def __init__(self):
        self.graph_data = GraphData()
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def init_graph_data(self, graph_data_path):
        self.graph_data = GraphData.load(graph_data_path)
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def import_primary_type(self):

        type_list = CodeEntityCategory.java_primary_types()

        for item in type_list:
            code_element = {
                "qualified_name": item["name"],
                "api_type": CodeEntityCategory.CATEGORY_PRIMARY_TYPE,
                "short_description": item["description"]
            }
            cate_labels = CodeEntityCategory.to_str_list(code_element["api_type"])

            builder = NodeBuilder()
            builder = builder.add_property(**code_element).add_entity_label().add_labels("code_element", *cate_labels)

            self.graph_data.add_node(
                node_id=GraphData.UNASSIGNED_NODE_ID,
                node_labels=builder.get_labels(),
                node_properties=builder.get_properties(),
                primary_property_name="qualified_name")

        self.graph_data.print_graph_info()

    def import_normal_entity_json(self, entity_json_path):
        print("start import normal entity json")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete size=%d" % record_num)

        fail_num = 0
        name_mark = set([])
        for index, code_element in enumerate(code_list):
            format_qualified_name = self.code_element_kg_builder.format_qualified_name(code_element["qualified_name"])
            if not format_qualified_name:
                print("not __valid name %r" % code_element["qualified_name"])
                fail_num += 1
                continue
            code_element["qualified_name"] = format_qualified_name
            if code_element["qualified_name"] in name_mark:
                continue
            name_mark.add(code_element["qualified_name"])

            code_element.pop("qualified_name")

            node_id = self.code_element_kg_builder.add_normal_code_element_entity(format_qualified_name,
                                                                                  code_element["type"], **code_element)

        print("total=%d fail_num=%d success_num=%d" % (record_num, fail_num, record_num - fail_num))
        self.graph_data.print_graph_info()

        print("end import normal entity json")

    def import_normal_entity_relation_json(self, entity_relation_json_path):
        print("start import normal entity relations json")
        print(self.graph_data)
        self.graph_data.print_label_count()

        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            code_relation_list = json.load(f)
            record_num = len(code_relation_list)
            print("load json complete size=%d" % record_num)

        fail_num = 0
        for relation_json in code_relation_list:
            relation_type = relation_json["relation_type"]

            if relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_METHOD_IMPLEMENT_CODE_CALL_METHOD:
                success = self.code_element_kg_builder.add_method_call_relation(relation_json["start_name"],
                                                                                relation_json["end_name"])
                if not success:
                    fail_num += 1
                continue
            if relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_METHOD_IMPLEMENT_CODE_USE_CLASS:
                continue

            if relation_type in (CodeEntityRelationCategory.RELATION_CATEGORY_BELONG_TO,
                                 CodeEntityRelationCategory.RELATION_CATEGORY_EXTENDS,
                                 CodeEntityRelationCategory.RELATION_CATEGORY_IMPLEMENTS):
                success = self.code_element_kg_builder.add_relation_by_creating_not_exist_entity(
                    relation_json["start_name"],
                    relation_json["end_name"],
                    relation_type=relation_type)
                if not success:
                    fail_num += 1
                continue

            success = self.code_element_kg_builder.add_relation_by_not_creating_entity(relation_json["start_name"],
                                                                                       relation_json["end_name"],
                                                                                       relation_type)

            if not success:
                fail_num += 1

        print("fail num=%d" % fail_num)
        self.graph_data.print_graph_info()

        print("end import normal entity relations json")

    def import_field_entity(self, entity_json_path, entity_relation_json_path):

        print("start import field entity json")
        print(self.graph_data)
        self.graph_data.print_label_count()

        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
            record_num = len(code_list)
        print("load json complete size=%d" % record_num)

        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
            relation_num = len(relation_list)
            print("load json complete entity relation size=%d" % relation_num)
        old_id_to_new_node_id_map = {}

        for index, code_element in enumerate(code_list):
            field_id = code_element["id"]
            field_type = code_element["field_type"]
            field_name = code_element["field_name"]
            short_description = ""  # the field JSON carries no description

            new_field_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=field_type,
                                                                                        value_name=field_name,
                                                                                        short_description=short_description,
                                                                                        entity_category=CodeEntityCategory.CATEGORY_FIELD)

            old_id_to_new_node_id_map[field_id] = new_field_node_id

        for r in relation_list:
            field_node_id = old_id_to_new_node_id_map[r["field_id"]]
            class_qualified_name = self.code_element_kg_builder.format_qualified_name(r["belong_class_interface_name"])
            node_json = self.graph_data.find_one_node_by_property("qualified_name", class_qualified_name)
            if node_json is None:
                parent_node_id = self.code_element_kg_builder.add_type_node(class_qualified_name)

            else:
                parent_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]

            if self.graph_data.exist_relation(parent_node_id,
                                              CodeEntityRelationCategory.to_str(
                                                  CodeEntityRelationCategory.RELATION_CATEGORY_HAS_FIELD),
                                              field_node_id):
                # duplicate "has field" relation; log it for inspection
                print("duplicate has-field relation:", r, field_node_id, node_json)

            self.graph_data.add_relation(parent_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_FIELD),
                                         field_node_id)

        self.graph_data.print_graph_info()

        print("end import field entity json")

    def import_parameter_entity(self, entity_json_path, entity_relation_json_path):
        print("start import parameter entity")
        self.graph_data.print_graph_info()

        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
            record_num = len(code_list)
            print("load json complete entity size=%d" % record_num)

        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
            record_num = len(relation_list)
            print("load json complete entity relation size=%d" % record_num)

        old_id_to_new_node_id_map = {}

        for index, code_element in enumerate(code_list):
            parameter_id = code_element["id"]
            parameter_type = code_element["parameter_type"]
            parameter_name = code_element["parameter_name"]

            short_description = code_element["description"]
            parameter_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=parameter_type,
                                                                                        value_name=parameter_name,
                                                                                        short_description=short_description,
                                                                                        entity_category=CodeEntityCategory.CATEGORY_PARAMETER)

            old_id_to_new_node_id_map[parameter_id] = parameter_node_id

        for r in relation_list:
            parameter_node_id = old_id_to_new_node_id_map[r["parameter_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_name"])

            if not method_qualified_name:
                print("not a valid method name %r" % r["method_name"])
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)

            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]

            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_PARAMETER),
                                         parameter_node_id)

        print("end import parameter entity json")
        self.graph_data.print_graph_info()

    def import_method_local_variable_entity(self, entity_json_path):
        self.graph_data.print_graph_info()
        print("start import method local variable entity")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
            record_num = len(code_list)
            print("load json complete entity size=%d" % record_num)

        for index, variable_infos in enumerate(code_list):

            method_qualified_name = variable_infos["method_name"]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(method_qualified_name)
            if not method_qualified_name:
                print("not a valid method name %r" % variable_infos["method_name"])
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)

            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]

            for variable in variable_infos["variable_model_list"]:
                variable_type = variable["type"]
                variable_name = variable["name"]
                variable_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=variable_type,
                                                                                           value_name=variable_name,
                                                                                           short_description="",
                                                                                           entity_category=CodeEntityCategory.CATEGORY_LOCAL_VARIABLE)

                if variable_node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("add variable node fail for %r" % variable)
                    continue
                self.graph_data.add_relation(method_node_id,
                                             CodeEntityRelationCategory.to_str(
                                                 CodeEntityRelationCategory.RELATION_CATEGORY_USE_LOCAL_VARIABLE),
                                             variable_node_id)

        print("end import local variable entity json")
        self.graph_data.print_graph_info()

    def import_return_value_entity(self, entity_json_path, entity_relation_json_path):
        print("start import return value entity")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
            record_num = len(code_list)
        print("load json complete size=%d" % record_num)

        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
            record_num = len(relation_list)
            print("load json complete entity relation size=%d" % record_num)

        old_id_to_new_node_id_map = {}
        for index, code_element in enumerate(code_list):
            return_value_id = code_element["id"]
            return_value_type = code_element["return_value_type"]
            return_value_name = "<R>"
            short_description = code_element["description"]

            return_value_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=return_value_type,
                                                                                           value_name=return_value_name,
                                                                                           short_description=short_description,
                                                                                           entity_category=CodeEntityCategory.CATEGORY_RETURN_VALUE)
            old_id_to_new_node_id_map[return_value_id] = return_value_node_id

        for r in relation_list:
            return_value_node_id = old_id_to_new_node_id_map[r["type_return_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_qualified_name"])

            if not method_qualified_name:
                print("not a valid method name %r" % r["method_qualified_name"])
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)

            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)

            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]

            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_RETURN_VALUE),
                                         return_value_node_id)

        self.graph_data.print_graph_info()

        print("end import return value entity json")

    def import_thrown_exceptions(self, entity_json_path, entity_relation_json_path):
        print("start import thrown exceptions entity")
        print(self.graph_data)
        self.graph_data.print_label_count()

        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
            record_num = len(code_list)
            print("load json complete size=%d" % record_num)

        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
            record_num = len(relation_list)
            print("load json complete entity relation size=%d" % record_num)

        old_id_to_new_node_id_map = {}

        for index, code_element in enumerate(code_list):
            thrown_exception_id = code_element["id"]
            exception_type = code_element["exception_type"]
            exception_name = "<E>"
            short_description = code_element["description"]

            exception_condition_node_id = self.code_element_kg_builder.add_base_value_entity_node(
                value_type=exception_type,
                value_name=exception_name,
                short_description=short_description,
                entity_category=CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION)

            old_id_to_new_node_id_map[thrown_exception_id] = exception_condition_node_id

        for r in relation_list:
            exception_condition_node_id = old_id_to_new_node_id_map[r["code_exception_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_qualified_name"])

            if not method_qualified_name:
                print("not a valid method name %r" % r["method_qualified_name"])
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)

            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)

            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]

            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_EXCEPTION_CONDITION),
                                         exception_condition_node_id)

        self.graph_data.print_graph_info()

        print("end import thrown exceptions entity json")

    def infer_extra_relation(self):
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_abstract_overloading_relation()
        # self.code_element_kg_builder.build_value_subclass_relation()
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_override_relation()

    def add_source_label(self, source_label):
        self.code_element_kg_builder.add_source_label(source_label)

    def build_aliases(self):
        self.code_element_kg_builder.build_aliases_for_code_element()

    def save(self, graph_data_path):
        self.graph_data.save(graph_data_path)

    def save_as_simple_graph(self, output_path):
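        # work on a deep copy and drop the bulky "code" property from every node before saving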
        graph_data = copy.deepcopy(self.graph_data)
        for node_id in graph_data.get_node_ids():
            node_json = graph_data.get_node_info_dict(node_id=node_id)

            properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
            if "code" in properties:
                properties.pop("code")
        graph_data.save(output_path)

    def build_method_code_use_constant_field_relation(self):
        collection = self.export_code_document_collection()
        self.code_element_kg_builder.build_use_jdk_constant_field_relation_from_code_doc(collection)

    def export_code_document_collection(self, code_doc_collection_path=None):
        collection = self.code_element_kg_builder.export_code_document_collection(code_doc_collection_path)
        return collection
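
# A minimal usage sketch (not from the original source): the file paths below are
# placeholders, and the call order simply follows the import methods defined above.
if __name__ == '__main__':
    builder = SkeletonKGBuilder()
    builder.import_primary_type()
    builder.import_normal_entity_json("entities.json")                        # placeholder path
    builder.import_normal_entity_relation_json("entity_relations.json")       # placeholder path
    builder.import_field_entity("fields.json", "field_relations.json")        # placeholder paths
    builder.import_parameter_entity("params.json", "param_relations.json")    # placeholder paths
    builder.import_return_value_entity("returns.json", "return_relations.json")
    builder.import_thrown_exceptions("exceptions.json", "exception_relations.json")
    builder.infer_extra_relation()
    builder.build_aliases()
    builder.save("project.skeleton.graph")                                    # placeholder path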
Example #21
0
 def __init__(self):
     self.graph_data = GraphData()
     self.text_extractor = EntityExtractor()
     self.detector = RelationDetector()
     self.identifier_info_extractor = IdentifierInfoExtractor()
Example #22
0
from definitions import OUTPUT_DIR
from pathlib import Path

pro_name = 'jabref'
dc_file_location = PathUtil.doc(pro_name=pro_name, version='v1')
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name,
                                               version='v1.8')
dc_file_destination = PathUtil.doc(pro_name=pro_name, version='v1.1')
comment_json_file = Path(OUTPUT_DIR) / "json" / "mid_2_dp_comment.json"
qualified_name_json_file = Path(
    OUTPUT_DIR) / "json" / "mid_2_qualified_name.json"

if __name__ == '__main__':
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)

    comment_list = []
    with open(comment_json_file, 'r') as f:
        for line in f:
            comment_list.append(json.loads(line))

    qualified_name_list = []
    with open(qualified_name_json_file, 'r') as f:
        for line in f:
            qualified_name_list.append(json.loads(line))

    missing_count = 0
    # use the qualified name to find the api_id of the matching node in graph data,
    # then use that api_id to locate the corresponding doc in doc_collection and insert the field and related info
    for item in qualified_name_list:
        qualified_name = item['qname']
Example #23
0
 def __init__(self, pro_name, version, model_dir):
     graph_data_path = PathUtil.graph_data(pro_name=pro_name,
                                           version=version)
     self.graph_data: GraphData = GraphData.load(graph_data_path)
     self.model = self.create_search_model(pro_name, version, model_dir)
     print("It's ok for init!")
Example #24
0
class DomainKGFusion:
    """
    build the skeleton KG from the JavaParser analysis result for the Project Source Code.
    It will include the package, class, interface, method.
    """

    STOPLIST = set(stopwords.words('english'))

    METHOD_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_METHOD),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_BASE_OVERRIDE_METHOD),
    }

    CLASS_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_CLASS),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_PACKAGE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_INTERFACE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_ENUM_CONSTANTS),
    }

    VARIABLE_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_FIELD),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_LOCAL_VARIABLE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_PARAMETER),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_FIELD_OF_CLASS),
    }

    def __init__(self):
        self.graph_data = GraphData()
        self.text_extractor = EntityExtractor()
        self.detector = RelationDetector()
        self.identifier_info_extractor = IdentifierInfoExtractor()

    def init_graph_data(self, graph_data_path):
        self.graph_data = GraphData.load(graph_data_path)

    def add_code_relation(self, start_node_id, relation_name, code_element):
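        # take the simple name: strip generic arguments ("<...>"), the package prefix, and any leading modifiers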
        name = code_element.split("<")[0].split(".")[-1].split(" ")[-1]
        if len(name) == 0:
            return
        node_json = self.graph_data.find_one_node_by_property(PropertyConstant.ALIAS, name)
        if node_json is None:
            return
        end_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
        self.graph_data.add_relation(startId=start_node_id,
                                     relationType=relation_name,
                                     endId=end_node_id)

    def handle_comment_in_class(self, node_id, node_properties):
        terms = set()
        linkages = set()

        comment = node_properties.get(PropertyConstant.COMMENT, "")
        domain_terms, code_elements = self.text_extractor.extract_from_comment(comment)
        for term in domain_terms:
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_COMENT.value, term))

        return terms, linkages

    def handle_text_in_method(self, node_id, node_properties):
        terms = set()
        linkages = set()

        comment = node_properties.get(PropertyConstant.COMMENT, "")
        domain_terms, code_elements = self.text_extractor.extract_from_comment(comment)
        for term in domain_terms:
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_COMENT.value, term))
        for element in code_elements:
            self.add_code_relation(node_id, RelationType.MENTION_IN_COMENT.value, element)

        for inside_comment in node_properties.get(PropertyConstant.INSIDE_COMMENT, []):
            domain_terms, code_elements = self.text_extractor.extract_from_sentence(inside_comment)
            for term in domain_terms:
                terms.add(term)
                linkages.add((node_id, RelationType.MENTION_IN_INSIDE_COMENT.value, term))
            for element in code_elements:
                self.add_code_relation(node_id, RelationType.MENTION_IN_INSIDE_COMENT.value, element)

        for literal_expr in node_properties.get(PropertyConstant.STRING_LITERAL_EXPR, []):

            domain_terms, code_elements = self.text_extractor.extract_from_comment(literal_expr)
            for term in domain_terms:
                terms.add(term)
                linkages.add((node_id, RelationType.MENTION_IN_STRING_LITERAL.value, term))
            for element in code_elements:
                self.add_code_relation(node_id, RelationType.MENTION_IN_STRING_LITERAL.value, element)

        return terms, linkages

    def handle_description(self, node_id, description):
        terms = set()
        linkages = set()

        domain_terms, code_elements = self.text_extractor.extract_from_sentence(description)
        for term in domain_terms:
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_SHORT_DESCRIPTION.value, term))
        for element in code_elements:
            self.add_code_relation(node_id, RelationType.MENTION_IN_SHORT_DESCRIPTION.value, element)
        return terms, linkages

    def handle_method_name(self, node_id, name):

        terms, operations, relations, linkages = self.identifier_info_extractor.extract_from_method_name(
            name, mark_for_identifier_in_relation=node_id)

        belong_to_relations = self.graph_data.get_relations(node_id, CodeEntityRelationCategory.to_str(
            CodeEntityRelationCategory.RELATION_CATEGORY_BELONG_TO))
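        # propagate the operations extracted from the method name to the class the method belongs to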
        if len(belong_to_relations) > 0:
            class_id = belong_to_relations.pop()[2]
            for op in operations:
                linkages.add((class_id, RelationType.HAS_OPERATION.value, op))

        return terms, operations, relations, linkages

    def handle_class_name(self, node_id, name):
        terms, relations, linkages = self.identifier_info_extractor.extract_from_class_name(name,
                                                                                            mark_for_identifier_in_relation=node_id)

        return terms, relations, linkages

    def handle_variable_name(self, node_id, name):
        terms, relations, linkages = self.identifier_info_extractor.extract_from_variable(name,
                                                                                          mark_for_identifier_in_relation=node_id)

        return terms, relations, linkages

    def extract_term_and_relation(self, term_save_path=None, operation_save_path=None, term_relation_save_path=None,
                                  linkage_save_path=None, term_aliases_save_path=None, not_fused_term_save_path=None):
        print("start extract term and relation from graph data")
        self.graph_data.print_graph_info()

        # cache the map for adding relation
        not_fused_terms = set()
        operations = set()
        relations = set()
        linkages = set()
        i = 0
        for node_id in list(self.graph_data.get_node_ids()):
            try:
                i = i + 1
                if (i % 100) == 0:
                    print("processed %d nodes so far" % i)
                node_json = self.graph_data.get_node_info_dict(node_id=node_id)
                if not node_json:
                    continue

                node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                node_labels = node_json[GraphData.DEFAULT_KEY_NODE_LABELS]

                if 'sentence' in node_labels:
                    terms_, linkages_ = self.handle_description(node_id, node_properties["sentence_name"])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    continue

                if len(node_labels & self.METHOD_LABELS) > 0:
                    terms_, linkages_ = self.handle_text_in_method(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)

                    terms_, operations_, relations_, linkages_ = self.handle_method_name(node_id, node_properties[
                        GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    operations.update(operations_)
                    relations.update(relations_)
                    linkages.update(linkages_)

                description = node_properties.get(PropertyConstant.DESCRIPTION, "")
                if description is not None and len(description) > 0:
                    terms_, linkages_ = self.handle_description(node_id, node_properties[PropertyConstant.DESCRIPTION])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)

                if len(node_labels & self.CLASS_LABELS) > 0:
                    terms_, linkages_ = self.handle_comment_in_class(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)

                    terms_, relations_, linkages_ = self.handle_class_name(node_id, node_properties[
                        GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    relations.update(relations_)

                if len(node_labels & self.VARIABLE_LABELS) > 0:
                    terms_, linkages_ = self.handle_comment_in_class(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)

                    terms_, relations_, linkages_ = self.handle_variable_name(node_id, node_properties[
                        GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    relations.update(relations_)

            except Exception:
                traceback.print_exc()
        new_terms = []
        for term in not_fused_terms:
            if self.valid_term(term):
                new_terms.append(term)
        not_fused_terms = set(new_terms)

        relations_ = self.detector.detect_relation_by_starfix(not_fused_terms)
        relations.update(relations_)

        print("complete domain extraction")
        term_fusion = Fusion()
        synsets = term_fusion.fuse_by_synonym(not_fused_terms)
        print("complete synonym fusion")

        fused_term_to_aliases_map = {}
        for synset in synsets:
            fused_term_to_aliases_map[synset.key] = list(synset.terms)

        fused_terms = fused_term_to_aliases_map.keys()

        new_relations = set()
        new_linkages = set()

        for start_e, relation_name, end_e in relations:
            if relation_name == "has operation":
                continue
            if relation_name == "can be operated":
                continue
            new_start_e_list = set()
            new_end_e_list = set()

            # rewrite both endpoints to their fused (canonical) terms where possible
            for fused_term, aliases in fused_term_to_aliases_map.items():
                if end_e in aliases:
                    new_end_e_list.add(fused_term)
                if start_e in aliases:
                    new_start_e_list.add(fused_term)

            # fall back to the original terms before building the product,
            # otherwise relations whose endpoints were never fused would be dropped
            if len(new_start_e_list) == 0:
                new_start_e_list.add(start_e)
            if len(new_end_e_list) == 0:
                new_end_e_list.add(end_e)

            for new_start_e in new_start_e_list:
                for new_end_e in new_end_e_list:
                    new_relations.add((new_start_e, relation_name, new_end_e))
        relations = new_relations

        for start_e, relation_name, end_e in linkages:
            if relation_name == "has operation":
                continue
            if relation_name == "can be operated":
                continue

            new_start_e_list = set()
            new_end_e_list = set()

            for fused_term, aliases in fused_term_to_aliases_map.items():
                if end_e in aliases:
                    new_end_e_list.add(fused_term)
                if start_e in aliases:
                    new_start_e_list.add(fused_term)
            if len(new_start_e_list) == 0:
                new_start_e_list.add(start_e)
            if len(new_end_e_list) == 0:
                new_end_e_list.add(end_e)

            for new_start_e in new_start_e_list:
                for new_end_e in new_end_e_list:
                    new_linkages.add((new_start_e, relation_name, new_end_e))

        linkages = new_linkages

        print("length of new_linkages %d" % (len(linkages)))

        import json

        if term_save_path is not None:
            with Path(term_save_path).open("w") as f:
                f.write("\n".join(sorted(fused_terms)))

        if not_fused_term_save_path is not None:
            with Path(not_fused_term_save_path).open("w") as f:
                f.write("\n".join(sorted(not_fused_terms)))

        if operation_save_path is not None:
            with Path(operation_save_path).open("w") as f:
                f.write("\n".join(sorted(operations)))

        if term_relation_save_path is not None:
            with Path(term_relation_save_path).open("w") as f:
                json.dump(
                    [(r[0], str(r[1]), r[2]) for r in relations if self.valid_term(r[0]) and self.valid_term(r[2])], f,
                    indent=4)

        if linkage_save_path is not None:
            with Path(linkage_save_path).open("w") as f:
                json.dump([(r[0], str(r[1]), r[2]) for r in linkages if self.valid_term(r[0]) or self.valid_term(r[2])],
                          f, indent=4)

        if term_aliases_save_path is not None:
            with Path(term_aliases_save_path).open("w") as f:
                json.dump(fused_term_to_aliases_map,
                          f, indent=4)

        return fused_terms, operations, relations, linkages, fused_term_to_aliases_map

    def select_name(self, terms):
        # prefer the shortest alias as the display name
        return min(terms, key=len)

    def valid_term(self, term):
        term = str(term)
        # reject very short terms, pure numbers, and over-long multi-word phrases
        if len(term) <= 2 or term.isdigit() or (len(term) > 30 and len(term.split()) > 4):
            return False
        # reject terms that start with a stopword, e.g. "the parser"
        prefix, *rest = term.split()
        if prefix in self.STOPLIST:
            return False
        return True

    def add_domain_term(self, term, lemma, aliases):
        """
        add a new term node to graph data
        :param term: the term added to GraphData
        :param lemma: the lemmatized form of the term
        :param aliases: an optional collection of alias names for the term
        :return: the node_id of the added term node
        """
        if aliases is None:
            aliases = set()
        else:
            aliases = set(aliases)
        aliases.add(lemma)
        aliases.add(term)

        node_labels = [DomainConstant.LABEL_DOMAIN_TERM]
        node_properties = {
            DomainConstant.PRIMARY_PROPERTY_NAME: term,
            PropertyConstant.ALIAS: aliases,
            PropertyConstant.LEMMA: lemma
        }
        domain_term_node_id = self.graph_data.add_node(node_labels=node_labels,
                                                       node_properties=node_properties,
                                                       primary_property_name=DomainConstant.PRIMARY_PROPERTY_NAME)
        return domain_term_node_id

    def add_operation(self, op, lemma):
        node_labels = [OperationConstance.LABEL_OPERATION]
        node_properties = {
            OperationConstance.PRIMARY_PROPERTY_NAME: op,
            PropertyConstant.ALIAS: {op},
            PropertyConstant.LEMMA: lemma
        }
        operation_node_id = self.graph_data.add_node(node_labels=node_labels,
                                                     node_properties=node_properties,
                                                     primary_property_name=OperationConstance.PRIMARY_PROPERTY_NAME)
        return operation_node_id

    def update_domain_node_alias(self, node_id, term):
        node_json = self.graph_data.get_node_info_dict(node_id=node_id)
        if not node_json:
            return
        node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
        alias = node_properties[PropertyConstant.ALIAS]
        alias.add(term)
        name = self.select_name(alias)
        # update the primary name inside the node properties, then refresh the index
        node_properties[DomainConstant.PRIMARY_PROPERTY_NAME] = name
        self.graph_data.update_node_index(node_id=node_id)

    def update_operation_node_alias(self, node_id, term):
        node_json = self.graph_data.get_node_info_dict(node_id=node_id)
        if not node_json:
            return
        node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
        alias = node_properties[PropertyConstant.ALIAS]
        alias.add(term)
        name = self.select_name(alias)
        node_properties[OperationConstance.PRIMARY_PROPERTY_NAME] = name
        self.graph_data.update_node_index(node_id=node_id)

    def add_relation_for_same_name_operation_and_domain_term(self):
        operation_node_ids = self.graph_data.get_node_ids_by_label(OperationConstance.LABEL_OPERATION)

        for operation_id in operation_node_ids:
            operation_node = self.graph_data.get_node_info_dict(operation_id)

            operation_name = operation_node[GraphData.DEFAULT_KEY_NODE_PROPERTIES][
                OperationConstance.PRIMARY_PROPERTY_NAME]

            domain_term_node = self.graph_data.find_one_node_by_property(
                property_name=DomainConstant.PRIMARY_PROPERTY_NAME, property_value=operation_name)

            if domain_term_node is None:
                continue
            domain_term_node_id = domain_term_node[GraphData.DEFAULT_KEY_NODE_ID]

            self.graph_data.add_relation(operation_id, "corresponding concept", domain_term_node_id)
            self.graph_data.add_relation(domain_term_node_id, "corresponding operation", operation_id)

    def save(self, graph_data_path):
        self.graph_data.save(graph_data_path)

    def fuse(self, terms, operations, relations, linkages, aliases_map):
        """
        start import the term and their relation to graph
        :param term_origins:
        :param term_relations:
        :return:
        """
        # domain_graph_data = GraphData()

        self.graph_data.create_index_on_property(DomainConstant.PRIMARY_PROPERTY_NAME)
        self.graph_data.create_index_on_property(OperationConstance.PRIMARY_PROPERTY_NAME)
        self.graph_data.create_index_on_property(PropertyConstant.ALIAS)

        # todo:update the index when add
        print("start fuse with domain knowledge")
        self.graph_data.print_graph_info()

        term_lemma2id = {}
        term_name2id = {}
        op_lemma2id = {}
        op_name2id = {}

        def __add_or_update(name, is_op=False):
            if is_op:
                lemma = name.lower()
                if lemma in op_lemma2id:
                    node_id = op_lemma2id[lemma]
                    self.update_operation_node_alias(node_id, name)
                else:
                    node_id = self.add_operation(name, lemma)
                    if node_id == GraphData.UNASSIGNED_NODE_ID:
                        print("adding operation %r fail" % name)
                        return node_id
                    op_lemma2id[lemma] = node_id
                op_name2id[name] = node_id
            else:
                lemma = name.replace("-", " ").replace("  ", " ").lower()
                lemma = re.sub('([^v])([0-9]+)', r'\1 \2', lemma)
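                # split trailing digits off words (e.g. "utf8" -> "utf 8") while keeping version tokens like "v2" intact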
                node_id = self.add_domain_term(name, lemma, aliases=aliases_map.get(name, None))

                if node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("adding domain term %r fail" % name)
                    return node_id
                term_lemma2id[lemma] = node_id
                term_name2id[name] = node_id
            return node_id

        for term in sorted(terms, key=lambda x: len(x.split())):
            __add_or_update(term)

        for op in operations:
            __add_or_update(op, is_op=True)

        def __add_relation(start_term, relation_name, end_term):
            start_name2id = term_name2id
            end_name2id = term_name2id
            start_term_is_op = False
            end_term_is_op = False
            # if relation_name.startswith("operation_"):
            #     start_name2id = op_name2id
            #     start_term_is_op = True
            if relation_name == "has operation":
                end_name2id = op_name2id
                end_term_is_op = True
            if relation_name == "instance of":
                end_name2id = op_name2id
                end_term_is_op = True
            if relation_name == "can be operated":
                end_name2id = op_name2id
                end_term_is_op = True

            if type(start_term) == int:
                start_node_id = start_term
            else:
                if start_term in start_name2id:
                    start_node_id = start_name2id[start_term]
                else:
                    start_node_id = __add_or_update(start_term, is_op=start_term_is_op)
                if start_node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("adding start_domain term %r fail for relation %r" % (
                        start_term, (start_term, relation_name, end_term)))
                    return
            if type(end_term) == int:
                end_node_id = end_term
            else:
                if end_term in end_name2id:
                    end_node_id = end_name2id[end_term]
                else:
                    end_node_id = __add_or_update(end_term, is_op=end_term_is_op)
                if end_node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("adding end_domain term %r fail for relation %r" % (
                        end_term, (start_term, relation_name, end_term)))
                    return

            self.graph_data.add_relation(startId=start_node_id,
                                         relationType=relation_name,
                                         endId=end_node_id)

        for (start_term, relation_name, end_term) in relations:
            __add_relation(start_term, relation_name, end_term)

        for (start_term, relation_name, end_term) in linkages:
            __add_relation(start_term, relation_name, end_term)

        isA_relations = set()
        for (start_id, _, end_id) in self.graph_data.get_relations(
                relation_type=CodeEntityRelationCategory.to_str(CodeEntityRelationCategory.RELATION_CATEGORY_EXTENDS)):
            start_domain_ids = {e for _, _, e in self.graph_data.get_relations(start_id=start_id,
                                                                               relation_type=RelationType.REPRESENT.value)}
            # the end side must start from end_id, not start_id
            end_domain_ids = {e for _, _, e in self.graph_data.get_relations(start_id=end_id,
                                                                             relation_type=RelationType.REPRESENT.value)}
            for s in start_domain_ids:
                for e in end_domain_ids:
                    isA_relations.add((s, RelationType.IS_A.value, e))
        for r in isA_relations:
            __add_relation(*r)

        self.add_relation_for_same_name_operation_and_domain_term()
        print("end fuse with domain knowledge")
        self.graph_data.refresh_indexer()
        self.graph_data.print_graph_info()

    def build_aliases_for_domain_term_and_operations(self, new_all_aliases_save_path=None):
        name_util = ConceptElementNameUtil()
        domain_term_ids = self.graph_data.get_node_ids_by_label(DomainConstant.LABEL_DOMAIN_TERM)
        term_name_list = []
        fused_term_to_aliases_map = {}

        for domain_term_id in domain_term_ids:
            node_json = self.graph_data.get_node_info_dict(domain_term_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][DomainConstant.PRIMARY_PROPERTY_NAME]
            term_name_list.append(term_name)

        for domain_term_id in domain_term_ids:
            node_json = self.graph_data.get_node_info_dict(domain_term_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][DomainConstant.PRIMARY_PROPERTY_NAME]

            all_aliases_list = set([])

            generated_aliases = name_util.generate_aliases(term_name, vocabulary=term_name_list)
            all_aliases_list = all_aliases_list | set(generated_aliases)

            exist_aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, set([]))

            for alias in exist_aliases:
                all_aliases_list.add(alias)
                # expand from each existing alias rather than regenerating from term_name
                generated_aliases = name_util.generate_aliases(alias, vocabulary=term_name_list)
                all_aliases_list = all_aliases_list | set(generated_aliases)

            node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.ALIAS] = all_aliases_list
            fused_term_to_aliases_map[term_name] = list(all_aliases_list)

        operation_ids = self.graph_data.get_node_ids_by_label(OperationConstance.LABEL_OPERATION)
        # todo: build the relation between operation and domain term
        for operation_id in operation_ids:
            node_json = self.graph_data.get_node_info_dict(operation_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][OperationConstance.PRIMARY_PROPERTY_NAME]

            synsets = wn.synsets(term_name, pos="v")
            generated_aliases = [synset.name().split(".")[0] for synset in synsets]

            exist_aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, set([]))
            for alias in generated_aliases:
                exist_aliases.add(alias)

            node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.ALIAS] = exist_aliases

        if new_all_aliases_save_path is not None:
            with Path(new_all_aliases_save_path).open("w") as f:
                json.dump(fused_term_to_aliases_map,
                          f, indent=4)

        # todo: build the aliases for operation
        self.graph_data.refresh_indexer()

    def delete_islocated_nodes_by_label(self, label):
        domain_node_ids = self.graph_data.get_node_ids_by_label(label)
        remove_ids = []
        for domain_id in domain_node_ids:
            out_ids = self.graph_data.get_all_out_relations(domain_id)
            in_ids = self.graph_data.get_all_in_relations(domain_id)
            if not out_ids and not in_ids:
                remove_ids.append(domain_id)
        print("delete %d isolated %s nodes" % (len(remove_ids), label))
        for node_id in remove_ids:
            self.graph_data.remove_node(node_id)
        return self.graph_data

    def delete_nodes_and_relations(self, name_list):
        for name in name_list:
            node_info = self.graph_data.find_one_node_by_property(DomainConstant.PRIMARY_PROPERTY_NAME, name)
            if node_info:
                node_id = node_info["id"]
                out_relations = self.graph_data.get_all_out_relations(node_id)
                in_relations = self.graph_data.get_all_in_relations(node_id)
                for s, r, e in out_relations.union(in_relations):
                    # print('delete relation %d, %s, %d' % (s, r, e))
                    self.graph_data.remove_relation(s, r, e)
                self.graph_data.remove_node(node_id)
                # print("delete node %d" % (node_id))
            else:
                print("can't find node for %s" % (name))
        return self.graph_data
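
# A minimal usage sketch (not from the original source): the paths are placeholders,
# and the call order follows the methods of DomainKGFusion defined above.
if __name__ == '__main__':
    fusion = DomainKGFusion()
    fusion.init_graph_data("project.skeleton.graph")          # placeholder path
    terms, operations, relations, linkages, aliases_map = fusion.extract_term_and_relation(
        term_save_path="terms.txt",                           # placeholder paths
        operation_save_path="operations.txt",
        term_relation_save_path="term_relations.json",
        linkage_save_path="linkages.json",
        term_aliases_save_path="term_aliases.json")
    fusion.fuse(terms, operations, relations, linkages, aliases_map)
    fusion.build_aliases_for_domain_term_and_operations("all_aliases.json")
    fusion.delete_islocated_nodes_by_label(DomainConstant.LABEL_DOMAIN_TERM)
    fusion.save("project.domain.graph")                       # placeholder path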