Пример #1
0
    def __init__(self, graph_data):
        """
        set up the builder from a graph
        :param graph_data: a GraphData obj, or a Path / str pointing to a saved GraphData.
        Any other value leaves self.graph_data as None.
        """
        # a Path and a str name the same kind of location, so handle both as str
        source = str(graph_data) if isinstance(graph_data, Path) else graph_data

        if isinstance(source, GraphData):
            loaded_graph = source
        elif isinstance(source, str):
            loaded_graph = GraphData.load(source)
        else:
            loaded_graph = None
        self.graph_data = loaded_graph

        self.graph_data_reader = GraphDataReader(
            graph_data=self.graph_data,
            node_info_factory=ProjectKGNodeInfoFactory(),
        )

        # starts empty; filled later by init() or the build_* methods
        self.doc_collection = MultiFieldDocumentCollection()
Пример #2
0
def train_model(pro_name, version, first_model_config, second_model_config):
    """
    Train a compound search model on top of two sub models.

    :param pro_name: project name, used to locate the doc collection and the model dirs
    :param version: version, used together with pro_name
    :param first_model_config: indexable config of the first sub model; [0] is its model
        type (used for its dir and for the compound name), [1] and [2] are passed through
    :param second_model_config: same layout as first_model_config, for the second sub model
    :return: the directory the compound model was trained into
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    def sub_model_entry(config, flag):
        # (model dir, config[1], config[2], flag) - the tuple layout handed to
        # CompoundSearchModel.train; the meaning of the flag is not visible here
        sub_model_dir = PathUtil.sim_model(pro_name=pro_name,
                                           version=version,
                                           model_type=config[0])
        return sub_model_dir, config[1], config[2], flag

    sub_search_model_config = [
        sub_model_entry(first_model_config, False),
        sub_model_entry(second_model_config, True),
    ]

    compound_model_name = f"compound_{first_model_config[0]}+{second_model_config[0]}"
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type=compound_model_name)

    # the trained model object is not needed here, only the directory is returned
    CompoundSearchModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        sub_search_model_config=sub_search_model_config)

    return model_dir_path
Пример #3
0
 def load_doc(project_name="android27", version="v1"):
     """
     Load the document collection stored for a project.

     :param project_name: e.g. jdk8 or android27
     :param version: version label handed to PathUtil.doc
     :return: the result of MultiFieldDocumentCollection.load for that path
     """
     return MultiFieldDocumentCollection.load(
         PathUtil.doc(pro_name=project_name, version=version))
Пример #4
0
    def init(self, doc_collection):
        """
        replace the working doc collection with an existing one
        :param doc_collection: an existing MultiFieldDocumentCollection obj, or a Path / str
        pointing to a saved one. Any other non-None value leaves self.doc_collection as None.
        :return:
        """
        if doc_collection is None:
            raise Exception("init from None")

        # a Path and a str name the same kind of location, so handle both as str
        source = str(doc_collection) if isinstance(doc_collection, Path) else doc_collection

        if isinstance(source, MultiFieldDocumentCollection):
            loaded = source
        elif isinstance(source, str):
            loaded = MultiFieldDocumentCollection.load(source)
        else:
            loaded = None
        self.doc_collection = loaded

        print("init complete")
Пример #5
0
 def __init__(self, graph_data_path, dc_file_location, concepts_path,
              relations_path):
     """
     Load the graph, the doc collection and the two JSON lists this object works on.

     :param graph_data_path: location handed to GraphData.load
     :param dc_file_location: location handed to MultiFieldDocumentCollection.load
     :param concepts_path: JSON file whose parsed content becomes self.concepts_list
     :param relations_path: JSON file whose parsed content becomes self.relations_list
     """
     self.graph: GraphData = GraphData.load(graph_data_path)
     self.doc_collection: MultiFieldDocumentCollection = (
         MultiFieldDocumentCollection.load(dc_file_location))

     with open(concepts_path) as concepts_file:
         self.concepts_list = json.load(concepts_file)

     with open(relations_path) as relations_file:
         self.relations_list = json.load(relations_file)

     # starts empty; nothing in this constructor fills it
     self.concept_2_node_id = dict()
def build_doc(pro_name, version):
    """
    Preprocess the stored doc collection of a project and save the result.

    The input is read from PathUtil.doc and the output is written to PathUtil.pre_doc
    with pre_way "code-pre"; a CodeDocPreprocessor does the preprocessing.

    :param pro_name: project name
    :param version: version, used for both the input and the output location
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(pro_name=pro_name,
                                   version=version,
                                   pre_way="code-pre")

    raw_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        source_path)
    PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(),
        doc_collection=raw_collection,
    ).save(target_path)
Пример #7
0
    def build_doc_for_kg(self, output_path=None):
        """
        rebuild the full doc collection, then derive the reduced collection for the kg

        Every derived doc has a single "doc" field. For code element, domain entity,
        operation entity and wikidata entity nodes it joins main name, aliases, out relations
        and description; for any other node it joins the short description sentences and
        the out relations. Empty parts are left out. Nodes without a built doc are skipped.
        :param output_path: where to save the derived collection; nothing is saved when None
        :return: the derived MultiFieldDocumentCollection
        """
        self.clear()
        self.build_doc()

        entity_kinds = (CodeElementNodeInfo, DomainEntityNodeInfo,
                        OperationEntityNodeInfo, WikidataEntityNodeInfo)
        kg_collection = MultiFieldDocumentCollection()
        reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())

        for node_id in self.graph_data.get_node_ids():
            node_info = reader.get_node_info(node_id)
            source_doc = self.doc_collection.get_by_id(node_id)
            if source_doc is None:
                continue

            kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)

            if isinstance(node_info, entity_kinds):
                parts = [node_info.get_main_name()]
                field_names = ("aliases", "out_relations", "description")
            else:
                parts = []
                field_names = ("short_description_sentences", "out_relations")
            parts.extend(source_doc.get_doc_text_by_field(field_name) for field_name in field_names)

            text = "\n".join(part for part in parts if part)
            kg_doc.add_field("doc", text)
            # print a sample of the generated text now and then
            if node_id % 2000 == 0:
                print("doc:%r" % text)
            kg_collection.add_document(kg_doc)

        if output_path is not None:
            kg_collection.save(output_path)
        print("collection len{}".format(kg_collection.get_num()))
        return kg_collection
Пример #8
0
def train_model(pro_name, version):
    """
    Train a BM25 model on the preprocessed doc collection of a project.

    :param pro_name: project name
    :param version: version, used for both the doc collection and the model dir
    :return: the directory the model was trained into (model type "bm25")
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_collection)

    bm25_dir = PathUtil.sim_model(pro_name=pro_name,
                                  version=version,
                                  model_type="bm25")
    BM25Model.train(bm25_dir, doc_collection=preprocessed)
    return bm25_dir
Пример #9
0
    def build_pre_doc(self, input_doc_collection_path, output_pre_doc_collection_path, preprocessor=None):
        """
        preprocess a saved doc collection and save the preprocessed collection
        :param input_doc_collection_path: location of the MultiFieldDocumentCollection to load
        :param output_pre_doc_collection_path: location the preprocessed collection is saved to
        :param preprocessor: the preprocessor to apply; a CodeDocPreprocessor is created when None
        :return:
        """
        # identity check: "== None" would be routed through a custom __eq__ of the preprocessor
        if preprocessor is None:
            preprocessor = CodeDocPreprocessor()

        print("stat preprocess doc - for %s %r " % (input_doc_collection_path, preprocessor))
        doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
        preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
            preprocessor=preprocessor, doc_collection=doc_collection)

        preprocessed_collection.save(output_pre_doc_collection_path)
        print("end preprocess doc - %r %r " % (output_pre_doc_collection_path, preprocessor))
Пример #10
0
def train_avg_w2v_model(pro_name, version):
    """
    Preprocess the doc collection of a project, save it, and train the avg_w2v model on it.

    :param pro_name: project name
    :param version: version, used for the doc collection, the saved preprocessed
        collection (pre_way "code-pre") and the model dir
    :return: the directory the model was trained into (model type "avg_w2v")
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # keep the preprocessed collection on disk before training
    preprocessed.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(pro_name=pro_name,
                                   version=version,
                                   model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=model_dir,
                        doc_collection=preprocessed)
    return model_dir
Пример #11
0
 def __init__(self, pro_name, version):
     """
     Collect everything needed for the "svm" model of one project version.

     Creates the model object, loads and preprocesses the doc collection, and records
     the locations of the unweighted node2vec data, the kg name searcher and the
     "avg_w2v" model.

     :param pro_name: project name
     :param version: version, used for every location looked up here
     """

     def sim_model_dir(model_type):
         # every similarity model dir differs only by its model type
         return PathUtil.sim_model(pro_name=pro_name,
                                   version=version,
                                   model_type=model_type)

     self.model_dir_path = sim_model_dir("svm")
     self.model = FilterSemanticTFIDFNode2VectorModel(
         name="svm", model_dir_path=self.model_dir_path)

     self.document_collection_path = PathUtil.doc(pro_name, version)
     self.collection = MultiFieldDocumentCollection.load(
         str(self.document_collection_path))
     self.processor = Preprocessor()
     self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
         self.processor, self.collection)

     self.pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name,
                                                     version=version,
                                                     weight="unweight")
     self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
     self.doc_sim_model_path = sim_model_dir("avg_w2v")
Пример #12
0
    def extract_kg_doc_collection(self, output_path):
        """
        derive a reduced doc collection from self.doc_collection and save it

        Every derived doc has a single "doc" field joining main name, aliases, out relations
        and description (empty parts left out). The text is added once for each of the four
        entity kinds the node is an instance of; a node of no such kind gets an empty text.
        Nodes without a doc in self.doc_collection are skipped.
        :param output_path: where the derived collection is saved
        :return:
        """
        described_kinds = (CodeElementNodeInfo, DomainEntityNodeInfo,
                           OperationEntityNodeInfo, WikidataEntityNodeInfo)
        text_fields = ("aliases", "out_relations", "description")

        kg_collection = MultiFieldDocumentCollection()
        reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())

        for node_id in self.graph_data.get_node_ids():
            node_info = reader.get_node_info(node_id)
            source_doc = self.doc_collection.get_by_id(node_id)
            if source_doc is None:
                continue

            kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)

            parts = []
            # the kinds are checked one by one, not as alternatives
            for kind in described_kinds:
                if isinstance(node_info, kind):
                    parts.append(node_info.get_main_name())
                    parts.extend(source_doc.get_doc_text_by_field(field_name) for field_name in text_fields)

            text = "\n".join(part for part in parts if part)
            kg_doc.add_field("doc", text)
            # print a sample of the generated text now and then
            if node_id % 2000 == 0:
                print("doc:%r" % text)
            kg_collection.add_document(kg_doc)

        kg_collection.save(output_path)
Пример #13
0
def train_model(pro_name, version, weight):
    """
    Train the avg_n2v model on the preprocessed doc collection of a project.

    :param pro_name: project name
    :param version: version, used for every location looked up here
    :param weight: handed to PathUtil.node2vec to pick the pretrained node2vec data
    :return: the directory the model was trained into (model type "avg_n2v")
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    graph_location = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_location = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    name_searcher_location = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")

    # the trained model object is not needed here, only the directory is returned
    AVGNode2VectorModel.train(
        model_dir_path=model_dir,
        doc_collection=preprocessed,
        embedding_size=100,
        pretrain_node2vec_path=node2vec_location,
        graph_data_path=graph_location,
        kg_name_searcher_path=name_searcher_location,
    )
    return model_dir
Пример #14
0
    def extract_kg_doc_collection_with_method(self, output_path, **config):
        """
        derive a filtered doc collection from self.doc_collection and save it

        Every node that has a doc gets a derived doc with a single "doc" field. The text
        comes from add_description when the node passes the filters, otherwise it is empty.
        :param output_path: where the derived collection is saved
        :param config: optional switches, each False when absent:
            no_out_relation / no_in_relation - handed on to add_description
            no_jdk - only describe nodes without the "jdk8" label
            with_method - only describe nodes with the "method" label
        :return:
        """
        print("start building doc")
        no_out_relation = config.get("no_out_relation", False)
        no_in_relation = config.get("no_in_relation", False)
        no_jdk = config.get("no_jdk", False)
        method = config.get("with_method", False)
        print("*" * 20)
        print("no_in_relation %r, no_out_relation %r, no_jdk %r, with_method %r" % (
            no_in_relation, no_out_relation, no_jdk, method))

        kg_collection = MultiFieldDocumentCollection()
        reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())

        for node_id in self.graph_data.get_node_ids():
            node_info = reader.get_node_info(node_id)
            source_doc = self.doc_collection.get_by_id(node_id)
            if source_doc is None:
                continue
            kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)

            # a switch that is off lets every node pass; labels are only read for active switches
            if (not method or "method" in node_info.labels) and (
                    not no_jdk or "jdk8" not in node_info.labels):
                parts = self.add_description(node_info, source_doc, no_out_relation, no_in_relation)
            else:
                parts = []

            text = "\n".join(part for part in parts if part)
            kg_doc.add_field("doc", text)
            # print a sample of the generated text now and then
            if node_id % 2000 == 0:
                print("doc:%r" % text)
            kg_collection.add_document(kg_doc)
        kg_collection.save(output_path)
Пример #15
0
from sekg.graph.exporter.graph_data import NodeInfo
import json
from definitions import OUTPUT_DIR
from pathlib import Path

# Fixed inputs and outputs of this script: the v1 doc collection is read together with
# the v1.8 graph, and v1.1 is the destination location for the doc collection.
pro_name = "jabref"
dc_file_location = PathUtil.doc(pro_name=pro_name, version="v1")
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name, version="v1.8")
dc_file_destination = PathUtil.doc(pro_name=pro_name, version="v1.1")

# Both JSON inputs live in the "json" folder under OUTPUT_DIR.
comment_json_file = Path(OUTPUT_DIR, "json", "mid_2_dp_comment.json")
qualified_name_json_file = Path(OUTPUT_DIR, "json", "mid_2_qualified_name.json")

if __name__ == '__main__':
    # Load the doc collection and the graph this script works on.
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)

    # Both input files are parsed line by line: every line is one JSON value.
    # NOTE(review): the file objects returned by open() below are never closed
    # explicitly; a with-block would release them deterministically.
    comment_list = []
    comments = open(comment_json_file, 'r').readlines()
    for line in comments:
        comment_list.append(json.loads(line))

    qualified_name_list = []
    names = open(qualified_name_json_file, 'r').readlines()
    for line in names:
        qualified_name_list.append(json.loads(line))

    missing_count = 0
    # Use the qualified name to find the api_id of the matching node in the graph data, then use that api_id to find the matching doc in doc_collection and insert the field and its information
    for item in qualified_name_list:
Пример #16
0
from flask_cors import CORS
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from sekg.graph.exporter.graph_data import GraphData, NodeInfo

from project.knowledge_service import KnowledgeService
from project.doc_service import DocService
from project.json_service import JsonService
from project.utils.path_util import PathUtil

# Flask application; CORS is enabled for every route ("/*") and any origin ("*").
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# Everything below runs once at import time: the graph (v1.8) and the doc
# collection (v1.2) of the project are loaded before any request is served.
pro_name = "jabref"
data_dir = PathUtil.doc(pro_name=pro_name, version="v1.2")
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.8")
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    data_dir)
# Service objects; only the knowledge service receives the doc collection.
knowledge_service = KnowledgeService(doc_collection)
doc_service = DocService()
json_service = JsonService()


@app.route("/")
def hello():
    """Root endpoint: always answers with the text 'success'."""
    return "success"


# search doc info according to method name
@app.route('/get_doc/', methods=["GET", "POST"])
def doc_info():
    """
    Endpoint for looking up doc info; expects "qualified_name" in the JSON body.

    NOTE(review): only the missing-key check is visible here; the rest of the
    handler appears to be cut off. The check itself assumes request.json is a
    container - TODO confirm what happens for requests without a JSON body.
    """
    # reject requests that do not name what to look up
    if "qualified_name" not in request.json:
        return "qualified name need"
'''
Cluster the sample code into groups and output the result
'''

pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.9")
doc_collection_path = PathUtil.doc(pro_name=pro_name, version="v3.2")
doc_collection_save_path = PathUtil.doc(pro_name=pro_name, version="v3.3")
api_to_example_json_path = Path(
    definitions.ROOT_DIR) / "output" / "json" / "api_2_example_sorted.json"
mid_to_method_info_json_path = Path(
    definitions.ROOT_DIR
) / "output" / "json" / "mid_2_method_info_without_comment.json"

graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_collection_path)

# Read the sample code files. api_to_mid: the mids of the sample code belonging to each api. methods_info: the code belonging to each mid
# The with-block already closes the file, so no explicit close() is needed.
with open(api_to_example_json_path, 'r') as f:
    api_to_mid = json.load(f)
# This file holds one JSON object per line; only its 'method' entry is kept.
# Reading inside a with-block makes sure the handle is released.
methods_info = list()
with open(mid_to_method_info_json_path, 'r') as f:
    methods = f.readlines()
for method in methods:
    methods_info.append(json.loads(method)['method'])


# Look up the doc by its qualified name
def find_doc(qualified_name):
    # Find the graph node whose 'qualified_name' property equals the given name.
    # NOTE(review): the function appears to be cut off after this lookup; what it
    # does with the node (and what it returns) is not visible here.
    node: NodeInfo = graph_data.find_one_node_by_property(
        property_name='qualified_name', property_value=qualified_name)
Пример #18
0
 def __init__(self):
     """Load the v3.3 doc collection of the "jabref" project into self.doc_collection."""
     collection_location = PathUtil.doc(pro_name="jabref", version="v3.3")
     self.doc_collection: MultiFieldDocumentCollection = (
         MultiFieldDocumentCollection.load(collection_location))
Пример #19
0
class GraphNodeDocumentBuilder:
    """
    build the basic Node Document from a exist NodeDocument
    """

    def __init__(self, graph_data):
        """
        set up the builder from a graph
        :param graph_data: a GraphData obj, or a Path / str pointing to a saved GraphData.
        Any other value leaves self.graph_data as None.
        """
        # a Path and a str name the same kind of location, so handle both as str
        source = str(graph_data) if isinstance(graph_data, Path) else graph_data

        if isinstance(source, GraphData):
            loaded_graph = source
        elif isinstance(source, str):
            loaded_graph = GraphData.load(source)
        else:
            loaded_graph = None
        self.graph_data = loaded_graph

        self.graph_data_reader = GraphDataReader(
            graph_data=self.graph_data,
            node_info_factory=ProjectKGNodeInfoFactory(),
        )

        # starts empty; filled later by init() or the build_* methods
        self.doc_collection = MultiFieldDocumentCollection()

    def init(self, doc_collection):
        """
        replace the working doc collection with an existing one
        :param doc_collection: an existing MultiFieldDocumentCollection obj, or a Path / str
        pointing to a saved one. Any other non-None value leaves self.doc_collection as None.
        :return:
        """
        if doc_collection is None:
            raise Exception("init from None")

        # a Path and a str name the same kind of location, so handle both as str
        source = str(doc_collection) if isinstance(doc_collection, Path) else doc_collection

        if isinstance(source, MultiFieldDocumentCollection):
            loaded = source
        elif isinstance(source, str):
            loaded = MultiFieldDocumentCollection.load(source)
        else:
            loaded = None
        self.doc_collection = loaded

        print("init complete")

    def save(self, output_path):
        """
        save the current doc collection
        :param output_path: handed unchanged to the collection's save
        :return:
        """
        current_collection = self.doc_collection
        current_collection.save(output_path)

    def build_doc_for_code_element(self, node_info: CodeElementNodeInfo):
        """
        fill the doc fields of one code element node from its properties

        Fields written on the doc with id node_info.node_id:
        "comment" / "declare" - the cleaned text of that property, only when the node has it
        "description" - the cleaned texts of all listed properties, joined
        "aliases" - all names of the node, joined
        "out_relations" / "in_relations" - via add_text_for_out_relation / add_text_for_in_relation
        :param node_info: the code element node to describe
        :return:
        """
        all_texts = []
        properties = ["short_description", "string_literal_expr", "comment", "declare", "inside_comment"]
        own_field_properties = ("comment", "declare")
        for property_name in properties:
            if property_name not in node_info.properties:
                continue
            property_value = node_info.properties[property_name]

            # a property is either one text or a list of texts; anything else is ignored
            if isinstance(property_value, str):
                cleaned_values = [self.clean_comment(property_value)]
            elif isinstance(property_value, list):
                cleaned_values = [self.clean_comment(value) for value in property_value]
            else:
                cleaned_values = []

            if property_name in own_field_properties:
                # list values are concatenated; the previous "None += text" raised a TypeError
                # here. Without any text the field value stays None, as before.
                field_value = "".join(cleaned_values) if cleaned_values else None
                self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name=property_name,
                                                     value=field_value)
            all_texts.extend(cleaned_values)

        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="description",
                                             value=" . \n".join(all_texts))

        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value=" \n ".join(node_info.get_all_names()))

        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def add_text_for_out_relation(self, node_id):
        """
        write the "out_relations" field of a node's doc

        The field lists the main names of the end nodes of all out relations of the node;
        end nodes without a main name are left out. Only the end node name is used, the
        relation name is not part of the text.
        :param node_id: id of the node, also the id of the doc that gets the field
        :return:
        """
        end_names = (
            relation_info.end_node_info.get_main_name()
            for relation_info in self.graph_data_reader.get_all_out_relation_infos(node_id=node_id)
        )
        named_targets = [name for name in end_names if name is not None]

        self.doc_collection.add_field_to_doc(doc_id=node_id, field_name="out_relations",
                                             value=" .\n ".join(named_targets))

    def add_text_for_in_relation(self, node_id):
        """
        write the "in_relations" field of a node's doc

        The field lists the main names of the start nodes of all in relations of the node;
        start nodes without a main name are left out. Only the start node name is used, the
        relation name is not part of the text. A relation that fails is reported and skipped.
        :param node_id: id of the node, also the id of the doc that gets the field
        :return:
        """
        relation_infos = self.graph_data_reader.get_all_in_relation_infos(node_id=node_id)
        related_sentences_text = []
        for relation_info in relation_infos:
            # reset for every relation, so the error report below never reads an unbound
            # name or shows the start node of an earlier relation
            start_node_info = None
            try:
                start_node_info = relation_info.start_node_info
                relation_text = start_node_info.get_main_name()
                if relation_text is not None:
                    related_sentences_text.append(relation_text)
            except Exception:
                # best effort: report and go on, but let KeyboardInterrupt / SystemExit through
                print("add text error:%r" % (start_node_info,))
                traceback.print_exc()
        join_relation_text = " .\n ".join(related_sentences_text)
        self.doc_collection.add_field_to_doc(doc_id=node_id, field_name="in_relations",
                                             value=join_relation_text)

    def clean_comment(self, description):
        """
        turn a raw comment into plain text
        :param description: the raw text, parsed with the "lxml" parser to drop any markup
        :return: the text without surrounding whitespace and without leading / trailing
        "/" and "*" characters; "" when anything goes wrong (the traceback is printed)
        """
        try:
            plain_text = BeautifulSoup(description, "lxml").get_text()
            # strip() takes a set of characters, so "/*" alone already removes every
            # leading and trailing "/" and "*"
            without_markers = plain_text.strip().strip("/*")
            return without_markers.strip()
        except Exception:
            traceback.print_exc()
            return ""

    def build_doc_for_domain_entity(self, node_info: DomainEntityNodeInfo):
        """
        fill the doc fields of one domain entity node

        Fields written on the doc with id node_info.node_id:
        "aliases" - all names of the node, joined
        "description" - the "descriptions_en" property followed by the texts that mention
        the entity (taken from the start nodes of the "mention in ..." relations)
        "out_relations" / "in_relations" - via add_text_for_out_relation / add_text_for_in_relation
        :param node_info: the domain entity node to describe
        :return:
        """
        # relation type -> property names of the start node that may hold the mentioning text.
        # build_doc_for_code_element reads the inside comment from "inside_comment"; the
        # spelling with a space that was used here before is kept as a fallback.
        mention_properties = {
            "mention in comment": ("comment",),
            "mention in inside comment": ("inside_comment", "inside comment"),
            "mention in short description": ("short_description",),
            "mention in string literal": ("string_literal_expr",),
        }

        all_mention_relations = set()
        for relation_type in mention_properties:
            all_mention_relations.update(self.graph_data.get_relations(start_id=None, relation_type=relation_type,
                                                                       end_id=node_info.node_id))

        all_mention_info = []
        for start_id, relation_type, _ in all_mention_relations:
            property_names = mention_properties.get(relation_type)
            if property_names is None:
                continue
            start_properties = self.graph_data_reader.get_node_info(start_id).properties
            mention = next((start_properties[name] for name in property_names if name in start_properties), "")
            if mention is None:
                # a None text would break the join below
                continue
            if isinstance(mention, list):
                all_mention_info.extend(mention)
            else:
                all_mention_info.append(mention)

        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="aliases",
                                             value="\n".join(node_info.get_all_names()))
        self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="description",
                                             value=node_info.properties.get("descriptions_en", "") + " ".join(
                                                 all_mention_info))
        self.add_text_for_out_relation(node_info.node_id)
        self.add_text_for_in_relation(node_info.node_id)

    def clear(self):
        """
        drop all built docs by starting over with an empty collection
        :return:
        """
        empty_collection = MultiFieldDocumentCollection()
        self.doc_collection = empty_collection

    def build_doc(self):
        """
        rebuild self.doc_collection from scratch, one doc per graph node

        Every node gets a doc named after its main name; a "code" property, when present,
        becomes the "doc" field. The remaining fields are filled by the build_doc_for_*
        method that matches the kind of the node. A node that fails is reported and skipped.
        :return:
        """
        self.clear()
        print("start building doc")
        for node_id in self.graph_data.get_node_ids():
            node_info = self.graph_data_reader.get_node_info(node_id)
            try:
                if node_info is None:
                    continue

                # nodes without a main name are kept; their doc name is then None
                doc = MultiFieldDocument(id=node_info.node_id, name=node_info.get_main_name())
                self.doc_collection.add_document(doc)
                if "code" in node_info.properties:
                    self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="doc",
                                                         value=node_info.properties["code"])

                if isinstance(node_info, CodeElementNodeInfo):
                    self.build_doc_for_code_element(node_info)
                elif isinstance(node_info, DomainEntityNodeInfo):
                    self.build_doc_for_domain_entity(node_info)
                elif isinstance(node_info, OperationEntityNodeInfo):
                    self.build_doc_for_operation_entity(node_info)
                elif isinstance(node_info, WikidataEntityNodeInfo):
                    self.build_doc_for_wikidata_entity(node_info)
                else:
                    self.build_doc_for_sentence(node_info)

            except Exception:
                # best effort: one broken node must not stop the build, but
                # KeyboardInterrupt / SystemExit still end a long run
                traceback.print_exc()
                print("build doc error %r" % (node_info,))

        print("end building doc")

    def build_doc_with_pure_code(self):
        """
        add one doc for every named graph node that has a "code" property

        The doc gets a single "doc" field holding the unprocessed code. The docs are added
        to the current self.doc_collection, which is not cleared first. A node that fails
        is reported and skipped.
        :return:
        """
        print("start building doc")
        for node_id in self.graph_data.get_node_ids():
            node_info = self.graph_data_reader.get_node_info(node_id)
            try:
                if node_info is None:
                    continue
                main_name = node_info.get_main_name()
                if main_name is None:
                    continue
                if "code" not in node_info.properties:
                    continue

                self.doc_collection.add_document(MultiFieldDocument(id=node_info.node_id, name=main_name))
                self.doc_collection.add_field_to_doc(doc_id=node_info.node_id, field_name="doc",
                                                     value=node_info.properties["code"])
            except Exception:
                # best effort: one broken node must not stop the build, but
                # KeyboardInterrupt / SystemExit still end a long run
                traceback.print_exc()
                print("build doc error %r" % (node_info,))
        print("end building doc")

    def extract_kg_doc_collection_with_method(self, output_path, **config):
        """
        derive a filtered doc collection from self.doc_collection and save it

        Every node that has a doc gets a derived doc with a single "doc" field. The text
        comes from add_description when the node passes the filters, otherwise it is empty.
        :param output_path: where the derived collection is saved
        :param config: optional switches, each False when absent:
            no_out_relation / no_in_relation - handed on to add_description
            no_jdk - only describe nodes without the "jdk8" label
            with_method - only describe nodes with the "method" label
        :return:
        """
        print("start building doc")
        no_out_relation = config.get("no_out_relation", False)
        no_in_relation = config.get("no_in_relation", False)
        no_jdk = config.get("no_jdk", False)
        method = config.get("with_method", False)
        print("*" * 20)
        print("no_in_relation %r, no_out_relation %r, no_jdk %r, with_method %r" % (
            no_in_relation, no_out_relation, no_jdk, method))

        kg_collection = MultiFieldDocumentCollection()
        reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())

        for node_id in self.graph_data.get_node_ids():
            node_info = reader.get_node_info(node_id)
            source_doc = self.doc_collection.get_by_id(node_id)
            if source_doc is None:
                continue
            kg_doc = MultiFieldDocument(id=source_doc.id, name=source_doc.name)

            # a switch that is off lets every node pass; labels are only read for active switches
            if (not method or "method" in node_info.labels) and (
                    not no_jdk or "jdk8" not in node_info.labels):
                parts = self.add_description(node_info, source_doc, no_out_relation, no_in_relation)
            else:
                parts = []

            text = "\n".join(part for part in parts if part)
            kg_doc.add_field("doc", text)
            # print a sample of the generated text now and then
            if node_id % 2000 == 0:
                print("doc:%r" % text)
            kg_collection.add_document(kg_doc)
        kg_collection.save(output_path)

    def add_description(self, node_info, doc, no_out_relation, no_in_relation=False):
        """
        Return the description texts for a node of a supported entity type.

        :param node_info: node info object; only instances of CodeElementNodeInfo,
            DomainEntityNodeInfo, OperationEntityNodeInfo or WikidataEntityNodeInfo
            get a description
        :param doc: document of the node, forwarded to ``add_one_description``
        :param no_out_relation: forwarded to ``add_one_description``
        :param no_in_relation: forwarded to ``add_one_description``
        :return: the list built by ``add_one_description``, or an empty list for
            any other kind of node
        """
        described_types = (CodeElementNodeInfo, DomainEntityNodeInfo,
                           OperationEntityNodeInfo, WikidataEntityNodeInfo)
        if isinstance(node_info, described_types):
            return self.add_one_description(node_info, doc, no_out_relation, no_in_relation)
        return []

    def add_one_description(self, node_info, doc, no_out_relation, no_in_relation):
        """
        Collect the raw description texts of one node.

        The result starts with the node's main name, followed by the document
        texts of the "aliases" and "description" fields, then "out_relations"
        and "in_relations" unless the matching flag suppresses them. Values are
        returned exactly as ``doc.get_doc_text_by_field`` yields them; no
        filtering is done here.

        :param node_info: object providing ``get_main_name()``
        :param doc: object providing ``get_doc_text_by_field(field_name)``
        :param no_out_relation: when truthy, leave out the "out_relations" text
        :param no_in_relation: when truthy, leave out the "in_relations" text
        :return: list of texts in the order described above
        """
        # NOTE: the "declare" and "comment" fields are currently not included.
        field_names = ["aliases", "description"]
        if not no_out_relation:
            field_names.append("out_relations")
        if not no_in_relation:
            field_names.append("in_relations")

        texts = [node_info.get_main_name()]
        for field_name in field_names:
            texts.append(doc.get_doc_text_by_field(field_name))
        return texts

    def extract_kg_doc_collection(self, output_path):
        """
        Extract the necessary text fields of each document into a new collection and save it.

        For every node id of ``self.graph_data`` that has a document in
        ``self.doc_collection``, a new document with the same id and name is
        created. For CodeElementNodeInfo, DomainEntityNodeInfo,
        OperationEntityNodeInfo and WikidataEntityNodeInfo nodes its "doc" field
        is the newline-joined, non-empty values of: main name, "aliases",
        "out_relations" and "description". Any other node gets an empty "doc"
        field. Node ids without a document are skipped.

        :param output_path: passed to ``save`` of the new collection
        :return: None
        """
        # One combined type check: the fields are appended at most once per node.
        described_types = (CodeElementNodeInfo, DomainEntityNodeInfo,
                           OperationEntityNodeInfo, WikidataEntityNodeInfo)
        sub_doc_collection = MultiFieldDocumentCollection()
        graph_data_reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())
        for node_id in self.graph_data.get_node_ids():
            node_info = graph_data_reader.get_node_info(node_id)
            doc = self.doc_collection.get_by_id(node_id)
            if doc is None:
                # No source document for this node id, so there is nothing to extract.
                continue

            new_doc = MultiFieldDocument(id=doc.id, name=doc.name)

            descriptions = []
            if isinstance(node_info, described_types):
                descriptions.append(node_info.get_main_name())
                for field_name in ("aliases", "out_relations", "description"):
                    descriptions.append(doc.get_doc_text_by_field(field_name))

            description = "\n".join([text for text in descriptions if text])
            new_doc.add_field("doc", description)
            if node_id % 2000 == 0:
                # Periodic sample of the generated text as a progress indicator.
                print("doc:%r" % description)
            sub_doc_collection.add_document(new_doc)

        sub_doc_collection.save(output_path)

    def build_doc_for_kg(self, output_path=None):
        """
        Build the doc for kg, only including aliases, out relations and description.

        Calls ``self.clear()`` and ``self.build_doc()`` first, then creates one
        new document per node id of ``self.graph_data`` that has a document in
        ``self.doc_collection``. For CodeElementNodeInfo, DomainEntityNodeInfo,
        OperationEntityNodeInfo and WikidataEntityNodeInfo nodes the "doc" field
        is the newline-joined, non-empty values of: main name, "aliases",
        "out_relations" and "description". For every other node it is built from
        the "short_description_sentences" and "out_relations" fields. Node ids
        without a document are skipped.

        :param output_path: when not None, the new collection is saved there
        :return: the newly built collection
        """
        self.clear()
        self.build_doc()
        described_types = (CodeElementNodeInfo, DomainEntityNodeInfo,
                           OperationEntityNodeInfo, WikidataEntityNodeInfo)
        sub_doc_collection = MultiFieldDocumentCollection()
        graph_data_reader = GraphDataReader(graph_data=self.graph_data, node_info_factory=ProjectKGNodeInfoFactory())
        for node_id in self.graph_data.get_node_ids():
            node_info = graph_data_reader.get_node_info(node_id)
            doc = self.doc_collection.get_by_id(node_id)
            if doc is None:
                # No source document for this node id, so there is nothing to extract.
                continue

            new_doc = MultiFieldDocument(id=doc.id, name=doc.name)

            descriptions = []
            if isinstance(node_info, described_types):
                descriptions.append(node_info.get_main_name())
                for field_name in ("aliases", "out_relations", "description"):
                    descriptions.append(doc.get_doc_text_by_field(field_name))
            else:
                # Fallback for all remaining node kinds.
                descriptions.append(doc.get_doc_text_by_field("short_description_sentences"))
                descriptions.append(doc.get_doc_text_by_field("out_relations"))

            description = "\n".join([text for text in descriptions if text])
            new_doc.add_field("doc", description)
            if node_id % 2000 == 0:
                # Periodic sample of the generated text as a progress indicator.
                print("doc:%r" % description)
            sub_doc_collection.add_document(new_doc)
        if output_path is not None:
            sub_doc_collection.save(output_path)
        print("collection len{}".format(sub_doc_collection.get_num()))
        return sub_doc_collection

    def build_doc_for_sentence(self, node_info):
        """
        Fill the document fields of a sentence node.

        Stores ``node_info.properties["sentence_name"]`` in the
        "short_description_sentences" field of the node's document, then calls
        ``add_text_for_out_relation`` and ``add_text_for_in_relation`` for the node.

        :param node_info: node info with ``node_id`` and a ``properties`` mapping
            that contains "sentence_name"
        :return: None
        """
        node_id = node_info.node_id
        sentence_text = node_info.properties["sentence_name"]
        self.doc_collection.add_field_to_doc(
            doc_id=node_id,
            field_name="short_description_sentences",
            value=sentence_text)
        self.add_text_for_out_relation(node_id)
        self.add_text_for_in_relation(node_id)

    def build_doc_for_operation_entity(self, node_info):
        """
        Fill the document fields of an operation entity node.

        Stores the newline-joined result of ``node_info.get_all_names()`` in the
        "aliases" field of the node's document, then calls
        ``add_text_for_out_relation`` and ``add_text_for_in_relation`` for the node.

        :param node_info: node info with ``node_id`` and ``get_all_names()``
        :return: None
        """
        node_id = node_info.node_id
        alias_text = "\n".join(node_info.get_all_names())
        self.doc_collection.add_field_to_doc(
            doc_id=node_id,
            field_name="aliases",
            value=alias_text)
        self.add_text_for_out_relation(node_id)
        self.add_text_for_in_relation(node_id)

    def build_doc_for_wikidata_entity(self, node_info):
        """
        Fill the document fields of a wikidata entity node.

        Stores the newline-joined result of ``node_info.get_all_names()`` in the
        "aliases" field, calls ``add_text_for_out_relation`` and
        ``add_text_for_in_relation`` for the node, and finally stores the
        "descriptions_en" property (empty string when missing) in the
        "description" field.

        :param node_info: node info with ``node_id``, ``get_all_names()`` and a
            ``properties`` mapping
        :return: None
        """
        node_id = node_info.node_id
        alias_text = "\n".join(node_info.get_all_names())
        self.doc_collection.add_field_to_doc(
            doc_id=node_id,
            field_name="aliases",
            value=alias_text)
        self.add_text_for_out_relation(node_id)
        self.add_text_for_in_relation(node_id)
        # The description is read only after the relation texts have been added.
        english_description = node_info.properties.get("descriptions_en", "")
        self.doc_collection.add_field_to_doc(
            doc_id=node_id,
            field_name="description",
            value=english_description)
Пример #20
0
 def clear(self):
     """Replace ``self.doc_collection`` with a newly constructed MultiFieldDocumentCollection."""
     self.doc_collection = MultiFieldDocumentCollection()