Example #1

import traceback

# NOTE: project-specific dependencies (EngineFactory, GraphClient, DefaultGraphAccessor,
# EntityVectorComputeModel, URLUtil, and the ORM models referenced below) are assumed
# to be importable from the surrounding project; their import paths are not shown here.


class EntityVectorGenerator:
    def __init__(self):
        self.session = None
        self.graphClient = None
        self.entity_vector_model = None

    def init(self, path="word2vec_api_software_wiki.txt", binary=True):
        self.session = EngineFactory.create_session()
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))

        self.entity_vector_model = EntityVectorComputeModel()
        self.entity_vector_model.init_word2vec_model(path=path, binary=binary)
        print("init complete")

    def get_content_for_wikidata_node(self, node):
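        # Build a plain-text description of a wikidata node: prefer the linked
        # English Wikipedia article body; otherwise fall back to concatenating
        # the node's English labels, descriptions, and aliases.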
        content = ""
        node_dict = dict(node)
        if 'site:enwiki' in node_dict:
            title = URLUtil.parse_url_to_title(node["site:enwiki"])
            wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(
                self.session, title)
            if wikipedia_doc is not None:
                return wikipedia_doc.content
            else:
                content = content + " " + title

        property_list = ['labels_en', 'descriptions_en', 'aliases_en']
        for key in property_list:
            if key not in node_dict:
                continue
            if isinstance(node[key], list):
                content = content + " " + " ".join(node[key])
            else:
                content = content + " " + node[key]

        if content == "":
            return None
        print("content: ", content)
        return content

    def start_generate_wikipedia_vector(self,
                                        output_path="wikipedia.plain.txt"):

        label = "wikipedia"

        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)
        data_list = []
        for node in wiki_nodes:
            content = self.get_content_for_wikidata_node(node=node)
            if content is None:
                print("------None-----")
                continue
            item = {
                "id": "kg#" + str(self.graphClient.get_id_for_node(node)),
                "text": content
            }
            data_list.append(item)

        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_sentence_vector(self, output_path="sentence.plain.txt"):
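        # Train mean vectors over every valid sentence stored in the database.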
        session = self.session
        sentence_list = DocumentSentenceText.get_all_valid_sentences(session)
        data_list = []
        for each in sentence_list:
            if each.id is not None and each.text is not None:
                item = {"id": each.id, "text": each.text}
                data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_domain_entity_vector(
            self, output_path="domain_entity.plain.txt"):
        domain_entity_data = DomainEntity.get_all_domain_entities(self.session)

        data_list = []
        for each in domain_entity_data:
            if each.id is not None and each.description is not None:
                item = {
                    "id": each.id,
                    "text": each.name + " " + each.description
                }
                data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_api_entity_vector(self, output_path="api.plain.txt"):
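        # For every API entity, look up its detail-description HTML text, the parsed
        # document, and its first paragraph, then train mean vectors over
        # "simple name + qualified name + first paragraph".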
        api_id_list = APIEntity.get_api_id_and_qualified_name_list(
            self.session)

        if api_id_list is not None:
            data_list = []
            for each in api_id_list:
                api_id = each.id
                api_name = each.qualified_name
                try:
                    # take the simple name, e.g. "java.util.List.add(int)" -> "add"
                    api_name_simple_name = api_name.split("(")[0].split(".")[-1]
                except AttributeError:
                    # qualified_name may be None
                    api_name_simple_name = ""

                # api_clean_text_data = APIHTMLText.get_text_by_api_id_and_type(self.session, api_id,
                #                                                               APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION)

                api_html_text = APIHTMLText.get_html_text_id(
                    self.session, api_id,
                    APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION)
                if api_html_text is None:
                    continue
                document_text = DocumentText.get_by_html_text_id(
                    self.session, api_html_text.id)
                if document_text is None:
                    continue

                paragraph_text = DocumentParagraphText.get_first_by_doc_id(
                    self.session, document_text.id)
                if paragraph_text is None:
                    continue
                api_clean_text = paragraph_text.text
                final_text = api_name_simple_name + " " + api_name + " " + api_clean_text
                item = {"id": api_id, "text": final_text}
                data_list.append(item)
            self.entity_vector_model.train_mean_vector_from_corpus(
                data_set=data_list, output_path=output_path)

    def start_generate_paragraph_vector(
            self, output_path="mean_vector_api_paragraph.plain.txt"):
        paragraph_list = DocumentParagraphText.get_all_paragraph_text(
            session=self.session)
        text_data_set = []

        for paragraph in paragraph_list:
            text = paragraph.text
            if text is None or len(text.strip()) <= 2:
                continue
            text = text.strip()
            item = {"id": paragraph.id, "text": text}
            text_data_set.append(item)

        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=text_data_set, output_path=output_path)


class WikiAliasDBImporter:
    def __init__(self):
        self.graphClient = None
        self.session = None

    def init(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
        self.session = EngineFactory.create_session()
        print("init complete")

    def clean_table(self):
        WikipediaEntityName.delete_all(self.session)
        WikipediaEntityNameToWikipediaMapping.delete_all(self.session)

        print("delete all exist table")

    def start_import_wiki_aliases_to_db(self):
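        # For each "wikipedia" node in the graph, gather every known surface name
        # (name, enwiki page title, English labels and aliases) and store it as a
        # WikipediaEntityName row keyed by the node id.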
        label = "wikipedia"
        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)

        for node in wiki_nodes:
            node_id = self.graphClient.get_id_for_node(node)
            # print ('node_id: %r', node_id)
            # name, site_enwiki, labels_ = ''
            name_set = set([])
            if 'name' in dict(node):
                # print ("name: %r", node['name'])
                if isinstance(node['name'], list):
                    for each in node['name']:
                        name_set.add(each)
                else:
                    name_set.add(node['name'])
            if 'site:enwiki' in dict(node):
                # print ('site_enwiki: %s', node['site:enwiki'])
                if isinstance(node['site:enwiki'], list):
                    for each in node['site:enwiki']:
                        title = URLUtil.parse_url_to_title(each)
                        # print ('site_name: %r', title)
                        name_set.add(title)
                else:
                    title = URLUtil.parse_url_to_title(node["site:enwiki"])
                    # print ('site_name: %r', title)
                    name_set.add(title)
            if 'labels_en' in dict(node):
                # print( "labels_en: ", node['labels_en'])
                if isinstance(node['labels_en'], list):
                    for each in node['labels_en']:
                        name_set.add(each)
                else:
                    name_set.add(node['labels_en'])
            if 'aliases_en' in dict(node):
                # print("aliases_en: ", node['aliases_en'])
                for each in node['aliases_en']:
                    name_set.add(each)
            # print (name_set)
            for name in name_set:
                try:
                    wikipedia_entity_name = WikipediaEntityName(
                        node_id, str(name))
                    wikipedia_entity_name.find_or_create(self.session,
                                                         autocommit=True)
                except Exception:
                    traceback.print_exc()
            # self.session.commit()
        self.session.commit()

    def start_generate_wiki_entity_text_map(self):
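        # For every graph node that has imported names, resolve its enwiki page title
        # to a WikipediaDocument and record the kg_id -> wikipedia_id mapping.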
        wikipedia_entity_name_data = WikipediaEntityName.get_all_wikipedia_names(
            self.session)
        kg_id_list = set([])
        for each in wikipedia_entity_name_data:
            if each is not None:
                kg_id_list.add(each.kg_id)
        # print kg_id_list
        for kg_id in kg_id_list:
            node = self.graphClient.find_node_by_id(kg_id)
            if node is not None:
                if "site:enwiki" in dict(node):
                    title = URLUtil.parse_url_to_title(node["site:enwiki"])
                    wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(
                        self.session, title)
                    if wikipedia_doc is not None:
                        wikipedia_id = wikipedia_doc.id
                        wiki_name_to_wikipedia_mapping = WikipediaEntityNameToWikipediaMapping(
                            kg_id, wikipedia_id)
                        wiki_name_to_wikipedia_mapping.find_or_create(
                            self.session, autocommit=False)
        self.session.commit()

    def start_import(self):
        self.init()
        self.clean_table()
        self.start_import_wiki_aliases_to_db()
        self.start_generate_wiki_entity_text_map()
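

# The following driver is a minimal sketch and not part of the original source:
# it assumes the default word2vec model file, graph server number 4, and the
# relational schema used above are all reachable from this environment.
if __name__ == "__main__":
    generator = EntityVectorGenerator()
    generator.init(path="word2vec_api_software_wiki.txt", binary=True)
    generator.start_generate_wikipedia_vector(output_path="wikipedia.plain.txt")
    generator.start_generate_sentence_vector(output_path="sentence.plain.txt")
    generator.start_generate_api_entity_vector(output_path="api.plain.txt")

    importer = WikiAliasDBImporter()
    importer.start_import()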