import traceback

# NOTE: EngineFactory, GraphClient, DefaultGraphAccessor, EntityVectorComputeModel,
# URLUtil, and the ORM models (WikipediaDocument, DocumentSentenceText, DomainEntity,
# APIEntity, APIHTMLText, DocumentText, DocumentParagraphText, WikipediaEntityName,
# WikipediaEntityNameToWikipediaMapping) are project-level dependencies whose import
# paths are not shown in this excerpt; they are assumed to be importable here.


class EntityVectorGenerator:
    """Generates mean word2vec vectors for graph nodes, sentences, domain
    entities, API entities, and paragraphs, writing each corpus out as a
    plain text file via train_mean_vector_from_corpus."""

    def __init__(self):
        self.session = None
        self.graphClient = None
        self.entity_vector_model = None

    def init(self, path="word2vec_api_software_wiki.txt", binary=True):
        self.session = EngineFactory.create_session()
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
        self.entity_vector_model = EntityVectorComputeModel()
        self.entity_vector_model.init_word2vec_model(path=path, binary=binary)
        print("init complete")

    def get_content_for_wikidata_node(self, node):
        """Builds the text used to vectorize a wikidata node: the linked
        Wikipedia article if one exists, otherwise the page title plus the
        node's English labels, descriptions, and aliases."""
        content = ""
        node_dict = dict(node)
        if 'site:enwiki' in node_dict:
            title = URLUtil.parse_url_to_title(node["site:enwiki"])
            wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(
                self.session, title)
            if wikipedia_doc is not None:
                return wikipedia_doc.content
            else:
                content = content + " " + title
        property_list = ['labels_en', 'descriptions_en', 'aliases_en']
        for key in property_list:
            if key not in node_dict:
                continue
            if isinstance(node[key], list):
                content = content + " " + " ".join(node[key])
            else:
                content = content + " " + node[key]
        if content == '':
            return None
        print("content: ", content)
        return content

    def start_generate_wikipedia_vector(self, output_path="wikipedia.plain.txt"):
        label = "wikipedia"
        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)
        data_list = []
        for node in wiki_nodes:
            content = self.get_content_for_wikidata_node(node=node)
            if content is None:
                print("------None-----")
                continue
            item = {
                "id": "kg#" + str(self.graphClient.get_id_for_node(node)),
                "text": content
            }
            data_list.append(item)
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_sentence_vector(self, output_path="sentence.plain.txt"):
        sentence_list = DocumentSentenceText.get_all_valid_sentences(self.session)
        data_list = []
        for each in sentence_list:
            if each.id is not None and each.text is not None:
                data_list.append({"id": each.id, "text": each.text})
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_domain_entity_vector(self, output_path="domain_entity.plain.txt"):
        domain_entity_data = DomainEntity.get_all_domain_entities(self.session)
        data_list = []
        for each in domain_entity_data:
            if each.id is not None and each.description is not None:
                data_list.append({
                    "id": each.id,
                    "text": each.name + " " + each.description
                })
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_api_entity_vector(self, output_path="api.plain.txt"):
        """Vectorizes each API entity from its simple name, its qualified
        name, and the first paragraph of its detail-description document."""
        api_id_list = APIEntity.get_api_id_and_qualified_name_list(self.session)
        if api_id_list is None:
            return
        data_list = []
        for each in api_id_list:
            api_id = each.id
            api_name = each.qualified_name
            try:
                # e.g. "java.util.List.add(Object)" -> "add"
                api_name_simple_name = api_name.split("(")[0].split(".")[-1]
            except Exception:
                api_name_simple_name = ""
            api_html_text = APIHTMLText.get_html_text_id(
                self.session, api_id,
                APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION)
            if api_html_text is None:
                continue
            document_text = DocumentText.get_by_html_text_id(
                self.session, api_html_text.id)
            if document_text is None:
                continue
            paragraph_text = DocumentParagraphText.get_first_by_doc_id(
                self.session, document_text.id)
            if paragraph_text is None:
                continue
            api_clean_text = paragraph_text.text
            final_text = api_name_simple_name + " " + api_name + " " + api_clean_text
            data_list.append({"id": api_id, "text": final_text})
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=data_list, output_path=output_path)

    def start_generate_paragraph_vector(self, output_path="mean_vector_api_paragraph.plain.txt"):
        paragraph_list = DocumentParagraphText.get_all_paragraph_text(
            session=self.session)
        text_data_set = []
        for paragraph in paragraph_list:
            text = paragraph.text
            # skip empty or near-empty paragraphs
            if text is None or len(text.strip()) <= 2:
                continue
            text_data_set.append({"id": paragraph.id, "text": text.strip()})
        self.entity_vector_model.train_mean_vector_from_corpus(
            data_set=text_data_set, output_path=output_path)
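
# A minimal driver sketch for EntityVectorGenerator (not part of the original
# file): init() must run first so the word2vec model is loaded, after which
# each start_generate_* call independently builds its {"id", "text"} corpus
# and writes mean vectors to its default output file. The helper name and
# the decision to run all five corpora are assumptions for illustration.
def generate_all_entity_vectors():
    generator = EntityVectorGenerator()
    generator.init(path="word2vec_api_software_wiki.txt", binary=True)
    generator.start_generate_wikipedia_vector()
    generator.start_generate_sentence_vector()
    generator.start_generate_domain_entity_vector()
    generator.start_generate_api_entity_vector()
    generator.start_generate_paragraph_vector()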
class WikiAliasDBImporter:
    """Imports alias names of wikipedia-labelled graph nodes into the
    database and links each node to its Wikipedia document."""

    def __init__(self):
        self.graphClient = None
        self.session = None

    def init(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
        self.session = EngineFactory.create_session()
        print("init complete")

    def clean_table(self):
        WikipediaEntityName.delete_all(self.session)
        WikipediaEntityNameToWikipediaMapping.delete_all(self.session)
        print("deleted all existing rows")

    def start_import_wiki_aliases_to_db(self):
        label = "wikipedia"
        wiki_nodes = self.graphClient.get_all_nodes_by_label(label)
        for node in wiki_nodes:
            node_id = self.graphClient.get_id_for_node(node)
            node_dict = dict(node)
            # Collect every surface form of the node: its name(s), the
            # English Wikipedia page title(s), English labels, and aliases.
            name_set = set()
            if 'name' in node_dict:
                if isinstance(node['name'], list):
                    name_set.update(node['name'])
                else:
                    name_set.add(node['name'])
            if 'site:enwiki' in node_dict:
                if isinstance(node['site:enwiki'], list):
                    for each in node['site:enwiki']:
                        name_set.add(URLUtil.parse_url_to_title(each))
                else:
                    name_set.add(URLUtil.parse_url_to_title(node["site:enwiki"]))
            if 'labels_en' in node_dict:
                if isinstance(node['labels_en'], list):
                    name_set.update(node['labels_en'])
                else:
                    name_set.add(node['labels_en'])
            if 'aliases_en' in node_dict:
                name_set.update(node['aliases_en'])
            for name in name_set:
                try:
                    wikipedia_entity_name = WikipediaEntityName(node_id, str(name))
                    wikipedia_entity_name.find_or_create(self.session,
                                                         autocommit=True)
                except Exception:
                    traceback.print_exc()
        self.session.commit()

    def start_generate_wiki_entity_text_map(self):
        """Maps each imported kg node to the id of its Wikipedia document."""
        wikipedia_entity_name_data = WikipediaEntityName.get_all_wikipedia_names(
            self.session)
        kg_id_list = set()
        for each in wikipedia_entity_name_data:
            if each is not None:
                kg_id_list.add(each.kg_id)
        for kg_id in kg_id_list:
            node = self.graphClient.find_node_by_id(kg_id)
            if node is None or "site:enwiki" not in dict(node):
                continue
            title = URLUtil.parse_url_to_title(node["site:enwiki"])
            wikipedia_doc = WikipediaDocument.get_document_by_wikipedia_title(
                self.session, title)
            if wikipedia_doc is not None:
                wikipedia_id = wikipedia_doc.id
                wiki_name_to_wikipedia_mapping = WikipediaEntityNameToWikipediaMapping(
                    kg_id, wikipedia_id)
                wiki_name_to_wikipedia_mapping.find_or_create(self.session,
                                                              autocommit=False)
        self.session.commit()

    def start_import(self):
        self.init()
        self.clean_table()
        self.start_import_wiki_aliases_to_db()
        self.start_generate_wiki_entity_text_map()
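
# A minimal driver sketch for WikiAliasDBImporter (the guard below is an
# assumption, not part of the original file). start_import() already chains
# init -> clean_table -> alias import -> entity/text mapping, so a full
# rebuild is a single call; note that clean_table() wipes the existing alias
# tables first, so call the later steps individually to refresh the map only.
if __name__ == "__main__":
    importer = WikiAliasDBImporter()
    importer.start_import()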