    result.path.append(p)

  # Add documents from topic model
  print("\t- Topics per-document")
  for key, bow, words in zip(sentence_ids, int_corpus, text_corpus):
    doc = result.documents.add()
    doc.key = key
    for word in words:
      doc.terms.append(word)
    for topic_idx, weight in topic_model[bow]:
      topic_weight = doc.topic_weights.add()
      topic_weight.topic = topic_idx
      topic_weight.weight = weight

  # Add topics from topic model
  print("\t- Words per-topic")
  for topic_idx in range(topic_model.num_topics):
    topic = result.topics.add()
    topic.index = topic_idx
    for word_index, weight in topic_model.get_topic_terms(
        topic_idx,
        config.topic_model.truncate_size,
    ):
      term_weight = topic.term_weights.add()
      term_weight.term = topic_model.id2word[word_index]
      term_weight.weight = weight

  with open(result_path, "wb") as proto_file:
    proto_file.write(result.SerializeToString())
  print("Wrote result to", result_path)
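
# --- Hedged companion sketch (not part of the original file) ---
# A minimal helper for reading a serialized result back for later analysis.
# It assumes only the standard protobuf Python API (ParseFromString) and the
# same res_pb result module used below; the helper name is hypothetical.
def load_topic_query_result(result_path: str) -> res_pb.TopicQueryResult:
  # Deserialize a TopicQueryResult proto written via SerializeToString above.
  result = res_pb.TopicQueryResult()
  with open(result_path, "rb") as proto_file:
    result.ParseFromString(proto_file.read())
  return result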
def compute_topic_model(
    config: conf_pb.TopicQueryConfig,
    result: res_pb.TopicQueryResult,
) -> None:
  # Setup the database indices
  graph_db = Sqlite3Graph(config.graph_db)
  assert config.source in graph_db, "Failed to find source in graph_db."
  assert config.target in graph_db, "Failed to find target in graph_db."

  # Preload the graph
  if config.preload_graph_db:
    print("Loading the graph in memory")
    graph_db.preload()

  # Get Path
  print("Finding shortest path")
  path, cached_graph = path_util.get_shortest_path(
      graph_index=graph_db,
      source=config.source,
      target=config.target,
      max_degree=config.max_degree,
  )
  if path is None:
    raise ValueError(
        f"Path is disconnected, {config.source}, {config.target}")
  for p in path:
    result.path.append(p)
    print("\t-", p)

  print("Collecting Nearby Sentences")
  sentence_ids = set()
  for path_node in path:
    print("\t-", path_node)
    # Each node along the path is allowed to add some sentences
    sentence_ids.update(
        path_util.get_nearby_nodes(
            graph_index=graph_db,
            source=path_node,
            key_type=entity_types.SENTENCE_TYPE,
            max_result_size=config.max_sentences_per_path_elem,
            max_degree=config.max_degree,
            cached_graph=cached_graph,
        ))
  sentence_ids = list(sentence_ids)

  print("Downloading Sentence Text for all", len(sentence_ids), "sentences")
  bow_db = Sqlite3Bow(config.bow_db)
  text_corpus = [bow_db[s] for s in sentence_ids if s in bow_db]

  print("Pruning low-support words")
  min_support = config.topic_model.min_support_count
  term2doc_freq = bow_util.get_document_frequencies(text_corpus)
  stopwords_under = {t for t, c in term2doc_freq.items() if c < min_support}
  print(
      f"\t- {len(stopwords_under)} words occur less than {min_support} times"
  )
  sentence_ids, text_corpus = bow_util.filter_words(
      keys=sentence_ids,
      text_corpus=text_corpus,
      stopwords=stopwords_under,
  )
  print(f"\t- Reduced to {len(text_corpus)} documents")
  assert len(sentence_ids) == len(text_corpus)

  print("Computing topics")
  dictionary = Dictionary(text_corpus)
  int_corpus = [dictionary.doc2bow(t) for t in text_corpus]
  topic_model = LdaMulticore(
      corpus=int_corpus,
      id2word=dictionary,
      num_topics=config.topic_model.num_topics,
      random_state=config.topic_model.random_seed,
      iterations=config.topic_model.iterations,
  )

  # Add documents from topic model
  print("\t- Topics per-document")
  for doc_id, bow in zip(sentence_ids, int_corpus):
    doc = result.documents.add()
    doc.doc_id = doc_id
    for topic_idx, weight in topic_model[bow]:
      doc.topic2weight[topic_idx] = weight

  # Add topics from topic model
  print("\t- Words per-topic")
  for topic_idx in range(topic_model.num_topics):
    topic = result.topics.add()
    for word_idx, weight in topic_model.get_topic_terms(
        topic_idx,
        config.topic_model.truncate_size,
    ):
      term = topic_model.id2word[word_idx]
      topic.term2weight[term] = weight

  print("\t- Adding Topical Network")
  aux_result_data.add_topical_network(
      result=result,
      topic_model=topic_model,
      dictionary=dictionary,
      graph_db=graph_db,
      bow_db=bow_db,
  )
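
# --- Hedged usage sketch (not part of the original file) ---
# Drives compute_topic_model end-to-end. Every literal below (paths, node
# keys, topic-model settings) is a hypothetical placeholder; only config
# fields actually read by the function above are populated.
if __name__ == "__main__":
  config = conf_pb.TopicQueryConfig()
  config.graph_db = "/path/to/graph.sqlite3"   # hypothetical database paths
  config.bow_db = "/path/to/bow.sqlite3"
  config.source = "example_source_node"        # hypothetical graph node keys
  config.target = "example_target_node"
  config.preload_graph_db = False
  config.max_degree = 1000
  config.max_sentences_per_path_elem = 2000
  config.topic_model.min_support_count = 2
  config.topic_model.num_topics = 20
  config.topic_model.random_seed = 42
  config.topic_model.iterations = 50
  config.topic_model.truncate_size = 250

  result = res_pb.TopicQueryResult()
  compute_topic_model(config, result)
  print("Computed", len(result.topics), "topics over",
        len(result.documents), "documents")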