import heapq
import logging
from operator import itemgetter

import numpy as np

# whitespace_tokenize, print_dominant_query_topics, sparse_to_dense, cosine,
# and MetadataType are assumed to be defined elsewhere in this package.


def get_dominant_query_topics(query_text, lda_dictionary, lda_mdl, TOP_K_TOPICS=5):
    '''Tokenizes the input query and finds the top-K dominant query topics
    from an LDA model

    Returns:
        dominant_topics - a list of (topic_id, topic_prob) tuples

    Arguments:
        query_text - the query in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        TOP_K_TOPICS - the number of dominant topics to return (default: 5)
    '''

    # process the query
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(query_text))
    if len(query_vec) == 0:
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return []
    else:
        logging.info('%d query words are in the dictionary.', len(query_vec))

    query_td = lda_mdl[query_vec]
    print 'Query TF:', [(w_id, lda_dictionary[w_id], count) for (w_id, count) in query_vec]
    print_dominant_query_topics(query_td, lda_mdl, TOP_K_TOPICS)

    # pick the K highest-probability topics from the query's topic distribution
    dominant_topics = heapq.nlargest(TOP_K_TOPICS, dict(query_td).items(),
                                     key=itemgetter(1))

    return dominant_topics  # (topic_id, topic_prob)

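# A minimal usage sketch for get_dominant_query_topics, assuming a gensim-style
# Dictionary and LdaModel trained on the fly; raw_docs, the topic count, and the
# query string are hypothetical placeholders.
def _example_dominant_topics(raw_docs):
    from gensim import corpora, models
    texts = [whitespace_tokenize(d) for d in raw_docs]
    lda_dictionary = corpora.Dictionary(texts)
    corpus = [lda_dictionary.doc2bow(t) for t in texts]
    lda_mdl = models.LdaModel(corpus, id2word=lda_dictionary, num_topics=50)
    for topic_id, topic_prob in get_dominant_query_topics('energy trading contracts',
                                                          lda_dictionary, lda_mdl):
        print topic_id, topic_prob, lda_mdl.show_topic(topic_id, topn=5)
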
def get_lda_query_td(doc_text, lda_dictionary, lda_mdl):
    '''Tokenizes the input document and returns its topic distribution
    using the learned LDA model

    Returns:
        query_td - the topic distribution as a list of (topic_id, topic_prob) tuples

    Arguments:
        doc_text - the document in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
    '''

    # process the query
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    if len(query_vec) == 0:
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return []
    else:
        logging.info('%d query words are in the dictionary.', len(query_vec))

    query_td = lda_mdl[query_vec]

    return query_td

def get_lda_query_td2(doc_text, lda_dictionary, lda_beta):
    '''Tokenizes the input document and returns its topic distribution
    computed directly from the LDA model's beta (topic-word) matrix

    Returns:
        query_td2 - the topic distribution as a list of (topic_id, topic_prob) tuples

    Arguments:
        doc_text - the document in text format
        lda_dictionary - the dictionary object
        lda_beta - the LDA model's beta matrix (num_topics x vocabulary_size)
    '''

    # process the query
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    if len(query_vec) == 0:
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return []
    else:
        logging.info('%d query words are in the dictionary.', len(query_vec))

    # sum the beta columns of the query terms, weighted by their counts
    query_term_theta2 = np.array([lda_beta[:, vocab_id] * count
                                  for (vocab_id, count) in query_vec]).sum(axis=0)
    query_term_theta2 /= query_term_theta2.sum()  # normalize to a probability distribution
    query_td2 = [(idx, val) for idx, val in enumerate(query_term_theta2)]

    return query_td2

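# A minimal sketch contrasting the two query-topic-distribution helpers above,
# assuming a trained gensim LdaModel; pulling beta via get_topics() is an
# assumption about the model API (recent gensim versions expose the normalized
# topic-word matrix that way), and the query string is a placeholder.
def _example_query_td(lda_dictionary, lda_mdl):
    query = 'pipeline capacity agreement'
    td_inferred = get_lda_query_td(query, lda_dictionary, lda_mdl)      # posterior inference
    lda_beta = lda_mdl.get_topics()                                     # num_topics x vocab_size
    td_from_beta = get_lda_query_td2(query, lda_dictionary, lda_beta)   # count-weighted beta mix
    print 'Inferred: ', td_inferred
    print 'From beta:', [(k, round(v, 4)) for (k, v) in td_from_beta if v > 0.01]
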
def compute_topic_similarities(doc_text, src_docs, lda_dictionary, lda_mdl, lda_num_topics):
    '''Tokenizes the given document and computes its similarity to each of the
    listed documents, based on topic modeling and cosine similarity

    Returns:
        dest_docs - the source document records, each with its cosine score appended

    Arguments:
        doc_text - the document in text format
        src_docs - the list of source document records; each record is a list
                   whose element at index 5 is the document text and whose
                   last element is the user rating
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        lda_num_topics - the number of topics in the LDA model
    '''

    # process the query
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    query_td = lda_mdl[query_vec]
    qtd_vec = sparse_to_dense(lda_num_topics, query_td)

    print 'Query:', ' '.join(whitespace_tokenize(doc_text))
    print 'Number of vocabulary tokens:', len(query_vec)
    print 'Query vector:', query_vec
    print 'Query td:', query_td
    print 'doc_name, cosine, rating, doc_td'

    dest_docs = []
    for sdoc in src_docs:
        sdoc_text = sdoc[5]
        sdoc_vec = lda_dictionary.doc2bow(whitespace_tokenize(sdoc_text))
        sdoc_td = lda_mdl[sdoc_vec]
        std_vec = sparse_to_dense(lda_num_topics, sdoc_td)
        cosine_dist = cosine(qtd_vec, std_vec)  # ranges from -1 to 1
        sdoc.append(cosine_dist)  # append the cosine score to the end
        print sdoc[1], cosine_dist, sdoc[-2], sdoc_td  # file_id, cosine, user rating, doc topic distribution
        dest_docs.append(sdoc)

    return dest_docs

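# A minimal sketch of calling compute_topic_similarities; the record layout
# below (text at index 5, rating last) mirrors the indexing the function body
# assumes, and every field value is hypothetical.
def _example_topic_similarities(lda_dictionary, lda_mdl, lda_num_topics):
    src_docs = [
        # [doc_id, file_id, dir_path, file_name, subject, text, rating]
        [0, 'f-001', '/data/mail/1', '1.txt', 'gas daily', 'gas price schedule attached', 3],
        [1, 'f-002', '/data/mail/2', '2.txt', 'lunch', 'lunch at noon tomorrow', 1],
    ]
    ranked = compute_topic_similarities('gas price forecast', src_docs,
                                        lda_dictionary, lda_mdl, lda_num_topics)
    for rec in ranked:
        print rec[1], rec[-1]  # file_id and its appended cosine score
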
def search_lsi_model(query, dictionary, lsi, index, files_info, limit=5):
    '''Tokenizes the input query and finds topically similar (responsive)
    documents using LSI-based document search

    Returns:
        responsive_docs - a list of responsive document details, i.e.,
                          [doc_id, doc_dir_path, doc_name, score]

    Arguments:
        query - the query in text format
        dictionary - the dictionary object
        lsi - the LSI model object
        index - the index object
        files_info - the list of file details
        limit - the limit on the number of responsive records (default: 5)
    '''

    # process the query
    query_vec = dictionary.doc2bow(whitespace_tokenize(query))
    if len(query_vec) == 0:
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return []
    else:
        logging.info('%d query words are in the dictionary.', len(query_vec))

    query_td = lsi[query_vec]
    # print 'Query vector :', [(w_id, dictionary[w_id], count) for (w_id, count) in query_vec]
    # print 'Query distribution:', query_td

    # querying based on cosine similarity
    sims = index[query_td]  # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # identifies the responsive documents
    responsive_docs_idx = sims[0:limit]
    responsive_docs = []
    for (doc_id, score) in responsive_docs_idx:
        doc = list(files_info[doc_id])  # i.e., [doc_id, doc_dir_path, doc_name]
        doc.append(score)
        responsive_docs.append(doc)

    return responsive_docs

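# A minimal end-to-end sketch for search_lsi_model, assuming a gensim LsiModel
# and a MatrixSimilarity index built over the same corpus; files_info entries
# are hypothetical [doc_id, doc_dir_path, doc_name] triples, and 200 latent
# dimensions is an arbitrary choice.
def _example_lsi_search(dictionary, corpus, files_info):
    from gensim import models, similarities
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)
    index = similarities.MatrixSimilarity(lsi[corpus])
    hits = search_lsi_model('forward curve positions', dictionary, lsi, index,
                            files_info, limit=5)
    for (doc_id, doc_dir_path, doc_name, score) in hits:
        print doc_id, doc_name, score
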
def get_lda_topic_dist(docs, lda_dictionary, lda_mdl, num_topics):
    '''Computes the document-topic matrix (theta) for a list of documents
    using the learned LDA model

    Returns:
        theta_matrix - a (num_docs x num_topics) numpy array of topic proportions

    Arguments:
        docs - the list of documents in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        num_topics - the number of topics in the LDA model
    '''

    doc_tds = []
    for doc in docs:
        doc_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc))
        doc_tds.append(lda_mdl[doc_vec])

    num_docs = len(doc_tds)
    theta_matrix = np.zeros((num_docs, num_topics))
    for count, doc in enumerate(doc_tds):
        doc = dict(doc)
        theta_matrix[count, doc.keys()] = doc.values()  # fill the row's nonzero topics

    return theta_matrix

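# A minimal sketch showing the shape of the theta matrix returned by
# get_lda_topic_dist; the documents are hypothetical. Note that rows sum to
# slightly less than 1 when the model truncates low-probability topics.
def _example_theta_matrix(lda_dictionary, lda_mdl, num_topics):
    docs = ['power outage schedule', 'trading desk headcount', 'gas storage report']
    theta = get_lda_topic_dist(docs, lda_dictionary, lda_mdl, num_topics)
    print theta.shape          # (3, num_topics)
    print theta.sum(axis=1)    # each row is (approximately) a probability distribution
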
def process_index_doc(doc):
    """Processes a single email file

    Arguments:
        doc - a Document in the Lucene index
    """
    tokens = []
    if doc is not None:
        all_text = doc["all"]
        file_path = doc["file_path"]
        if all_text is None:
            logging.error("%s does not have any contents.", file_path)
            tokens = []
        else:
            tokens = whitespace_tokenize(all_text)  # regex_tokenizer(all_text)
    return tokens

def _process_doc(doc):
    '''Processes a single email file

    Arguments:
        doc - a Document in the Lucene index
    '''
    tokens = []
    if doc is not None:
        all_text = doc.get(MetadataType.ALL)
        file_path = doc.get(MetadataType.FILE_PATH)
        if all_text is None:
            # file_name = doc.get(MetadataType.FILE_NAME)
            logging.error('%s does not have any contents.', file_path)
            tokens = []
        else:
            tokens = whitespace_tokenize(all_text)  # regex_tokenizer(all_text)
    return tokens

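# A minimal sketch of how the two document processors above would feed corpus
# construction; iterating over index documents this way is an assumption about
# the surrounding indexing code, not an API shown in this module.
def _example_build_corpus(index_docs, lda_dictionary):
    corpus = []
    for doc in index_docs:
        tokens = process_index_doc(doc)  # or _process_doc(doc) for MetadataType-keyed docs
        if tokens:
            corpus.append(lda_dictionary.doc2bow(tokens))
    return corpus
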
def search_lda_model(query_text, lda_dictionary, lda_mdl, lda_index, lda_file_path_index, limit):
    '''Tokenizes the input query and finds topically similar (responsive)
    documents using LDA-based document search

    Returns:
        responsive_docs - a list of responsive document details, i.e.,
                          [doc_id, doc_dir_path, doc_name, score]

    Arguments:
        query_text - the query in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        lda_index - the index object
        lda_file_path_index - the list of file details
        limit - the limit on the number of responsive records
    '''

    # process the query
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(query_text))
    if len(query_vec) == 0:
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return []
    else:
        logging.info('%d query words are in the dictionary.', len(query_vec))

    query_td = lda_mdl[query_vec]

    # querying based on cosine similarity
    sims = lda_index[query_td]  # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # identifies the responsive documents
    responsive_docs_idx = sims[0:limit]
    responsive_docs = []
    for (doc_id, score) in responsive_docs_idx:
        doc = list(lda_file_path_index[doc_id])  # i.e., [doc_id, doc_dir_path, doc_name]
        doc.append(score)
        responsive_docs.append(doc)

    return responsive_docs

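# A minimal sketch for search_lda_model, mirroring the LSI example above but
# with an LdaModel; building the similarity index over lda_mdl[corpus] is one
# reasonable setup, not necessarily how the surrounding pipeline does it, and
# the topic count and query string are placeholders.
def _example_lda_search(lda_dictionary, corpus, lda_file_path_index):
    from gensim import models, similarities
    lda_mdl = models.LdaModel(corpus, id2word=lda_dictionary, num_topics=50)
    lda_index = similarities.MatrixSimilarity(lda_mdl[corpus], num_features=50)
    hits = search_lda_model('document retention policy', lda_dictionary, lda_mdl,
                            lda_index, lda_file_path_index, limit=10)
    for (doc_id, doc_dir_path, doc_name, score) in hits:
        print doc_id, doc_name, score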