def get_tm_classification_dataset(mdl_cfg_file, positive_dir):
    """Build a labelled dataset from a stored LDA document-topic matrix.

    Reads the model configuration, loads the theta matrix and the corpus
    path index named in it, and labels every indexed document as relevant
    when its directory is exactly ``positive_dir``.

    Args:
        mdl_cfg_file: path of the model configuration file. It must provide
            ``['LDA']['lda_theta_file']`` and
            ``['CORPUS']['path_index_file']``.
        positive_dir: directory of the relevant (positive) documents. It is
            compared by plain string equality with the ``root`` component of
            each path-index entry, so both must be spelled the same way.

    Returns:
        A tuple ``(class_ids, lda_theta, file_paths)`` where ``class_ids`` is
        a NumPy array with one label per theta row (``RELEVANT_CLASS_ID`` or
        ``IRRELEVANT_CLASS_ID``), ``lda_theta`` is the loaded matrix and
        ``file_paths`` lists ``os.path.join(root, file_name)`` for every
        path-index entry.
    """
    mdl_cfg = read_config(mdl_cfg_file)
    theta_path = mdl_cfg['LDA']['lda_theta_file']
    index_path = mdl_cfg['CORPUS']['path_index_file']
    lda_file_path_index = load_file_paths_index(index_path)

    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases.  ``ndmin=2`` keeps a one-document
    # matrix two-dimensional so the shape unpacking below cannot fail.
    lda_theta = np.loadtxt(theta_path, dtype=float, ndmin=2)
    num_docs, num_topics = lda_theta.shape
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('LDA Theta: Number of documents  %d  number of topics  %d'
          % (num_docs, num_topics))

    # NOTE(review): assumes the path index has one entry per theta row, in
    # the same order -- confirm against the code that writes both files.
    class_ids = np.zeros(num_docs)
    file_paths = []
    for i, (_, root, file_name) in enumerate(lda_file_path_index):
        if positive_dir == root:
            class_ids[i] = RELEVANT_CLASS_ID
        else:
            class_ids[i] = IRRELEVANT_CLASS_ID
        file_paths.append(os.path.join(root, file_name))

    return (class_ids, lda_theta, file_paths)
def eval_ranking_varying_topics(query_id, data_dir, keywords, limit=1000, img_extension='.eps'):
    """Compare LDA rankings across models trained with different topic counts.

    For every topic count in a fixed list, the configuration
    ``<data_dir>Q<query_id>-LW-<num_topics>T.cfg`` is read, the LDA
    parameters are loaded and four rankings are computed: keyword based and
    dominant-topic based, each with the query topic distribution estimated
    from the LDA model and from the LDA beta ("method-2").  The index search
    obtained through ``search_whoosh_index`` with the first configuration is
    added once as the 'Lucene' reference curve.  Two ROC plots are written:
    one for the topic-based and one for the keyword-based rankings.

    Args:
        query_id: integer query identifier; used in the truth directory
            name, the configuration file names and the output file names.
        data_dir: prefix of the truth directory and of the configuration
            files (concatenated as-is, so it should end with a separator).
        keywords: raw query text; it is tokenized and lemmatized here.
        limit: maximum number of documents requested from the LDA searches.
        img_extension: extension appended to the ROC image file names.

    Raises:
        SystemExit: when none of the query tokens occurs in the vocabulary
            of one of the loaded models.
    """
    tokens = ' '.join(lemmatize_tokens(regex_tokenizer(keywords)))  # Lemmatization
    lucene_query = 'all:(%s)' % tokens  # search in all fields

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Lucene query: %s' % (lucene_query,))
    print('TM query: %s' % (tokens,))

    truth_dir = "%s%d" % (data_dir, query_id)
    positive_dir = os.path.join(truth_dir, RELEVANT_DIR_NAME)  # TRUE positive documents

    topiclda_rocs_file_name = '%d-LW-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d (Topic-LDA): Varying # of LDA Topics and Lemmas' % query_id
    keywordlda_rocs_file_name = '%d-LW-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d (Keyword-LDA): Varying # of LDA Topics and Lemmas' % query_id

    separator = '------------------------------------------------------------------------------------------'
    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    # split() already removes surrounding whitespace, so no strip() is needed.
    query_terms = tokens.split()

    for idx, num_topics in enumerate(topics):
        print(separator)

        #---------------------------------------------- Reads the configuration file
        # configuration file, created using the SMARTeR GUI
        config_file = "%sQ%d-LW-%dT.cfg" % (data_dir, query_id, num_topics)
        mdl_cfg = read_config(config_file)

        # Loads the LDA model
        (lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
         lda_theta, lda_beta) = load_lda_parameters(mdl_cfg)

        #------------ Checks whether the keywords are there in the corpus dictionary
        # Build the vocabulary once per model instead of re-listing and
        # scanning dictionary.values() for every query token.
        vocabulary = set(lda_dictionary.values())
        valid_tokens = 0
        for token in query_terms:
            if token in vocabulary:
                valid_tokens += 1
            else:
                print('%s is not in the corpus vocabulary.' % (token,))
        if valid_tokens == 0:
            print('None of the tokens exist in the dictionary. Exiting topic search!')
            # Same effect as the site-provided exit() (exit status unchanged),
            # but does not rely on the ``site`` module being loaded.
            raise SystemExit()

        # Gets the query topic distribution from the LDA beta
        print('Estimated topic dist. from the LDA beta:')
        query_td2 = get_lda_query_td2(tokens, lda_dictionary, lda_beta)
        dominant_topics_idx2 = get_query_top_topic_idx(query_td2, lda_mdl, TOP_K_TOPICS)

        # Gets the query topic distribution from the LDA model
        print('Estimated topic dist. from the LDA model:')
        query_td = get_lda_query_td(tokens, lda_dictionary, lda_mdl)
        dominant_topics_idx = get_query_top_topic_idx(query_td, lda_mdl, TOP_K_TOPICS)

        #------------------------------------------------------------- Lucene search
        if idx == 0:  # the first Lucene ranking is added as a reference
            print('Lucene ranking')
            # lu_docs = search_li(lucene_query, limit, mdl_cfg)
            lu_docs = search_whoosh_index(lucene_query, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, truth_dir)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')

        #---------------------------------------------------------------- LDA search
        print('LDA (w/ keywords) ranking')
        lda_docs = search_tm2(query_td, lda_index, lda_file_path_index, limit)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)

        print('LDA (w/ keywords) method-2 ranking')
        lda_docs2 = search_tm2(query_td2, lda_index, lda_file_path_index, limit)
        lda_res2 = convert_to_roc_format(lda_docs2, positive_dir)

        print('LDA (w/ query topics) ranking')
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta)
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)

        print('LDA (w/ query topics) method-2 ranking')
        lda_tts_docs2 = search_tm_topics(dominant_topics_idx2, limit, lda_file_path_index, lda_theta)
        lda_tts_res2 = convert_to_roc_format(lda_tts_docs2, positive_dir)

        # Both plots share roc_labels, so every label is paired with one
        # entry in each of the two curve lists.
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)

        roc_topiclda_list.append(ROCData(lda_tts_res2))
        roc_keywordlda_list.append(ROCData(lda_res2))
        roc_labels.append('%d topics (method-2)' % num_topics)

    # NOTE(review): assumed to be the closing rule printed once after the
    # loop; confirm against the intended layout.
    print(separator)

    ## Plot ROC curves
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title,
                      labels=roc_labels, include_baseline=True,
                      file_name=topiclda_rocs_file_name)
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title,
                      labels=roc_labels, include_baseline=True,
                      file_name=keywordlda_rocs_file_name)
def eval_keywordlda_topiclda_lucene_ranking(file_prefix, config_file, truth_dir, tokens, limit=1000, img_extension='.eps', output_dir=''):
    """Plot ROC curves for the index search and four LDA rankings of a query.

    Using a single model configuration, the function ranks documents with
    the index search (``search_whoosh_index``), with Keyword-LDA and
    Topic-LDA based on the query topic distribution estimated from the LDA
    model, and with the same two methods based on the distribution
    estimated from the LDA beta (the "-2" curves).  All five curves are
    written into one ROC plot.

    Args:
        file_prefix: prefix of the output image name; also echoed in the log.
        config_file: path of the model configuration file.
        truth_dir: directory with the ground truth; its
            ``RELEVANT_DIR_NAME`` sub-directory holds the positive documents.
        tokens: whitespace-separated query terms.  They are used as given;
            no tokenization or lemmatization is applied here.
        limit: maximum number of documents requested from the LDA searches.
        img_extension: extension appended to the ROC image file name.
        output_dir: directory the ROC image is written to.

    Raises:
        SystemExit: when none of the query tokens occurs in the vocabulary.
    """
    lucene_query = 'all:(%s)' % tokens  # search in all fields

    # Single-argument print calls emit the same text on Python 2 and 3;
    # print('') replaces the bare ``print`` used for a blank line.
    print('')
    print('Processing %s' % (file_prefix,))
    print('Lucene query: %s' % (lucene_query,))
    print('TM query: %s' % (tokens,))

    positive_dir = os.path.join(truth_dir, RELEVANT_DIR_NAME)  # TRUE positive documents
    rocs_file_name = os.path.join(
        output_dir,
        '%s-keywordlda-topiclda-lucene-ranking-ROCs' % file_prefix + img_extension)
    rocs_img_title = ''  # %s: ROC curves' % file_prefix
    # Labels and line styles are positional: they must stay aligned with
    # results_list built at the end of the function.
    roc_labels = ['Lucene ranking',
                  'Keyword-LDA ranking',
                  'Topic-LDA ranking',
                  'Topic-LDA-2 ranking',
                  'Keyword-LDA-2 ranking']
    line_styles = ['ro-', 'kx-', 'b+-', 'c^-', 'yv-.']

    #---------------------------------------------- Reads the configuration file
    mdl_cfg = read_config(config_file)

    # Loads the LDA model
    (lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
     lda_theta, lda_beta) = load_lda_parameters(mdl_cfg)

    # Checks whether the keywords are there in the corpus dictionary.
    # The vocabulary is materialized once as a set instead of re-listing and
    # scanning dictionary.values() for every token; split() already strips
    # whitespace from each token.
    vocabulary = set(lda_dictionary.values())
    valid_tokens = 0
    for token in tokens.split():
        if token in vocabulary:
            valid_tokens += 1
        else:
            print('%s is not in the corpus vocabulary. ' % (token,))
    if valid_tokens == 0:
        print('None of the tokens exist in the dictionary. Exiting search!')
        # Same effect as the site-provided exit() (exit status unchanged),
        # but does not rely on the ``site`` module being loaded.
        raise SystemExit()

    # Gets the query topic distribution from the LDA beta
    print('Estimated topic dist. from the LDA beta:')
    query_td2 = get_lda_query_td2(tokens, lda_dictionary, lda_beta)
    dominant_topics_idx2 = get_query_top_topic_idx(query_td2, lda_mdl, TOP_K_TOPICS)

    # Gets the query topic distribution from the LDA model
    print('Estimated topic dist. from the LDA model:')
    query_td = get_lda_query_td(tokens, lda_dictionary, lda_mdl)
    dominant_topics_idx = get_query_top_topic_idx(query_td, lda_mdl, TOP_K_TOPICS)

    #------------------------------------------------------------- Lucene search
    print('Lucene ranking')
    # lu_docs = search_li(lucene_query, limit, mdl_cfg)
    lu_docs = search_whoosh_index(lucene_query, mdl_cfg)
    _, lu_docs_list = lu_append_nonresp(lu_docs, truth_dir)
    lu_res = convert_to_roc_format(lu_docs_list, positive_dir)

    #---------------------------------------------------------------- LDA search
    # To display the LDA model topics based on the increasing order of entropy:
    # print_lda_topics_on_entropy(lda_mdl, file_name='%s-topic-words.csv' % file_prefix, topn=50)

    print('LDA (w/ keywords) ranking')
    lda_docs = search_tm2(query_td, lda_index, lda_file_path_index, limit)
    lda_res = convert_to_roc_format(lda_docs, positive_dir)

    print('LDA (w/ keywords) method-2 ranking')
    lda_docs2 = search_tm2(query_td2, lda_index, lda_file_path_index, limit)
    lda_res2 = convert_to_roc_format(lda_docs2, positive_dir)

    print('LDA (w/ query topics) ranking')
    lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta)
    lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)

    print('LDA (w/ query topics) method-2 ranking')
    lda_tts_docs2 = search_tm_topics(dominant_topics_idx2, limit, lda_file_path_index, lda_theta)
    lda_tts_res2 = convert_to_roc_format(lda_tts_docs2, positive_dir)

    ## Plot ROC curves
    results_list = [lu_res, lda_res, lda_tts_res, lda_tts_res2, lda_res2]
    roc_data_list = [ROCData(result, linestyle=style)
                     for result, style in zip(results_list, line_styles)]
    plot_multiple_roc(roc_data_list, title=rocs_img_title, labels=roc_labels,
                      include_baseline=True, file_name=rocs_file_name)

    print('The ROCs are stored in this path %s' % (rocs_file_name,))
    print('')
return tdm # # if __name__ == '__main__': config_file = "E:\\E-Discovery\\edrmv2txt-a-b-index-t50-s\\edrmv2txt-a-b-index-t50-s.cfg" M = 30 # number of terms used in coherence score topic_words_file = "top%d-topics-words.txt" % M # topic_similarites_file = "topics-sim-M%d.txt" % M mdl_cfg = read_config(config_file) # Loads the vocabulary vocab_file = mdl_cfg['CORPUS']['vocab_file'] vocab = dict() with open(vocab_file) as fp: for vocab_id, token in enumerate(fp): vocab[token.strip()] = vocab_id lda_mdl_file = mdl_cfg['LDA']['lda_model_file'] if nexists(lda_mdl_file): lda_mdl = gensim.models.ldamodel.LdaModel.load(lda_mdl_file) # Loads the corpus ldac_file = mdl_cfg['CORPUS']['blei_corpus_file'] lda_corpus = gensim.corpora.BleiCorpus(ldac_file)
def eval_ranking_methods(file_prefix, config_file, test_directory, tm_query, limit=1000, img_extension='.eps'):
    """Plot ROC curves for Lucene, LDA, fused LDA*Lucene and LSI rankings.

    Using a single model configuration, six rankings of the same query are
    computed and written into one ROC plot named ``<file_prefix>-ROCs`` plus
    ``img_extension``: Lucene, Keyword-LDA, Keyword-LDA fused with Lucene,
    Topic-LDA, Topic-LDA fused with Lucene, and Keyword-LSI.

    Args:
        file_prefix: prefix of the output image file name.
        config_file: path of the model configuration file; it must provide
            ``['CORPUS']['dict_file']``.
        test_directory: directory with the ground truth; its ``"1"``
            sub-directory holds the positive documents.
        tm_query: whitespace-separated query terms, used as given.
        limit: maximum number of documents requested from each search.
        img_extension: extension appended to the ROC image file name.

    Raises:
        SystemExit: when none of the query tokens occurs in the vocabulary.
    """
    lucene_query = 'all:(%s)' % tm_query  # search in all fields

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Lucene query: %s' % (lucene_query,))
    print('TM query: %s' % (tm_query,))

    positive_dir = os.path.join(test_directory, "1")  # TRUE positive documents
    TOP_K_TOPICS = 5  # the number topics used for Topic-LDA
    rocs_file_name = '%s-ROCs' % file_prefix + img_extension
    rocs_img_title = ''  # %s: ROC curves' % file_prefix
    # Labels and line styles are positional: they must stay aligned with
    # results_list built at the end of the function.
    roc_labels = ['Lucene ranking',
                  'Keyword-LDA ranking',
                  'Keyword-LDA * Lucene ranking',
                  'Topic-LDA ranking',
                  'Topic-LDA * Lucene Ranking',
                  'Keyword-LSI ranking']
    line_styles = ['ro-', 'kx-', 'b+-', 'c^-', 'yv-.', 'gd-']

    #---------------------------------------------- Reads the configuration file
    mdl_cfg = read_config(config_file)

    #------------ Checks whether the keywords are there in the corpus dictionary
    dictionary = load_dictionary(mdl_cfg['CORPUS']['dict_file'])
    # Materialize the vocabulary once as a set instead of re-listing and
    # scanning dictionary.values() for every token; split() already strips
    # whitespace from each token.
    vocabulary = set(dictionary.values())
    valid_tokens = 0
    for token in tm_query.split():
        if token in vocabulary:
            valid_tokens += 1
        else:
            print('%s is not in the corpus vocabulary. Hence, this word will be ignored from the topic search.' % (token,))
    if valid_tokens == 0:
        print('None of the tokens exist in the dictionary. Exiting topic search!')
        # Same effect as the site-provided exit() (exit status unchanged),
        # but does not rely on the ``site`` module being loaded.
        raise SystemExit()

    #------------------------------------------------------------- Lucene search
    print('Lucene ranking')
    lu_docs = search_li(lucene_query, limit, mdl_cfg)
    lu_docs_dict, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
    lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
    print('')  # blank line, as the bare ``print`` statement produced

    #---------------------------------------------------------------- LDA search
    # Loads the LDA model (the beta matrix is not needed here)
    (lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
     lda_theta, _) = load_lda_parameters(mdl_cfg)

    # To display the LDA model topics based on the increasing order of entropy:
    # print_lda_topics_on_entropy(lda_mdl, file_name='%s-topic-words.csv' % file_prefix, topn=50)

    # Gets the dominant topics from the LDA model
    dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary, lda_mdl, TOP_K_TOPICS)
    dominant_topics_idx = [topic_idx for (topic_idx, _) in dominant_topics]  # get the topic indices

    print('LDA (w/ keywords) ranking')
    lda_docs = search_tm(tm_query, limit, lda_dictionary, lda_mdl, lda_index, lda_file_path_index)
    lda_res = convert_to_roc_format(lda_docs, positive_dir)
    # plot_doc_class_predictions(lda_res, '%s-Keyword-LDA' % file_prefix, img_extension)

    print('LDA (w/ keywords) * Lucene ranking')
    lu_tm_docs = fuse_lucene_tm_scores(lu_docs_dict, lda_docs)
    lda_lu_res = convert_to_roc_format(lu_tm_docs, positive_dir)
    # plot_doc_class_predictions(lda_lu_res, '%s-Keyword-LDA-Lucene' % file_prefix, img_extension)

    print('LDA (w/ query topics) ranking')
    lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta)
    lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
    # plot_doc_class_predictions(lda_tts_res, '%s-Topic-LDA' % file_prefix, img_extension)

    print('LDA (w/ query topics) * Lucene Ranking')
    final_docs_tts = fuse_lucene_tm_scores(lu_docs_dict, lda_tts_docs)
    lda_tts_lu_res = convert_to_roc_format(final_docs_tts, positive_dir)
    # plot_doc_class_predictions(lda_tts_lu_res, '%s-Topic-LDA-Lucene' % file_prefix, img_extension)

    #---------------------------------------------------------------- LSI search
    print('LSI (w/ keywords) ranking')
    lsi_docs = search_lsi(tm_query, limit, mdl_cfg)
    lsi_res = convert_to_roc_format(lsi_docs, positive_dir)

    ## Plot ROC curves
    results_list = [lu_res, lda_res, lda_lu_res, lda_tts_res, lda_tts_lu_res, lsi_res]
    roc_data_list = [ROCData(result, linestyle=style)
                     for result, style in zip(results_list, line_styles)]
    plot_multiple_roc(roc_data_list, title=rocs_img_title, labels=roc_labels,
                      include_baseline=True, file_name=rocs_file_name)
def eval_ranking_varying_topics(query_id, dir_path, keywords, limit=1000, img_extension='.eps'):
    """Compare Keyword-LDA and Topic-LDA rankings across topic counts.

    For every topic count in a fixed list, the configuration
    ``<dir_path>Q<query_id>-LT-<num_topics>T.cfg`` is read, the topic model
    is loaded, and a keyword-based and a dominant-topic-based ranking are
    computed.  The Lucene ranking obtained with the first configuration is
    added once as a reference curve.  Two ROC plots are written: one for the
    Topic-LDA and one for the Keyword-LDA rankings.

    Args:
        query_id: integer query identifier; used in the test directory name,
            the configuration file names and the output file names.
        dir_path: prefix of the test directory and of the configuration
            files (concatenated as-is, so it should end with a separator).
        keywords: raw query text; it is tokenized and lemmatized here.
        limit: maximum number of documents requested from each search.
        img_extension: extension appended to the ROC image file names.

    Raises:
        SystemExit: when none of the query tokens occurs in the vocabulary
            of one of the loaded models.
    """
    tm_query = ' '.join(lemmatize_tokens(regex_tokenizer(keywords)))  # Lemmatization
    lucene_query = 'all:(%s)' % tm_query  # search in all fields

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Lucene query: %s' % (lucene_query,))
    print('TM query: %s' % (tm_query,))

    test_directory = "%s%d" % (dir_path, query_id)
    positive_dir = os.path.join(test_directory, "1")  # TRUE positive documents
    TOP_K_TOPICS = 5  # the number topics used for Topic-LDA

    topiclda_rocs_file_name = '%d-LT-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d Topic-LDA with Varying Number of Topics' % query_id
    keywordlda_rocs_file_name = '%d-LT-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d Keyword-LDA with Varying Number of Topics' % query_id

    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    # split() already removes surrounding whitespace, so no strip() is needed.
    query_terms = tm_query.split()

    for idx, num_topics in enumerate(topics):
        #---------------------------------------------- Reads the configuration file
        # configuration file, created using the SMARTeR GUI
        config_file = "%sQ%d-LT-%dT.cfg" % (dir_path, query_id, num_topics)
        mdl_cfg = read_config(config_file)

        # Loads the LDA model
        lda_dictionary, lda_mdl, _, _ = load_tm(mdl_cfg)

        #------------ Checks whether the keywords are there in the corpus dictionary
        # Build the vocabulary once per model instead of re-listing and
        # scanning dictionary.values() for every query token.
        vocabulary = set(lda_dictionary.values())
        valid_tokens = 0
        for token in query_terms:
            if token in vocabulary:
                valid_tokens += 1
            else:
                print('%s is not in the corpus vocabulary. Hence, this word will be ignored from the topic search.' % (token,))
        if valid_tokens == 0:
            print('None of the tokens exist in the dictionary. Exiting topic search!')
            # Same effect as the site-provided exit() (exit status unchanged),
            # but does not rely on the ``site`` module being loaded.
            raise SystemExit()

        #------------------------------------------------------------- Lucene search
        if idx == 0:  # the first Lucene ranking is added as a reference
            print('Lucene ranking')
            lu_docs = search_li(lucene_query, limit, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')

        #---------------------------------------------------------------- LDA search
        # Gets the dominant topics from the LDA model
        dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary, lda_mdl, TOP_K_TOPICS)
        # A distinct comprehension variable is used on purpose: on Python 2
        # the comprehension variable leaks into the function scope and would
        # otherwise overwrite the loop index ``idx``.
        dominant_topics_idx = [topic_idx for (topic_idx, _) in dominant_topics]  # get the topic indices

        print('LDA (w/ keywords) ranking')
        lda_docs = search_tm(tm_query, limit, mdl_cfg)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)

        print('LDA (w/ query topics) ranking')
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, mdl_cfg)
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)

        # Both plots share roc_labels, so each label is paired with one
        # entry in each of the two curve lists.
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)

    ## Plot ROC curves
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title,
                      labels=roc_labels, include_baseline=True,
                      file_name=topiclda_rocs_file_name)
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title,
                      labels=roc_labels, include_baseline=True,
                      file_name=keywordlda_rocs_file_name)