Example No. 1
def get_tm_classification_dataset(mdl_cfg_file, positive_dir):   
    
    mdl_cfg = read_config(mdl_cfg_file)

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file)    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.float)
    num_docs, num_topics = lda_theta.shape
    
    print 'LDA Theta: Number of documents ', num_docs, ' number of topics ', num_topics  
    
    class_ids = np.zeros(num_docs)
    file_paths = [] 
    for i, (_, root, file_name) in enumerate(lda_file_path_index):
        if positive_dir == root: # os.path.exists(os.path.join(positive_dir, file_name)):
            class_ids[i] = RELEVANT_CLASS_ID
        else:
            class_ids[i] = IRRELEVANT_CLASS_ID
        file_paths.append(os.path.join(root, file_name))

        
    return (class_ids, lda_theta, file_paths)
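
A minimal usage sketch (the config file name below is a hypothetical placeholder): the returned lda_theta matrix gives the per-document topic proportions and class_ids the relevance labels, so the pair can be used directly as a classification dataset.

# Hedged usage example; 'Q1-LW-30T.cfg' and 'Q1' are placeholders for an actual
# SMARTeR configuration file and the corresponding truth directory.
mdl_cfg_file = 'Q1-LW-30T.cfg'
positive_dir = os.path.join('Q1', RELEVANT_DIR_NAME)
class_ids, lda_theta, file_paths = get_tm_classification_dataset(mdl_cfg_file, positive_dir)
num_relevant = int((class_ids == RELEVANT_CLASS_ID).sum())
print 'Loaded %d documents (%d labeled relevant)' % (len(file_paths), num_relevant)
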
def eval_ranking_varying_topics(query_id, data_dir, 
                                 keywords, 
                                 limit = 1000, 
                                 img_extension  = '.eps'):
    
    tokens = ' '.join( lemmatize_tokens( regex_tokenizer(keywords) ) ) # Lemmatization 
    lucene_query = 'all:(%s)' % tokens # search in all fields 
    print 'Lucene query:', lucene_query
    print 'TM query:', tokens

    truth_dir = "%s%d" % (data_dir, query_id)
    positive_dir = os.path.join(truth_dir, RELEVANT_DIR_NAME) # TRUE positive documents 

    topiclda_rocs_file_name = '%d-LW-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d (Topic-LDA): Varying # of LDA Topics and Lemmas' % query_id  
    keywordlda_rocs_file_name = '%d-LW-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d (Keyword-LDA): Varying # of LDA Topics and Lemmas' % query_id  
    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    for idx, num_topics in enumerate(topics): 

        print '------------------------------------------------------------------------------------------'
        #---------------------------------------------- Reads the configuration file
        
        config_file = "%sQ%d-LW-%dT.cfg" % (data_dir, query_id, num_topics)  # configuration file, created using the SMARTeR GUI 
        mdl_cfg = read_config(config_file)
        
        # Loads the LDA model 
        (lda_dictionary, lda_mdl, lda_index, 
         lda_file_path_index, lda_theta, 
         lda_beta) = load_lda_parameters(mdl_cfg)
        
        
        #------------ Checks whether the keywords are there in the corpus dictionary
    
        valid_tokens  = 0 
        for token in tokens.split():
            if token.strip() not in lda_dictionary.values():
                print token, "is not in the corpus vocabulary."
            else: 
                valid_tokens  += 1
                
        if valid_tokens  == 0:
            print 'None of the tokens exist in the dictionary. Exiting topic search!'
            exit()
            
        # Gets the query topic distribution from the LDA beta  
        print 'Estimated topic dist. from the LDA beta:'
        query_td2 = get_lda_query_td2(tokens, lda_dictionary, lda_beta)
        dominant_topics_idx2 = get_query_top_topic_idx(query_td2, lda_mdl, TOP_K_TOPICS)
        
        # Gets the query topic distribution from the LDA model 
        print 'Estimated topic dist. from the LDA model:'
        query_td = get_lda_query_td(tokens, lda_dictionary, lda_mdl) 
        dominant_topics_idx = get_query_top_topic_idx(query_td, lda_mdl, TOP_K_TOPICS)
    
        #------------------------------------------------------------- Lucene search
    
        if idx == 0: # the first Lucene ranking is added as a reference 
            print 'Lucene ranking'
            # lu_docs = search_li(lucene_query, limit, mdl_cfg)
            lu_docs = search_whoosh_index(lucene_query, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, truth_dir)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')
        
        #---------------------------------------------------------------- LDA search
        
#        # Gets the dominant topics from the LDA model 
#        dominant_topics = get_dominant_query_topics(tokens, lda_dictionary, lda_mdl, TOP_K_TOPICS)
#        dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
        
        
        print 'LDA (w/ keywords) ranking'
        lda_docs = search_tm2(query_td, lda_index, lda_file_path_index, limit)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)
    
        print 'LDA (w/ keywords) method-2 ranking'
        lda_docs2 = search_tm2(query_td2, lda_index, lda_file_path_index, limit)
        lda_res2 = convert_to_roc_format(lda_docs2, positive_dir)
            
        
        print 'LDA (w/ query topics) ranking'
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta) 
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
        
        print 'LDA (w/ query topics) method-2 ranking'
        lda_tts_docs2 = search_tm_topics(dominant_topics_idx2, limit, lda_file_path_index, lda_theta) 
        lda_tts_res2 = convert_to_roc_format(lda_tts_docs2, positive_dir)
    
        
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)

        roc_topiclda_list.append(ROCData(lda_tts_res2))
        roc_keywordlda_list.append(ROCData(lda_res2))
        roc_labels.append('%d topics (method-2)' % num_topics)    
        
        print '------------------------------------------------------------------------------------------'    
    
    ## Plot ROC curves  
    
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=topiclda_rocs_file_name)
     
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=keywordlda_rocs_file_name)
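
A hedged usage sketch for the function above; the query id, data directory, and keyword string are hypothetical, and data_dir is assumed to contain the per-topic-count configuration files named 'Q<id>-LW-<n>T.cfg' that the loop expects.

# Hypothetical call; all argument values are placeholders.
eval_ranking_varying_topics(query_id=201,
                            data_dir='E:\\E-Discovery\\queries\\',
                            keywords='energy trading contracts california',
                            limit=1000,
                            img_extension='.eps')
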
def eval_keywordlda_topiclda_lucene_ranking(file_prefix, config_file, 
                                    truth_dir, tokens, 
                                    limit = 1000, 
                                    img_extension  = '.eps',
                                    output_dir = ''):
    
    lucene_query = 'all:(%s)' % tokens # search in all fields 
    
    print 
    print 'Processing', file_prefix
    print 'Lucene query:', lucene_query
    print 'TM query:', tokens
    
    positive_dir = os.path.join(truth_dir, RELEVANT_DIR_NAME) # TRUE positive documents 
    
    rocs_file_name = os.path.join(output_dir, 
                                  '%s-keywordlda-topiclda-lucene-ranking-ROCs' \
                                   % file_prefix + img_extension)
    rocs_img_title = '' # %s: ROC curves' % file_prefix 
    roc_labels = ['Lucene ranking', 
                  'Keyword-LDA ranking' , 
                  'Topic-LDA ranking',
                  'Topic-LDA-2 ranking',
                  'Keyword-LDA-2 ranking']
    line_styles = ['ro-','kx-','b+-','c^-','yv-.'] 
    
    #---------------------------------------------- Reads the configuration file
    
    mdl_cfg = read_config(config_file)
    
    # Loads the LDA model 
    (lda_dictionary, lda_mdl, lda_index, 
     lda_file_path_index, lda_theta, 
     lda_beta) = load_lda_parameters(mdl_cfg)
    
    
    # Checks whether the keywords are there in the corpus dictionary

    valid_tokens  = 0 
    for token in tokens.split():
        if token.strip() not in lda_dictionary.values():
            print token, "is not in the corpus vocabulary. "
        else: 
            valid_tokens  += 1
            
    if valid_tokens  == 0:
        print 'None of the tokens exist in the dictionary. Exiting search!'
        exit()
#
#    query_vec = lda_dictionary.doc2bow(tokens.split())
#    query_term_theta2 = np.array([lda_beta[:,vocab_id] for (vocab_id, _) in query_vec]).sum(axis=0)
#    query_term_theta2 /= sum(query_term_theta2.tolist())
#    dominant_topics_idx2 = np.argsort(query_term_theta2)[::-1][:TOP_K_TOPICS]
#    
#    query_td2 = [(idx, val) for idx, val in enumerate(query_term_theta2)] 
#    
#    
    # Gets the query topic distribution from the LDA beta  
    print 'Estimated topic dist. from the LDA beta:'
    query_td2 = get_lda_query_td2(tokens, lda_dictionary, lda_beta)
    dominant_topics_idx2 = get_query_top_topic_idx(query_td2, lda_mdl, TOP_K_TOPICS)
    
    # Gets the query topic distribution from the LDA model 
    print 'Estimated topic dist. from the LDA model:'
    query_td = get_lda_query_td(tokens, lda_dictionary, lda_mdl) 
    dominant_topics_idx = get_query_top_topic_idx(query_td, lda_mdl, TOP_K_TOPICS)
    
    #------------------------------------------------------------- Lucene search

    print 'Lucene ranking'
    # lu_docs = search_li(lucene_query, limit, mdl_cfg)
    lu_docs = search_whoosh_index(lucene_query, mdl_cfg)
    _, lu_docs_list = lu_append_nonresp(lu_docs, truth_dir)
    lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
    
    
    #---------------------------------------------------------------- LDA search
    
    # To display the LDA model topics based on the 
    # increasing order of entropy   
    # print_lda_topics_on_entropy(lda_mdl, file_name='%s-topic-words.csv' % file_prefix, topn=50) 
    
#    # Gets the dominant topics from the LDA model 
#    dominant_topics = get_dominant_query_topics(tokens, lda_dictionary, lda_mdl, TOP_K_TOPICS)
#    dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
    
    print 'LDA (w/ keywords) ranking'
    lda_docs = search_tm2(query_td, lda_index, lda_file_path_index, limit)
    lda_res = convert_to_roc_format(lda_docs, positive_dir)

    print 'LDA (w/ keywords) method-2 ranking'
    lda_docs2 = search_tm2(query_td2, lda_index, lda_file_path_index, limit)
    lda_res2 = convert_to_roc_format(lda_docs2, positive_dir)
        
    
    print 'LDA (w/ query topics) ranking'
    lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, 
                                    lda_file_path_index, lda_theta) 
    lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
    
    print 'LDA (w/ query topics) method-2 ranking'
    lda_tts_docs2 = search_tm_topics(dominant_topics_idx2, limit, 
                                     lda_file_path_index, lda_theta) 
    lda_tts_res2 = convert_to_roc_format(lda_tts_docs2, positive_dir)
        
    
    ## Plot ROC curves  

    results_list = [lu_res, 
                    lda_res, 
                    lda_tts_res, 
                    lda_tts_res2,
                    lda_res2]

    roc_data_list = [ROCData(result, linestyle=line_styles[idx]) 
                     for idx, result in enumerate(results_list)]
    
    plot_multiple_roc(roc_data_list, title=rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=rocs_file_name)
     
    
    print 'The ROCs are stored in this path', rocs_file_name
    print 
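
A hedged usage sketch; the file prefix, paths, and token string below are placeholders, and tokens is assumed to be an already lemmatized, space-separated query (unlike eval_ranking_varying_topics above, this function does not lemmatize its input).

# Hypothetical call; all argument values are placeholders.
eval_keywordlda_topiclda_lucene_ranking(file_prefix='Q201-LW-30T',
                                        config_file='E:\\E-Discovery\\Q201\\Q201-LW-30T.cfg',
                                        truth_dir='E:\\E-Discovery\\Q201',
                                        tokens='energy trading contract california',
                                        limit=1000,
                                        img_extension='.eps',
                                        output_dir='E:\\E-Discovery\\plots')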
        
    
    
    

# 
# if __name__ == '__main__':
    
config_file = "E:\\E-Discovery\\edrmv2txt-a-b-index-t50-s\\edrmv2txt-a-b-index-t50-s.cfg"
M = 30 # number of terms used in coherence score 
topic_words_file = "top%d-topics-words.txt" % M
# topic_similarites_file = "topics-sim-M%d.txt" % M 


mdl_cfg = read_config(config_file)

# Loads the vocabulary 
vocab_file = mdl_cfg['CORPUS']['vocab_file']
vocab = dict()
with open(vocab_file) as fp:
    for vocab_id, token in enumerate(fp):
        vocab[token.strip()] = vocab_id 
lda_mdl_file = mdl_cfg['LDA']['lda_model_file']        
if nexists(lda_mdl_file): 
    lda_mdl = gensim.models.ldamodel.LdaModel.load(lda_mdl_file)


# Loads the corpus 
ldac_file = mdl_cfg['CORPUS']['blei_corpus_file']
lda_corpus = gensim.corpora.BleiCorpus(ldac_file)
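
The module-level snippet above loads the vocabulary, the trained LDA model, and the Blei-format corpus in preparation for a topic-coherence computation over the top M words of each topic. Below is a hedged sketch of one such computation using the UMass co-document-frequency formulation; it assumes the word ids in the Blei corpus, the vocabulary file, and the model all align, that the model file existed so lda_mdl was loaded, and that show_topic() is available (its return-tuple order differs across gensim versions, hence the normalisation).

from collections import defaultdict

# Invert the vocabulary so topic word ids can be mapped back to tokens.
id2token = dict((idx, tok) for tok, idx in vocab.items())

# Top-M words per topic. Depending on the gensim version, show_topic() returns
# (prob, word) or (word, prob) pairs, so normalise to plain word strings here.
topic_top_words = []
for k in range(lda_mdl.num_topics):
    pairs = lda_mdl.show_topic(k, topn=M)
    words = [p[0] if isinstance(p[0], basestring) else p[1] for p in pairs]
    topic_top_words.append([vocab[w] for w in words if w in vocab])

# Document frequencies and co-document frequencies over the Blei corpus,
# restricted to the words that occur in some topic's top-M list.
candidate_ids = set(wid for ids in topic_top_words for wid in ids)
doc_freq = defaultdict(int)
co_doc_freq = defaultdict(int)
for doc in lda_corpus:  # each document is a list of (word_id, count) pairs
    present = sorted(candidate_ids.intersection(wid for wid, _ in doc))
    for i, w1 in enumerate(present):
        doc_freq[w1] += 1
        for w2 in present[i + 1:]:
            co_doc_freq[(w1, w2)] += 1

# UMass coherence for a topic: sum over ordered word pairs (v_l before v_m) of
# log((D(v_m, v_l) + 1) / D(v_l)), written alongside the topic's top words.
with open(topic_words_file, 'w') as out:
    for k, top_ids in enumerate(topic_top_words):
        score = 0.0
        for m in range(1, len(top_ids)):
            for l in range(m):
                pair = tuple(sorted((top_ids[m], top_ids[l])))
                denom = float(max(doc_freq[top_ids[l]], 1))
                score += np.log((co_doc_freq[pair] + 1.0) / denom)
        out.write('topic %d (coherence %.4f): %s\n'
                  % (k, score, ' '.join(id2token[wid] for wid in top_ids)))
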
def eval_ranking_methods(file_prefix, config_file, 
                         test_directory, 
                         tm_query, 
                         limit = 1000, 
                         img_extension  = '.eps'):
    
    lucene_query = 'all:(%s)' % tm_query # search in all fields 
    print 'Lucene query:', lucene_query
    print 'TM query:', tm_query
    positive_dir = os.path.join(test_directory, "1") # TRUE positive documents 
    TOP_K_TOPICS = 5 # the number of topics used for Topic-LDA 
    rocs_file_name = '%s-ROCs' % file_prefix + img_extension
    rocs_img_title = '' # %s: ROC curves' % file_prefix 
    roc_labels = ['Lucene ranking', 
                  'Keyword-LDA ranking' , 
                  'Keyword-LDA * Lucene ranking', 
                  'Topic-LDA ranking' , 
                  'Topic-LDA * Lucene Ranking',
                  'Keyword-LSI ranking']
    
    line_styles = ['ro-','kx-','b+-','c^-','yv-.','gd-'] 
    
    
    #---------------------------------------------- Reads the configuration file
    
    mdl_cfg = read_config(config_file)
    
    
    #------------ Checks whether the keywords are there in the corpus dictionary
    
    dictionary = load_dictionary(mdl_cfg['CORPUS']['dict_file'])
    valid_tokens  = 0 
    for token in tm_query.split():
        if token.strip() not in dictionary.values():
            print token, "is not in the corpus vocabulary. Hence, this word will be ignored from the topic search."
        else: 
            valid_tokens  += 1
            
    if valid_tokens  == 0:
        print 'None of the tokens exist in the dictionary. Exiting topic search!'
        exit()
        
        
    #------------------------------------------------------------- Lucene search

    print 'Lucene ranking'
    lu_docs = search_li(lucene_query, limit, mdl_cfg)
    lu_docs_dict, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
    lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
    print 
    
    
    #---------------------------------------------------------------- LDA search
    
    # Loads the LDA model 
    lda_dictionary, lda_mdl, lda_index, lda_file_path_index, lda_theta, lda_beta = load_lda_parameters(mdl_cfg)
    
    # To display the LDA model topics based on the 
    # increasing order of entropy   
    # print_lda_topics_on_entropy(lda_mdl, file_name='%s-topic-words.csv' % file_prefix, topn=50) 
    
    # Gets the dominant topics from the LDA model 
    dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary, lda_mdl, TOP_K_TOPICS)
    dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
    
    
    print 'LDA (w/ keywords) ranking'
    lda_docs = search_tm(tm_query, limit, lda_dictionary, lda_mdl, lda_index, lda_file_path_index)
    lda_res = convert_to_roc_format(lda_docs, positive_dir)
    
    # plot_doc_class_predictions(lda_res, '%s-Keyword-LDA' % file_prefix, img_extension)
    
    
    print 'LDA (w/ keywords) * Lucene ranking'
    lu_tm_docs = fuse_lucene_tm_scores(lu_docs_dict, lda_docs)
    lda_lu_res = convert_to_roc_format(lu_tm_docs, positive_dir)
    
    # plot_doc_class_predictions(lda_lu_res, '%s-Keyword-LDA-Lucene' % file_prefix, img_extension)
    
    
    print 'LDA (w/ query topics) ranking'
    lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta) 
    lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
    
    # plot_doc_class_predictions(lda_tts_res, '%s-Topic-LDA' % file_prefix, img_extension)
    
    print 'LDA (w/ query topics) * Lucene Ranking'
    final_docs_tts = fuse_lucene_tm_scores(lu_docs_dict, lda_tts_docs)
    lda_tts_lu_res = convert_to_roc_format(final_docs_tts, positive_dir)
    
    # plot_doc_class_predictions(lda_tts_lu_res, '%s-Topic-LDA-Lucene' % file_prefix, img_extension)
    
    
    
    #---------------------------------------------------------------- LSI search
    
    print 'LSI (w/ keywords) ranking'
    lsi_docs = search_lsi(tm_query, limit, mdl_cfg)
    lsi_res = convert_to_roc_format(lsi_docs, positive_dir)

    
    
    ## Plot ROC curves  

    results_list = [lu_res, 
                    lda_res, lda_lu_res, 
                    lda_tts_res, lda_tts_lu_res, 
                    lsi_res]
    
    roc_data_list = [ROCData(result, linestyle=line_styles[idx]) 
                     for idx, result in enumerate(results_list)]
    plot_multiple_roc(roc_data_list, title=rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=rocs_file_name)
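
A hedged usage sketch for eval_ranking_methods; the paths and query string are placeholders, and test_directory is assumed to contain a subdirectory named "1" holding the true-positive documents, as the function expects.

# Hypothetical call; all argument values are placeholders.
eval_ranking_methods(file_prefix='Q201',
                     config_file='E:\\E-Discovery\\Q201\\Q201.cfg',
                     test_directory='E:\\E-Discovery\\Q201-test\\',
                     tm_query='energy trading contract california',
                     limit=1000,
                     img_extension='.eps')
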
Example No. 6
def eval_ranking_varying_topics(query_id, dir_path, 
                                 keywords, 
                                 limit = 1000, 
                                 img_extension  = '.eps'):
    
    tm_query = ' '.join( lemmatize_tokens( regex_tokenizer(keywords)  ) ) # Lemmatization 
    lucene_query = 'all:(%s)' % tm_query # search in all fields 
    print 'Lucene query:', lucene_query
    print 'TM query:', tm_query

    test_directory = "%s%d" % (dir_path, query_id)
    positive_dir = os.path.join(test_directory, "1") # TRUE positive documents 

    TOP_K_TOPICS = 5 # the number of topics used for Topic-LDA 
    topiclda_rocs_file_name = '%d-LT-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d Topic-LDA with Varying Number of Topics' % query_id  
    keywordlda_rocs_file_name = '%d-LT-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d Keyword-LDA with Varying Number of Topics' % query_id  
    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    for idx, num_topics in enumerate(topics): 

        #---------------------------------------------- Reads the configuration file
        
        config_file = "%sQ%d-LT-%dT.cfg" % (dir_path, query_id, num_topics)  # configuration file, created using the SMARTeR GUI 
        mdl_cfg = read_config(config_file)
        
        # Loads the LDA model 
        lda_dictionary, lda_mdl, _, _ = load_tm(mdl_cfg)
        
        
        #------------ Checks whether the keywords are there in the corpus dictionary
    
        valid_tokens  = 0 
        for token in tm_query.split():
            if token.strip() not in lda_dictionary.values():
                print token, "is not in the corpus vocabulary. Hence, this word will be ignored from the topic search."
            else: 
                valid_tokens  += 1
                
        if valid_tokens  == 0:
            print 'None of the tokens exist in the dictionary. Exiting topic search!'
            exit()
            
            
        #------------------------------------------------------------- Lucene search
    
        if idx == 0: # the first Lucene ranking is added as a reference 
            print 'Lucene ranking'
            lu_docs = search_li(lucene_query, limit, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')
        
        #---------------------------------------------------------------- LDA search
        
        # Gets the dominant topics from the LDA model 
        dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary, lda_mdl, TOP_K_TOPICS)
        dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
        
        
        print 'LDA (w/ keywords) ranking'
        lda_docs = search_tm(tm_query, limit, mdl_cfg)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)
        
        
        print 'LDA (w/ query topics) ranking'
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, mdl_cfg) 
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
    
        
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)
        
    
    ## Plot ROC curves  
    
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=topiclda_rocs_file_name)
     
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=keywordlda_rocs_file_name)
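
As with the LW variant in Example No. 1, a hedged usage sketch; the values are placeholders and dir_path is assumed to hold the 'Q<id>-LT-<n>T.cfg' configuration files for every topic count in the list above.

# Hypothetical call; all argument values are placeholders.
eval_ranking_varying_topics(query_id=201,
                            dir_path='E:\\E-Discovery\\queries\\',
                            keywords='energy trading contract california',
                            limit=1000,
                            img_extension='.eps')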