예제 #1
0
def search_tm_topics(topics_list, limit, mdl_cfg):   
    '''
    Performs search on the topic model using relevant  
    topic indices 
    '''

    EPS = 1e-24 # a constant 
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file) # loads the file paths    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble) # loads the LDA theta from the model theta file 
    num_docs, num_topics = lda_theta.shape
    
    print 'LDA-theta is loaded: number of documents: ', num_docs, ' number of topics: ', num_topics  
    
    unsel_topic_idx = [idx for idx in range(0, num_topics) if idx not in topics_list]
    sel = np.log(lda_theta[:, topics_list] + EPS)
    unsel = np.log(1.0 - lda_theta[:, unsel_topic_idx] + EPS)
    ln_score = sel.sum(axis=1) + unsel.sum(axis=1)  
    sorted_idx = ln_score.argsort(axis=0)[::-1]
    # score = np.exp(ln_score)
    
    # Normalize the topic index search score 
    # TODO: this is an adhoc method right now. May come back later... 
    min_ln_score = min(ln_score)
    n_ln_score = (1.0 - ln_score / min_ln_score)

    ts_results = []
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0], # document id  
                          lda_file_path_index[sorted_idx[i]][1], # document directory path   
                          lda_file_path_index[sorted_idx[i]][2], # document name
                          n_ln_score[sorted_idx[i]]]) # similarity score 
        # print lda_file_path_index[sorted_idx[i]], ln_score[sorted_idx[i]], n_ln_score[sorted_idx[i]], score[sorted_idx[i]] 
        

    # grabs the files details from the index     
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    
    results = [[row[0], float(row[10])] for row in ts_results] # Note: we need a float conversion because it's retrieving as string 
    
    return results
예제 #2
0
def search_tm(query_text, limit, mdl_cfg):   

    lda_dictionary, lda_mdl, lda_index, lda_file_path_index = load_tm(mdl_cfg)
    
    ts_results = search_lda_model(query_text, lda_dictionary, lda_mdl, lda_index, lda_file_path_index, limit)
    ## ts_results are in this format  [doc_id, doc_dir_path, doc_name, score] 
    
    # grabs the files details from the index 
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    
    if len(ts_results) == 0: 
        print 'No documents found.'
        return 

    # Normalize the similarity scores 
    results = [[row[0], ((float(row[10]) + 1.0) / 2.0)] for row in ts_results]
    
    return results
예제 #3
0
def search_lsi(query_text, limit, mdl_cfg):   

    lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index = load_lsi(mdl_cfg)
    
    ts_results = search_lsi_model(query_text, lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index, limit)
    ## ts_results are in this format  [doc_id, doc_dir_path, doc_name, score] 
    
    # grabs the files details from the index 
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    
    if len(ts_results) == 0: 
        print 'No documents found.'
        return 
        
    '''
    Sahil
    Considering documents that satisfy a certain condition
    '''
    results = [[row[0], ((float(row[10]) + 1.0) / 2.0)] for row in ts_results]
    
    return results
예제 #4
0
def search_tm_sel_topics_cos(topics_list, topics_prob, limit, mdl_cfg):   

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file)    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble)
    num_docs, num_topics = lda_theta.shape
    
    print 'Number of documents: ', num_docs, ' number of topics: ', num_topics  
    
    from scipy.spatial.distance import cosine 
    topics_prob = np.array(topics_prob)
    sel = lda_theta[:, topics_list]
    cos_scores = np.zeros(num_docs)
    for i in range(0, num_docs):
        cos_scores[i] = cosine(topics_prob, sel[i, :])

    sorted_idx = cos_scores.argsort(axis=0)[::-1]
    
    ts_results = []
    
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0], 
                          lda_file_path_index[sorted_idx[i]][1], 
                          lda_file_path_index[sorted_idx[i]][2], 
                          cos_scores[sorted_idx[i]]])
        print lda_file_path_index[sorted_idx[i]], cos_scores[sorted_idx[i]]
        

    # grabs the files details from the index 
    
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    results = [[row[0], row[10]] for row in ts_results]
    
    return results