Example #1
def get_dominant_query_topics(query_text, lda_dictionary, lda_mdl, TOP_K_TOPICS=5):
    '''Tokenizes the input query and finds the top K dominant query
    topics from an LDA model.

    Returns:
        dominant_topics - a list of (topic_id, topic_prob) tuples
    Arguments:
        query_text - the query in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        TOP_K_TOPICS - the number of dominant topics to return (default 5)

    '''
    from operator import itemgetter
    import heapq
    
    # process the query 
    
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(query_text))
    
    if len(query_vec) == 0: 
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return [] 
    else: 
        logging.info('%d query words are in the dictionary.', len(query_vec))
    
    query_td = lda_mdl[query_vec]

    print('Query TF:', [(w_id, lda_dictionary[w_id], count) for (w_id, count) in query_vec])
    print_dominant_query_topics(query_td, lda_mdl, TOP_K_TOPICS)
   
    dominant_topics = heapq.nlargest(TOP_K_TOPICS, dict(query_td).items(), key=itemgetter(1))

    return dominant_topics # (topic_id, topic_prob)  
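
The snippets in this section rely on module-level imports (logging, numpy) and on a whitespace_tokenize helper that is not shown. Below is a minimal usage sketch: whitespace_tokenize is assumed to simply lowercase and split on whitespace, the toy corpus and gensim model are purely illustrative, and print_dominant_query_topics (also not shown in the source) is replaced by a hypothetical stand-in.

import logging
import numpy as np                      # used by later snippets (e.g. get_lda_query_td2)
from gensim.corpora import Dictionary
from gensim.models import LdaModel

logging.basicConfig(level=logging.INFO)

def whitespace_tokenize(text):
    # assumed helper: lowercase and split on whitespace
    return text.lower().split()

def print_dominant_query_topics(query_td, lda_mdl, top_k):
    # hypothetical stand-in for the helper called above; it only logs the raw distribution
    logging.info('query topic distribution: %s', query_td)

# toy corpus and model, purely illustrative
texts = ['oil price market', 'court ruling appeal', 'oil futures trading']
tokenized = [whitespace_tokenize(t) for t in texts]
lda_dictionary = Dictionary(tokenized)
corpus = [lda_dictionary.doc2bow(t) for t in tokenized]
lda_mdl = LdaModel(corpus, id2word=lda_dictionary, num_topics=2, passes=10)

print(get_dominant_query_topics('oil market', lda_dictionary, lda_mdl, TOP_K_TOPICS=2))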
Example #2
def get_lda_query_td(doc_text, lda_dictionary, lda_mdl):
    '''Tokenizes the input query and returns its topic
    distribution using the learned LDA model.

    Returns:
        query_td - the topic distribution as a list of (topic_id, topic_prob) tuples
    Arguments:
        doc_text - the document in text format 
        lda_dictionary - the dictionary object 
        lda_mdl - the LDA model object 
    
    '''
    # process the query 
    
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    
    if len(query_vec) == 0: 
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return [] 
    else: 
        logging.info('%d query words are in the dictionary.', len(query_vec))
    
    query_td = lda_mdl[query_vec]
    
    return query_td
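
query_td above is a sparse list of (topic_id, probability) pairs. If a dense vector is more convenient, gensim's matutils.sparse2full can expand it; a brief sketch, reusing the toy lda_dictionary/lda_mdl built after Example #1 (note that gensim drops topics below the model's minimum_probability, so the pairs may not cover every topic):

from gensim import matutils

query_td = get_lda_query_td('oil price market', lda_dictionary, lda_mdl)
dense_td = matutils.sparse2full(query_td, lda_mdl.num_topics)  # numpy array of length num_topics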
Example #3
def get_lda_query_td2(doc_text, lda_dictionary, lda_beta):
    '''Tokenizes the input query and returns its topic
    distribution computed directly from the LDA beta matrix.

    Returns:
        query_td2 - the topic distribution as a list of (topic_id, topic_prob) tuples
    Arguments:
        doc_text - the document in text format
        lda_dictionary - the dictionary object
        lda_beta - the LDA model's beta (topic-term) matrix
    
    '''
    # process the query 
    
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    
    if len(query_vec) == 0: 
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return [] 
    else: 
        logging.info('%d query words are in the dictionary.', len(query_vec))
    
    query_term_theta2 = np.array([lda_beta[:, vocab_id] * count for (vocab_id, count) in query_vec]).sum(axis=0)
    query_term_theta2 /= query_term_theta2.sum()  # normalize to a probability distribution
    query_td2 = list(enumerate(query_term_theta2))
    
    return query_td2
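
get_lda_query_td2 expects lda_beta with shape (num_topics, vocab_size), so that lda_beta[:, vocab_id] is the per-topic weight of a single vocabulary term. A sketch of how that matrix might be obtained from the toy gensim model above; get_topics() is assumed to return the normalized topic-term matrix, as in recent gensim versions:

lda_beta = lda_mdl.get_topics()   # (num_topics, vocab_size), rows sum to 1
query_td2 = get_lda_query_td2('oil price market', lda_dictionary, lda_beta)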
Example #4
def compute_topic_similarities(doc_text, src_docs, lda_dictionary, lda_mdl, lda_num_topics):
    '''Tokenizes the document and computes similarities between the
    given document and the listed source documents, based on LDA topic
    distributions and cosine distance.

    Returns:
        dest_docs - the src_docs records with the cosine distance appended
    Arguments:
        doc_text - the document in text format
        src_docs - the list of source document records (the text is read from index 5)
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        lda_num_topics - the number of topics in the LDA model
    
    '''
    # process the query 
    
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc_text))
    query_td = lda_mdl[query_vec]
    qtd_vec = sparse_to_dense(lda_num_topics, query_td)
    
    print('Query:', ' '.join(whitespace_tokenize(doc_text)))
    print('Number of vocabulary tokens:', len(query_vec))
    print('Query vector:', query_vec)
    print('Query td:', query_td)
    print('doc_name, cosine, rating, doc_td')
    
    dest_docs = []
    for sdoc in src_docs:
        sdoc_text = sdoc[5]
        sdoc_vec = lda_dictionary.doc2bow(whitespace_tokenize(sdoc_text))
        sdoc_td = lda_mdl[sdoc_vec]
        std_vec = sparse_to_dense(lda_num_topics, sdoc_td)
        cosine_dist = cosine(qtd_vec, std_vec)  # cosine distance between the two topic vectors
        sdoc.append(cosine_dist)  # append the cosine distance to the end
        print(sdoc[1], cosine_dist, sdoc[-2], sdoc_td)  # file_id, cosine, user rating, doc topic distribution
        dest_docs.append(sdoc)
        
    return dest_docs
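
compute_topic_similarities depends on sparse_to_dense and cosine helpers that are not shown. Plausible definitions, offered as assumptions: sparse_to_dense expands a (index, value) list into a fixed-length vector (gensim's matutils.sparse2full does the same job), and cosine is SciPy's cosine distance.

import numpy as np
from scipy.spatial.distance import cosine   # assumption: SciPy's cosine distance (0 = identical)

def sparse_to_dense(length, sparse_vec):
    # assumed helper: expand [(idx, val), ...] into a dense vector of the given length
    dense = np.zeros(length)
    for idx, val in sparse_vec:
        dense[idx] = val
    return dense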
Example #5
def search_lsi_model(query, dictionary, lsi, index, files_info, limit=5):
    '''Tokenizes the input query and finds topically
    similar (responsive) documents using LSI-based
    document search.

    Returns:
        a list of lists of responsive document details,
        i.e., [doc_id, doc_dir_path, doc_name, score]
    Arguments:
        query - the query in text format 
        dictionary - the dictionary object 
        lsi - the lsi model object 
        index - the index object 
        files_info - the list of file details 
        limit - the limit on the number of responsive records 
    
    '''

    # process the query 
    
    query_vec = dictionary.doc2bow(whitespace_tokenize(query))
    if len(query_vec) == 0: 
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return [] 
    else: 
        logging.info('%d query words are in the dictionary.', len(query_vec)) 
    
    
    query_td = lsi[query_vec]
    
    # print 'Query vector      :', [(w_id, dictionary[w_id], count) for (w_id, count) in query_vec]
    # print 'Query distribution:', query_td
    
    # querying based on cosine distance
    
    sims = index[query_td] # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    
    ## Identifies responsive and non-responsive documents
     
    responsive_docs_idx = sims[0:limit]
    
    responsive_docs = [] 
    for (doc_id, score) in responsive_docs_idx: 
        doc = list(files_info[doc_id]) # i.e., [doc_id, doc_dir_path, doc_name]
        doc.append(score)
        responsive_docs.append(doc)
    
    return responsive_docs
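
A minimal usage sketch for search_lsi_model, again with a purely illustrative toy corpus; the files_info rows follow the [doc_id, doc_dir_path, doc_name] layout mentioned in the docstring, and the /tmp/docs path is hypothetical.

from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

texts = ['oil price market', 'court ruling appeal', 'oil futures trading']
files_info = [(i, '/tmp/docs', 'doc%d.txt' % i) for i in range(len(texts))]
tokenized = [whitespace_tokenize(t) for t in texts]
dictionary = Dictionary(tokenized)
corpus = [dictionary.doc2bow(t) for t in tokenized]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
index = MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
print(search_lsi_model('oil market', dictionary, lsi, index, files_info, limit=2))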
Example #6
def get_lda_topic_dist(docs, lda_dictionary, lda_mdl, num_topics):
    '''Computes the LDA topic distribution of each document and returns
    them as a dense (num_docs x num_topics) theta matrix.

    Arguments:
        docs - a list of documents in text format
        lda_dictionary - the dictionary object
        lda_mdl - the LDA model object
        num_topics - the number of topics in the LDA model

    '''
    doc_tds = [] 
    for doc in docs: 
        doc_vec = lda_dictionary.doc2bow(whitespace_tokenize(doc))
        doc_tds.append(lda_mdl[doc_vec])
    
    num_docs = len(doc_tds)
    theta_matrix = np.zeros((num_docs, num_topics))
    for count, doc in enumerate(doc_tds):
        doc = dict(doc)  # sparse (topic_id, prob) pairs for this document
        theta_matrix[count, list(doc.keys())] = list(doc.values())

    return theta_matrix
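
get_lda_topic_dist builds the dense document-topic (theta) matrix row by row. For reference, gensim's matutils.corpus2dense produces the same matrix (topics x documents, hence the transpose); the sketch reuses the toy lda_dictionary/lda_mdl from the first sketch. Because gensim drops topics below minimum_probability, rows may not sum exactly to 1.

from gensim import matutils

docs = ['oil price market', 'court ruling appeal']
theta = get_lda_topic_dist(docs, lda_dictionary, lda_mdl, lda_mdl.num_topics)
theta_alt = matutils.corpus2dense(
    [lda_mdl[lda_dictionary.doc2bow(whitespace_tokenize(d))] for d in docs],
    num_terms=lda_mdl.num_topics).T          # (num_docs, num_topics), same layout as theta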
Example #7
def process_index_doc(doc):
    """Processes a single email file 
    
    Arguments: 
        doc - a Document in the Lucene index  
    """
    tokens = []
    if doc is not None:
        all_text = doc["all"]
        file_path = doc["file_path"]
        if all_text is None:
            logging.error("%s does not have any contents.", file_path)
            tokens = []
        else:
            tokens = whitespace_tokenize(all_text)  # regex_tokenizer(all_text)

    return tokens
Example #8
def _process_doc(doc):
    '''Processes a single email file 
    
    Arguments: 
        doc - a Document in the Lucene index  
    '''
    tokens = []
    if doc is not None:
        all_text = doc.get(MetadataType.ALL)
        file_path = doc.get(MetadataType.FILE_PATH)
        if all_text is None:
            # file_name = doc.get(MetadataType.FILE_NAME)
            logging.error('%s does not have any contents.', file_path)
            tokens = []
        else:
            tokens = whitespace_tokenize(all_text)  # regex_tokenizer(all_text)
        
    return tokens
Example #9
def search_lda_model(query_text, lda_dictionary, lda_mdl, lda_index, lda_file_path_index, limit):
    '''Tokenizes the input query and finds topically
    similar (responsive) documents using LDA-based
    document search.

    Returns:
        responsive_docs - a list of [doc_id, doc_dir_path, doc_name, score] lists
    Arguments:
        query_text - the query in text format 
        lda_dictionary - the dictionary object 
        lda_mdl - the LDA model object 
        lda_index - the index object 
        lda_file_path_index - the list of file details 
        limit - the limit on the number of responsive records 
    
    '''
    
    # process the query 
    
    query_vec = lda_dictionary.doc2bow(whitespace_tokenize(query_text))
    
    if len(query_vec) == 0: 
        logging.error('Query words are not in the dictionary. Exiting topic search!')
        return [] 
    else: 
        logging.info('%d query words are in the dictionary.', len(query_vec))
    
    query_td = lda_mdl[query_vec]
    
    # querying based on cosine distance
    
    sims = lda_index[query_td] # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    
    ## Identifies responsive documents
     
    responsive_docs_idx = sims[0:limit]
    responsive_docs = [] 
    for (doc_id, score) in responsive_docs_idx: 
        doc = list(lda_file_path_index[doc_id]) # i.e., [doc_id, doc_dir_path, doc_name]
        doc.append(score)
        responsive_docs.append(doc)
    
    return responsive_docs
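
A minimal usage sketch for search_lda_model: the similarity index is built over the corpus topic distributions, reusing the toy dictionary, corpus, and model from the first sketch; the lda_file_path_index rows are assumed to follow the [doc_id, doc_dir_path, doc_name] layout noted in the code, and /tmp/docs is hypothetical.

from gensim.similarities import MatrixSimilarity

lda_index = MatrixSimilarity(lda_mdl[corpus], num_features=lda_mdl.num_topics)
lda_file_path_index = [(i, '/tmp/docs', 'doc%d.txt' % i) for i in range(len(corpus))]
hits = search_lda_model('oil market', lda_dictionary, lda_mdl, lda_index,
                        lda_file_path_index, limit=2)
print(hits)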