import logging
import os

def process_msg(file_path_tuple):
    '''Processes a single email file.

    Arguments:
        file_path_tuple - a tuple of (idx, root, file_name)
    Returns:
        a list of lower-cased tokens from the email body
    '''
    (idx, root, file_name) = file_path_tuple
    file_path = os.path.join(root, file_name)
    logging.info('[#%d] file: %s' % (idx, file_path))

    # parse_plain_text_email and punkt_word_tokenizer are assumed to be
    # defined elsewhere in the surrounding module
    _, _, _, _, body_text = parse_plain_text_email(file_path)
    tokens = punkt_word_tokenizer(body_text.lower())

    return tokens
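
Because process_msg takes its arguments as a single tuple, it maps directly onto multiprocessing.Pool.map. Below is a minimal driver sketch, not part of the original code: it assumes get_file_paths_index (used in the later examples) yields the (idx, root, file_name) tuples, and the data folder path is a placeholder.

from multiprocessing import Pool

if __name__ == '__main__':
    # Hypothetical driver: tokenize every email in parallel.
    # get_file_paths_index is assumed from the surrounding module, and
    # '/path/to/data' is a placeholder path.
    file_tuples = get_file_paths_index('/path/to/data')
    pool = Pool(processes=4)
    token_lists = pool.map(process_msg, file_tuples)
    pool.close()
    pool.join()
    print '%d emails tokenized' % len(token_lists)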
Example #2
import logging
import os

# PyLucene 3.x exposes the Lucene classes in a flat module; MetadataType,
# STD_ANALYZER, and the file paths index helpers are assumed to be defined
# elsewhere in this project.
from lucene import Document, Field, File, IndexWriter, SimpleFSDirectory

def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the index in store_dir.

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens using the NLTK WordNet lemmatizer
        stem - stem tokens
        nonascii - allow non-ASCII characters
    Returns:
        None
    '''
    
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index stored in %s' % path_index_file)
    
    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d'
                 % (stem, lemmatize, len(file_tuples)))

    store = SimpleFSDirectory(File(store_dir))
    # True -> create a new index, overwriting any existing one in store_dir
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)

    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii
    
    for ft in file_tuples: 
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)
        # parses the emails in plain text format 
        receiver, sender, cc, subject, message_text, bcc, date, email_text = parse_plain_text_email(file_path, 
                                                                                                    tokenize = True, 
                                                                                                    lemmatize = lemmatize, 
                                                                                                    stem = stem, 
                                                                                                    nonascii = nonascii)

        # Build one Lucene Document per email: NOT_ANALYZED fields are indexed
        # as exact values, ANALYZED fields are tokenized for full-text search.
        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - added the BCC field to the index
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - added the email date field to the index
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        
        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)
            
        # Index the full email text in a combined field so that all fields
        # can be searched at once
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)

    writer.commit()
    writer.close()

    logging.info('Lucene: All files are indexed.')
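
Once the writer commits, the same index can be queried through the PyLucene 3.x API. Below is a minimal search sketch, not part of the original code: it assumes lucene.initVM() has already been called at startup, and reuses the project-level MetadataType and STD_ANALYZER names from the indexing code above.

from lucene import File, IndexSearcher, QueryParser, SimpleFSDirectory, Version

def search_email_bodies(store_dir, query_text, limit=10):
    # Open the index read-only and run a query against the email body field.
    store = SimpleFSDirectory(File(store_dir))
    searcher = IndexSearcher(store, True)
    query = QueryParser(Version.LUCENE_CURRENT, MetadataType.EMAIL_BODY,
                        STD_ANALYZER).parse(query_text)
    hits = searcher.search(query, limit)
    for score_doc in hits.scoreDocs:
        doc = searcher.doc(score_doc.doc)
        print doc.get(MetadataType.FILE_NAME), score_doc.score
    searcher.close()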
Example #3
import logging
import os

from whoosh.fields import ID, NUMERIC, Schema, TEXT
from whoosh.index import create_in

def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores the
    index in store_dir.

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens using the NLTK WordNet lemmatizer
        stem - stem tokens
        nonascii - allow non-ASCII characters
    Returns:
        None
    '''

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    # Whoosh schema: ID fields are indexed as single exact values, TEXT fields
    # are tokenized for full-text search, and stored=True keeps the original
    # value retrievable from search results.
    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_receiver=TEXT(stored=True),
                    email_sender=TEXT(stored=True),
                    email_cc=TEXT(stored=True),
                    email_subject=TEXT(stored=True),
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()
    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s' 
                 % (stem, lemmatize, len(file_tuples), nonascii))
    
    for ft in file_tuples: 
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)
        
        
        ret = parse_plain_text_email(file_path, lemmatize=lemmatize, stem=stem, 
                                     nonascii=nonascii, file_type=file_type)

        (receiver, sender, cc, subject, body_text, bcc, date, doc_text) = ret
        
        writer.add_document(file_id=idx,
                            file_name=unicode(file_name),
                            file_path=unicode(file_path),
                            email_receiver=unicode(receiver),
                            email_sender=unicode(sender),
                            email_cc=unicode(cc),
                            email_subject=unicode(subject),
                            email_bcc=unicode(bcc),
                            date=unicode(date),
                            email_body=unicode(body_text),
                            all=unicode(doc_text))
 
    writer.commit()
    logging.info('All files are indexed.')
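
The resulting Whoosh index can be opened and queried with whoosh.qparser. Below is a minimal search sketch, not part of the original code, against the email_body field of the schema defined above; store_dir is the same directory passed to index_plain_text_emails.

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_email_bodies(store_dir, query_text, limit=10):
    # Open the existing index and search the analyzed email_body field.
    ix = open_dir(store_dir)
    with ix.searcher() as searcher:
        query = QueryParser('email_body', ix.schema).parse(unicode(query_text))
        results = searcher.search(query, limit=limit)
        for hit in results:
            print hit['file_name'], hit.score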