def test_message_cleanup():
    """Check that cleanup_subject strips leading reply/forward markers.

    Every entry pairs a raw subject with the text expected back from
    cleanup_subject. The cases cover "Re:", "FWD:", "Aw:" and "wg:"
    markers, with and without a space after the colon, stacked and in
    mixed case, including subjects with non-ASCII characters.
    """
    expectations = (
        ("Re: Birthday", "Birthday"),
        ("Re:Birthday", "Birthday"),
        ("Re:FWD: Birthday", "Birthday"),
        ("Re: RE: Alors, comment ça s'est passé ?",
         "Alors, comment ça s'est passé ?"),
        ("Re: FWD:FWD: Re:La chaise", "La chaise"),
        ("Aw: über cool", "über cool"),
        ("Aw:Re:wienerschnitzel", "wienerschnitzel"),
        ("Aw: wienerschnitzel", "wienerschnitzel"),
        ("aw: wg:wienerschnitzel", "wienerschnitzel"),
    )
    for raw_subject, expected_subject in expectations:
        assert cleanup_subject(raw_subject) == expected_subject
def fetch_corresponding_thread(db_session, namespace_id, message):
    """Fetch a thread matching the given message.

    Candidate threads are the ones in `namespace_id` whose cleaned subject
    equals the cleaned subject of `message`, queried in descending
    Thread.id order. A candidate is accepted when one of its messages
    either:

    * shares at least two non-BCC participant addresses with `message`, or
    * is, like `message`, a mail someone sent to themselves, from the same
      single address.

    A thread that already holds MAX_THREAD_LENGTH or more messages is
    never returned; the search moves on to the next candidate instead.

    Returns the matching thread, or None if there's no matching thread.
    """
    # FIXME: for performance reasons, we make the assumption that a reply
    # to a message always has a similar subject. This is only
    # right 95% of the time.
    clean_subject = cleanup_subject(message.subject)
    threads = db_session.query(Thread).filter(
        Thread.namespace_id == namespace_id,
        Thread._cleaned_subject == clean_subject). \
        order_by(desc(Thread.id))

    # A lot of people BCC some address when sending mass
    # emails so ignore BCC. The incoming message never changes, so its
    # participant set is built once instead of once per candidate.
    message_bcc = message.bcc_addr if message.bcc_addr else []
    message_participants_set = set(
        t[1] for t in message.participants if t not in message_bcc)

    for thread in safer_yield_per(threads, Thread.id, 0, 100):
        for match in thread.messages:
            match_bcc = match.bcc_addr if match.bcc_addr else []
            match_participants_set = set(
                t[1] for t in match.participants if t not in match_bcc)

            # A conversation takes place between two or more persons.
            # Are there more than two participants in common in this
            # thread? If yes, it's probably a related thread.
            if len(match_participants_set & message_participants_set) >= 2:
                # No need to loop through the rest of the messages
                # in the thread
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

            # handle the case where someone is self-sending an email.
            if not message.from_addr or not message.to_addr:
                return None

            # Address lists may be missing on the stored message; treat
            # that as "no addresses" rather than failing on iteration.
            match_from = [t[1] for t in (match.from_addr or [])]
            # This used to be built from match.from_addr, which made the
            # `match_from == match_to` test below always true.
            match_to = [t[1] for t in (match.to_addr or [])]
            message_from = [t[1] for t in message.from_addr]
            message_to = [t[1] for t in message.to_addr]

            if (len(message_to) == 1 and message_from == message_to and
                    match_from == match_to and message_to == match_from):
                # Check that we're not over max thread length in this case
                # No need to loop through the rest of the messages
                # in the thread.
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

    return None
def fetch_corresponding_thread(db_session, namespace_id, message):
    """Fetch a thread matching the given message.

    Candidate threads are the ones in `namespace_id` whose cleaned subject
    equals the cleaned subject of `message`, queried in descending
    Thread.id order with their messages loaded by the same query. A
    candidate is accepted when one of its messages either:

    * shares at least two non-BCC participant addresses with `message`, or
    * is, like `message`, a mail someone sent to themselves, from the same
      single address.

    A thread that already holds MAX_THREAD_LENGTH or more messages is
    never returned; the search moves on to the next candidate instead.

    Returns the matching thread, or None if there's no matching thread.
    """
    # FIXME: for performance reasons, we make the assumption that a reply
    # to a message always has a similar subject. This is only
    # right 95% of the time.
    clean_subject = cleanup_subject(message.subject)
    threads = db_session.query(Thread). \
        filter(Thread.namespace_id == namespace_id,
               Thread._cleaned_subject == clean_subject). \
        order_by(desc(Thread.id)). \
        options(joinedload(Thread.messages).load_only(
            'from_addr', 'to_addr', 'bcc_addr', 'cc_addr'))

    # A lot of people BCC some address when sending mass
    # emails so ignore BCC. The incoming message never changes, so its
    # participant set is built once instead of once per candidate.
    message_bcc = message.bcc_addr if message.bcc_addr else []
    message_participants_set = set(
        t[1] for t in message.participants if t not in message_bcc)

    for thread in threads:
        for match in thread.messages:
            match_bcc = match.bcc_addr if match.bcc_addr else []
            match_participants_set = set(
                t[1] for t in match.participants if t not in match_bcc)

            # A conversation takes place between two or more persons.
            # Are there more than two participants in common in this
            # thread? If yes, it's probably a related thread.
            if len(match_participants_set & message_participants_set) >= 2:
                # No need to loop through the rest of the messages
                # in the thread
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

            # handle the case where someone is self-sending an email.
            if not message.from_addr or not message.to_addr:
                return None

            # Address lists may be missing on the stored message; treat
            # that as "no addresses" rather than failing on iteration.
            match_from = [t[1] for t in (match.from_addr or [])]
            # This used to be built from match.from_addr, which made the
            # `match_from == match_to` test below always true.
            match_to = [t[1] for t in (match.to_addr or [])]
            message_from = [t[1] for t in message.from_addr]
            message_to = [t[1] for t in message.to_addr]

            if (len(message_to) == 1 and message_from == message_to and
                    match_from == match_to and message_to == match_from):
                # Check that we're not over max thread length in this case
                # No need to loop through the rest of the messages
                # in the thread.
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

    return None
def from_imap_message(cls, session, namespace, message):
    """Return the thread for `message`, building a new one when it has none.

    A message that *already* carries a thread keeps it and no new object
    is created. Otherwise a `cls` instance is built from the message's
    cleaned-up subject, its received date (used for both `recentdate` and
    `subjectdate`), its snippet and the given `namespace`.

    `session` is accepted but not used by this function.
    """
    if message.thread is None:
        init_kwargs = {
            'subject': cleanup_subject(message.subject),
            'recentdate': message.received_date,
            'namespace': namespace,
            'subjectdate': message.received_date,
            'snippet': message.snippet,
        }
        return cls(**init_kwargs)

    # Already threaded: hand back the existing association untouched.
    return message.thread
def compute_cleaned_up_subject(self, key, value):
    """Record the cleaned-up form of `value` and return `value` unchanged.

    The result of cleanup_subject(value) is stored on
    `self._cleaned_subject`. `key` is accepted but not used.
    """
    # NOTE(review): the (key, value) -> value shape looks like an attribute
    # validation hook -- confirm against the decorator at the call site.
    cleaned = cleanup_subject(value)
    self._cleaned_subject = cleaned
    return value
def test_message_cleanup():
    """Check that cleanup_subject strips prefixes and tidies whitespace.

    Every entry pairs a raw subject with the text expected back from
    cleanup_subject. The cases cover "Re:", "FWD:", "Aw:" and "wg:"
    markers (stacked, mixed case, with and without a trailing space), a
    subject containing a tab and a newline, and the "Undeliverable:" /
    "Undelivered:" prefixes.
    """
    expectations = (
        ("Re: Birthday", "Birthday"),
        ("Re:Birthday", "Birthday"),
        ("Re:FWD: Birthday", "Birthday"),
        ("RE:FWD: My\tBirthday\n Party", "My Birthday Party"),
        ("Re: RE: Alors, comment ça s'est passé ?",
         "Alors, comment ça s'est passé ?"),
        ("Re: FWD:FWD: Re:La chaise", "La chaise"),
        ("Aw: über cool", "über cool"),
        ("Aw:Re:wienerschnitzel", "wienerschnitzel"),
        ("Aw: wienerschnitzel", "wienerschnitzel"),
        ("aw: wg:wienerschnitzel", "wienerschnitzel"),
        ("Undeliverable: Message returned to sender",
         "Message returned to sender"),
        ("Undelivered: Message returned to sender",
         "Message returned to sender"),
    )
    for raw_subject, expected_subject in expectations:
        assert cleanup_subject(raw_subject) == expected_subject
def slurp_imap_namespace_gmail(imap, db, namespace=None, account=None):
    """Copy folder, message and thread metadata from a Gmail IMAP account.

    Steps, in order:

    1. List the folders and record their names, delimiters and flags,
       mark the ones flagged as not selectable, and register the special
       folders (INBOX plus the RFC 6154 special-use flags).
    2. For every selectable folder: open it read-only, store its
       UIDVALIDITY / UIDNEXT, list all UIDs and fetch message metadata in
       batches. A message is inserted only if no row with the same
       X-GM-MSGID exists yet; its labels and its folder membership are
       recorded either way.
    3. Build threads from the distinct X-GM-THRID values and derive the
       folder/thread relation.

    `imap` must offer list_folders(), select_folder(), search() and
    fetch(); `db` must offer execute()/executemany() and work as a context
    manager. `namespace` and `account` are accepted but not used.

    Raises AssertionError when the number of UIDs returned by the search
    differs from the EXISTS count reported for the folder.
    """
    # NOTE(review): `db` is presumably a sqlite3 connection whose rows can
    # be indexed by column name (see row['id'] below) -- confirm at caller.

    # folder attrs -> RFC 6154 Special-Use mailbox flags
    singleton_flags = {
        'all_folder': u'\\All',
        'archive_folder': u'\\Archive',
        'drafts_folder': u'\\Drafts',
        'starred_folder': u'\\Flagged',
        'spam_folder': u'\\Junk',
        'sent_folder': u'\\Sent',
        'trash_folder': u'\\Trash',
    }

    # List folders -- Returns sequence of (flags, delimiter, name)
    folders_fdn = imap.list_folders()
    with db:
        # Folder names & delimiters
        db.executemany(
            """
            INSERT INTO folders (
                folder_name, clean_folder_name, imap_delimiter
            ) VALUES (?, ?, ?)
            """,
            ((name, cleanup_folder_name(name), delimiter)
             for flags, delimiter, name in folders_fdn))

        # Folder flags
        db.executemany(
            """
            INSERT INTO folder_flags (folder_name, flag) VALUES (?, ?)
            """,
            ((name, flag)
             for flags, delimiter, name in folders_fdn
             for flag in flags))

        # Set imap_noselect = 1 on folders that have the \Noselect flag;
        # Set imap_noselect = 0 on folders that don't.
        # The backslash is escaped explicitly: a bare "\N" only survives in
        # Python 2 byte strings and is a malformed escape in unicode ones.
        db.execute("""
            UPDATE folders SET imap_noselect = (
                SELECT folder_flags.flag IS NOT NULL
                FROM folders AS a LEFT JOIN folder_flags ON (
                    a.folder_name = folder_flags.folder_name AND
                    folder_flags.flag = '\\Noselect'
                )
                WHERE folders.folder_name = a.folder_name
            )
            """)

        # Insert 'inbox_folder' -> 'INBOX' if there is an INBOX folder, which
        # there should always be, I think.
        db.execute(
            """
            INSERT INTO special_folders (attr_name, folder_name)
            SELECT ?, folder_name FROM folders WHERE folder_name = ?
            """,
            ['inbox_folder', 'INBOX'])

        # Insert other special folder names
        db.executemany(
            """
            INSERT INTO special_folders (attr_name, folder_name)
            SELECT ?, folder_name FROM folder_flags WHERE flag = ?
            """,
            singleton_flags.items())

    # Fetch all messages from each folder
    with db:
        folder_names = [
            row[0] for row in db.execute(
                "SELECT folder_name FROM folders WHERE NOT imap_noselect")
        ]

        for folder_name in folder_names:
            # EXAMINE the folder
            examine_response = imap.select_folder(folder_name, readonly=True)

            # Update imap_uidvalidity
            db.execute(
                """
                UPDATE folders
                SET imap_uidvalidity = ?, imap_uidnext = ?
                WHERE folder_name = ?
                """,
                [
                    examine_response[u'UIDVALIDITY'],
                    examine_response[u'UIDNEXT'],
                    folder_name
                ])

            # Get uids of the messages in the folder
            imap_uids = imap.search(u'ALL')

            # Result should match the stated number of messages in the folder.
            if len(imap_uids) != examine_response[u'EXISTS']:
                raise AssertionError("len(imap_uids)={0}, EXISTS={1!r}".format(
                    len(imap_uids), examine_response[u'EXISTS']))

            # Create folder_messages entries
            db.executemany(
                """
                INSERT INTO folder_messages (folder_name, imap_uid)
                VALUES (?, ?)
                """,
                ((folder_name, imap_uid) for imap_uid in imap_uids))

            ## Get the folder flags
            #folder_flags = set(row[0] for row in db.execute(
            #    "SELECT flag FROM folder_flags WHERE folder_name = ?",
            #    [folder_name]))
            #
            ## This is Gmail, so only actually fetch messages from the 'All
            ## Mail' and 'Trash' folders. This *should* give us all of the
            ## messages.
            #if not folder_flags & {u'\\All', u'\\Trash', u'\\Sent'}:
            #    continue

            # Get folder messages
            batch_size = 1000
            fetch_data = [
                'RFC822.SIZE', 'ENVELOPE', 'FLAGS', 'X-GM-MSGID',
                'X-GM-THRID', 'X-GM-LABELS', 'INTERNALDATE', 'RFC822.HEADER'
            ]
            for i in range(0, len(imap_uids), batch_size):
                imap_uids_batch = imap_uids[i:i + batch_size]

                # Fetch message info from the IMAP server
                fetch_response = imap.fetch(imap_uids_batch, fetch_data)

                # Fetch message info and insert it into the messages table.
                # A message seen before (same X-GM-MSGID) reuses its row.
                for uid, data in fetch_response.items():
                    headers = MimeHeaders.from_stream(
                        StringIO(data['RFC822.HEADER']))
                    msg_data = dict(
                        date=data['INTERNALDATE'],
                        subject=data['ENVELOPE'].subject,
                        in_reply_to=data['ENVELOPE'].in_reply_to,
                        size=data['RFC822.SIZE'],
                        message_id_header=data['ENVELOPE'].message_id,
                        x_gm_thrid=unicode(data['X-GM-THRID']),
                        x_gm_msgid=unicode(data['X-GM-MSGID']),
                        sender_addr=json.dumps(
                            parse_email_address_list(headers.get('Sender'))),
                        from_addr=json.dumps(
                            parse_email_address_list(headers.get('From'))),
                        reply_to_addr=json.dumps(
                            parse_email_address_list(headers.get('Reply-To'))),
                        to_addr=json.dumps(
                            parse_email_address_list(headers.get('To'))),
                        cc_addr=json.dumps(
                            parse_email_address_list(headers.get('Cc'))),
                        bcc_addr=json.dumps(
                            parse_email_address_list(headers.get('Bcc'))),
                    )
                    msg_data['clean_subject'] = \
                        cleanup_subject(parse_header_value(
                            'Subject', msg_data['subject']))

                    # Check if we've already stored the message
                    cur = db.execute(
                        """
                        SELECT id, x_gm_msgid FROM messages
                        WHERE x_gm_msgid = :x_gm_msgid
                        """,
                        msg_data)
                    # Only the first row (if any) matters, so there is no
                    # need to materialise the whole result set.
                    row = cur.fetchone()
                    message_id = row['id'] if row is not None else None

                    # If we've never stored the message, store it now.
                    if message_id is None:
                        cur = db.execute(
                            """
                            INSERT INTO messages (
                                date, subject, clean_subject,
                                in_reply_to, size, message_id_header,
                                x_gm_msgid, x_gm_thrid,
                                sender_addr, from_addr, reply_to_addr,
                                to_addr, cc_addr, bcc_addr
                            ) VALUES (
                                :date, :subject, :clean_subject,
                                :in_reply_to, :size, :message_id_header,
                                :x_gm_msgid, :x_gm_thrid,
                                :sender_addr, :from_addr, :reply_to_addr,
                                :to_addr, :cc_addr, :bcc_addr
                            )
                            """,
                            msg_data)
                        message_id = cur.lastrowid

                    # Store the Gmail labels (these can be different in
                    # different folders; e.g. messages in the 'Sent' folder are
                    # missing the u'\\Sent' label)
                    db.executemany(
                        """
                        INSERT INTO folder_message_gm_labels
                            (folder_name, message_id, label)
                        VALUES (?, ?, ?)
                        """,
                        ((folder_name, message_id, label)
                         for label in data['X-GM-LABELS']))

                    # Mark the message as being in the current folder.
                    db.execute(
                        """
                        UPDATE folder_messages
                        SET message_id = ?
                        WHERE folder_name = ? AND imap_uid = ?
                        """,
                        (message_id, folder_name, uid))

        # Construct threads (assuming gmail for now)
        db.execute("""
            INSERT INTO threads (x_gm_thrid)
            SELECT DISTINCT x_gm_thrid FROM messages
            """)
        db.execute("""
            INSERT INTO thread_messages (thread_id, message_id)
            SELECT threads.id, messages.id
            FROM threads, messages
            WHERE threads.x_gm_thrid = messages.x_gm_thrid
            """)

        # Construct folder_threads
        db.execute("""
            INSERT INTO folder_threads (folder_name, thread_id)
            SELECT DISTINCT
                folder_messages.folder_name, thread_messages.thread_id
            FROM folder_messages
            LEFT JOIN thread_messages USING (message_id)
            """)
def slurp_imap_namespace_gmail(imap, db, namespace=None, account=None):
    """Copy folder, message and thread metadata from a Gmail IMAP account.

    Steps, in order:

    1. List the folders and record their names, delimiters and flags,
       mark the ones flagged as not selectable, and register the special
       folders (INBOX plus the RFC 6154 special-use flags).
    2. For every selectable folder: open it read-only, store its
       UIDVALIDITY / UIDNEXT, list all UIDs and fetch message metadata in
       batches. A message is inserted only if no row with the same
       X-GM-MSGID exists yet; its labels and its folder membership are
       recorded either way.
    3. Build threads from the distinct X-GM-THRID values and derive the
       folder/thread relation.

    `imap` must offer list_folders(), select_folder(), search() and
    fetch(); `db` must offer execute()/executemany() and work as a context
    manager. `namespace` and `account` are accepted but not used.

    Raises AssertionError when the number of UIDs returned by the search
    differs from the EXISTS count reported for the folder.
    """
    # NOTE(review): `db` is presumably a sqlite3 connection whose rows can
    # be indexed by column name (see row['id'] below) -- confirm at caller.

    # folder attrs -> RFC 6154 Special-Use mailbox flags
    singleton_flags = {
        'all_folder': u'\\All',
        'archive_folder': u'\\Archive',
        'drafts_folder': u'\\Drafts',
        'starred_folder': u'\\Flagged',
        'spam_folder': u'\\Junk',
        'sent_folder': u'\\Sent',
        'trash_folder': u'\\Trash',
    }

    # List folders -- Returns sequence of (flags, delimiter, name)
    folders_fdn = imap.list_folders()
    with db:
        # Folder names & delimiters
        db.executemany("""
            INSERT INTO folders (
                folder_name, clean_folder_name, imap_delimiter
            ) VALUES (?, ?, ?)
            """, ((name, cleanup_folder_name(name), delimiter)
                  for flags, delimiter, name in folders_fdn))

        # Folder flags
        db.executemany("""
            INSERT INTO folder_flags (folder_name, flag) VALUES (?, ?)
            """, ((name, flag)
                  for flags, delimiter, name in folders_fdn
                  for flag in flags))

        # Set imap_noselect = 1 on folders that have the \Noselect flag;
        # Set imap_noselect = 0 on folders that don't.
        # The backslash is escaped explicitly: a bare "\N" only survives in
        # Python 2 byte strings and is a malformed escape in unicode ones.
        db.execute("""
            UPDATE folders SET imap_noselect = (
                SELECT folder_flags.flag IS NOT NULL
                FROM folders AS a LEFT JOIN folder_flags ON (
                    a.folder_name = folder_flags.folder_name AND
                    folder_flags.flag = '\\Noselect'
                )
                WHERE folders.folder_name = a.folder_name
            )
            """)

        # Insert 'inbox_folder' -> 'INBOX' if there is an INBOX folder, which
        # there should always be, I think.
        db.execute("""
            INSERT INTO special_folders (attr_name, folder_name)
            SELECT ?, folder_name FROM folders WHERE folder_name = ?
            """, ['inbox_folder', 'INBOX'])

        # Insert other special folder names
        db.executemany("""
            INSERT INTO special_folders (attr_name, folder_name)
            SELECT ?, folder_name FROM folder_flags WHERE flag = ?
            """, singleton_flags.items())

    # Fetch all messages from each folder
    with db:
        folder_names = [row[0] for row in db.execute(
            "SELECT folder_name FROM folders WHERE NOT imap_noselect")]

        for folder_name in folder_names:
            # EXAMINE the folder
            examine_response = imap.select_folder(folder_name, readonly=True)

            # Update imap_uidvalidity
            db.execute("""
                UPDATE folders
                SET imap_uidvalidity = ?, imap_uidnext = ?
                WHERE folder_name = ?
                """, [examine_response[u'UIDVALIDITY'],
                      examine_response[u'UIDNEXT'],
                      folder_name])

            # Get uids of the messages in the folder
            imap_uids = imap.search(u'ALL')

            # Result should match the stated number of messages in the folder.
            if len(imap_uids) != examine_response[u'EXISTS']:
                raise AssertionError("len(imap_uids)={0}, EXISTS={1!r}".format(
                    len(imap_uids), examine_response[u'EXISTS']))

            # Create folder_messages entries
            db.executemany("""
                INSERT INTO folder_messages (folder_name, imap_uid)
                VALUES (?, ?)
                """, ((folder_name, imap_uid) for imap_uid in imap_uids))

            ## Get the folder flags
            #folder_flags = set(row[0] for row in db.execute(
            #    "SELECT flag FROM folder_flags WHERE folder_name = ?",
            #    [folder_name]))
            #
            ## This is Gmail, so only actually fetch messages from the 'All
            ## Mail' and 'Trash' folders. This *should* give us all of the
            ## messages.
            #if not folder_flags & {u'\\All', u'\\Trash', u'\\Sent'}:
            #    continue

            # Get folder messages
            batch_size = 1000
            fetch_data = ['RFC822.SIZE', 'ENVELOPE', 'FLAGS',
                          'X-GM-MSGID', 'X-GM-THRID', 'X-GM-LABELS',
                          'INTERNALDATE', 'RFC822.HEADER']
            for i in range(0, len(imap_uids), batch_size):
                imap_uids_batch = imap_uids[i:i+batch_size]

                # Fetch message info from the IMAP server
                fetch_response = imap.fetch(imap_uids_batch, fetch_data)

                # Fetch message info and insert it into the messages table.
                # A message seen before (same X-GM-MSGID) reuses its row.
                for uid, data in fetch_response.items():
                    headers = MimeHeaders.from_stream(
                        StringIO(data['RFC822.HEADER']))

                    def header_addresses(header_name):
                        # JSON-encode the parsed address list of one header.
                        return json.dumps(
                            parse_email_address_list(
                                headers.get(header_name)))

                    msg_data = dict(
                        date=data['INTERNALDATE'],
                        subject=data['ENVELOPE'].subject,
                        in_reply_to=data['ENVELOPE'].in_reply_to,
                        size=data['RFC822.SIZE'],
                        message_id_header=data['ENVELOPE'].message_id,
                        x_gm_thrid=unicode(data['X-GM-THRID']),
                        x_gm_msgid=unicode(data['X-GM-MSGID']),
                        sender_addr=header_addresses('Sender'),
                        from_addr=header_addresses('From'),
                        reply_to_addr=header_addresses('Reply-To'),
                        to_addr=header_addresses('To'),
                        cc_addr=header_addresses('Cc'),
                        bcc_addr=header_addresses('Bcc'),
                    )
                    msg_data['clean_subject'] = \
                        cleanup_subject(parse_header_value(
                            'Subject', msg_data['subject']))

                    # Check if we've already stored the message
                    cur = db.execute("""
                        SELECT id, x_gm_msgid FROM messages
                        WHERE x_gm_msgid = :x_gm_msgid
                        """, msg_data)
                    # Only the first row (if any) matters, so there is no
                    # need to materialise the whole result set.
                    row = cur.fetchone()
                    message_id = row['id'] if row is not None else None

                    # If we've never stored the message, store it now.
                    if message_id is None:
                        cur = db.execute("""
                            INSERT INTO messages (
                                date, subject, clean_subject,
                                in_reply_to, size, message_id_header,
                                x_gm_msgid, x_gm_thrid,
                                sender_addr, from_addr, reply_to_addr,
                                to_addr, cc_addr, bcc_addr
                            ) VALUES (
                                :date, :subject, :clean_subject,
                                :in_reply_to, :size, :message_id_header,
                                :x_gm_msgid, :x_gm_thrid,
                                :sender_addr, :from_addr, :reply_to_addr,
                                :to_addr, :cc_addr, :bcc_addr
                            )
                            """, msg_data)
                        message_id = cur.lastrowid

                    # Store the Gmail labels (these can be different in
                    # different folders; e.g. messages in the 'Sent' folder are
                    # missing the u'\\Sent' label)
                    db.executemany("""
                        INSERT INTO folder_message_gm_labels
                            (folder_name, message_id, label)
                        VALUES (?, ?, ?)
                        """, ((folder_name, message_id, label)
                              for label in data['X-GM-LABELS']))

                    # Mark the message as being in the current folder.
                    db.execute("""
                        UPDATE folder_messages
                        SET message_id = ?
                        WHERE folder_name = ? AND imap_uid = ?
                        """, (message_id, folder_name, uid))

        # Construct threads (assuming gmail for now)
        db.execute("""
            INSERT INTO threads (x_gm_thrid)
            SELECT DISTINCT x_gm_thrid FROM messages
            """)
        db.execute("""
            INSERT INTO thread_messages (thread_id, message_id)
            SELECT threads.id, messages.id
            FROM threads, messages
            WHERE threads.x_gm_thrid = messages.x_gm_thrid
            """)

        # Construct folder_threads
        db.execute("""
            INSERT INTO folder_threads (folder_name, thread_id)
            SELECT DISTINCT
                folder_messages.folder_name, thread_messages.thread_id
            FROM folder_messages
            LEFT JOIN thread_messages USING (message_id)
            """)
def test_message_cleanup():
    """Check that cleanup_subject strips prefixes and tidies whitespace.

    The table maps each raw subject to the text cleanup_subject is
    expected to return: reply/forward markers ("Re:", "FWD:", "Aw:",
    "wg:") in several spellings, a subject with an embedded tab and
    newline, and the "Undeliverable:" / "Undelivered:" prefixes.
    """
    cases = [
        ("Re: Birthday", "Birthday"),
        ("Re:Birthday", "Birthday"),
        ("Re:FWD: Birthday", "Birthday"),
        ("RE:FWD: My\tBirthday\n Party", "My Birthday Party"),
        ("Re: RE: Alors, comment ça s'est passé ?",
         "Alors, comment ça s'est passé ?"),
        ("Re: FWD:FWD: Re:La chaise", "La chaise"),
        ("Aw: über cool", "über cool"),
        ("Aw:Re:wienerschnitzel", "wienerschnitzel"),
        ("Aw: wienerschnitzel", "wienerschnitzel"),
        ("aw: wg:wienerschnitzel", "wienerschnitzel"),
        ("Undeliverable: Message returned to sender",
         "Message returned to sender"),
        ("Undelivered: Message returned to sender",
         "Message returned to sender"),
    ]
    for given, wanted in cases:
        assert cleanup_subject(given) == wanted
def fetch_corresponding_thread(db_session, namespace_id, message):
    """Fetch a thread matching the given message.

    Candidate threads are the ones in `namespace_id` whose cleaned subject
    equals the cleaned subject of `message`, queried in descending
    Thread.id order. Within a candidate, messages are visited by
    ascending `received_date`. A candidate is accepted when one of its
    messages either:

    * shares at least two non-BCC participant addresses with `message`
      (compared case-insensitively), or
    * is, like `message`, a mail someone sent to themselves, from the same
      single address.

    A thread whose loaded messages number MAX_THREAD_LENGTH or more is
    never returned; the search moves on to the next candidate instead.

    Returns the matching thread, or None if there's no matching thread
    or if `message` lacks a sender or a recipient.
    """
    # A message without sender or recipient is never matched to a thread.
    if not message.from_addr or not message.to_addr:
        return None

    message_from = [t[1] for t in message.from_addr]
    message_to = [t[1] for t in message.to_addr]

    # FIXME: for performance reasons, we make the assumption that a reply
    # to a message always has a similar subject. This is only
    # right 95% of the time.
    clean_subject = cleanup_subject(message.subject)

    # XXX: It is much faster to sort client-side by message date. We therefore
    # use `contains_eager` and `outerjoin` to fetch the messages by thread in
    # no particular order (as opposed to `joinedload`, which would use the
    # order_by on the Message._thread backref). We also use a limit to avoid
    # scanning too many / large threads.
    threads = (
        db_session.query(Thread)
        .filter(
            Thread.namespace_id == namespace_id,
            Thread._cleaned_subject == clean_subject,
        )
        .outerjoin(Message, Thread.messages)
        .order_by(desc(Thread.id))
        .options(
            load_only("id", "discriminator"),
            contains_eager(Thread.messages).load_only(
                "from_addr", "to_addr", "bcc_addr", "cc_addr",
                "received_date"),
        )
        .limit(MAX_MESSAGES_SCANNED)
    )

    # A lot of people BCC some address when sending mass
    # emails so ignore BCC. The incoming message never changes, so its
    # participant set is built once instead of once per candidate.
    message_bcc = message.bcc_addr if message.bcc_addr else []
    message_emails = set(
        t[1].lower() for t in message.participants if t not in message_bcc)

    for thread in threads:
        messages = sorted(thread.messages, key=attrgetter("received_date"))
        for match in messages:
            match_bcc = match.bcc_addr if match.bcc_addr else []
            match_emails = set(
                t[1].lower() for t in match.participants
                if t not in match_bcc)

            # A conversation takes place between two or more persons.
            # Are there more than two participants in common in this
            # thread? If yes, it's probably a related thread.
            if len(match_emails & message_emails) >= 2:
                # No need to loop through the rest of the messages
                # in the thread
                if len(messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

            # handle the case where someone is self-sending an email.
            # Address lists may be missing on the stored message; treat
            # that as "no addresses" rather than failing on iteration.
            match_from = [t[1] for t in (match.from_addr or [])]
            # This used to be built from match.from_addr, which made the
            # `match_from == match_to` test below always true.
            match_to = [t[1] for t in (match.to_addr or [])]

            if (len(message_to) == 1 and message_from == message_to and
                    match_from == match_to and message_to == match_from):
                # Check that we're not over max thread length in this case
                # No need to loop through the rest of the messages
                # in the thread.
                if len(messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

    return None