class MailIndex:
    """This is a lazily parsing object representing a mailpile index.

    Every message is kept in ``INDEX`` as a single UTF-8 encoded,
    tab-separated line (see ``l2m`` / ``m2l``).  The ``MSG_*`` constants
    are the column numbers of the individual fields within such a line.
    """

    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_SUBJECT = 6
    MSG_SNIPPET = 7
    MSG_TAGS = 8
    MSG_REPLIES = 9
    MSG_CONV_MID = 10

    def __init__(self, config):
        """Create an empty index bound to `config`; nothing is read yet."""
        self.config = config
        self.STATS = {}
        self.INDEX = []          # Raw metadata lines, by index position
        self.INDEX_SORT = {}
        self.INDEX_CONV = []
        self.PTRS = {}           # Message pointer -> index position
        self.MSGIDS = {}         # Message ID -> index position
        self.EMAILS = []         # E-mail addresses, by e-mail position
        self.EMAIL_IDS = {}      # Lower-cased address -> e-mail position
        self.CACHE = {}
        self.MODIFIED = set()    # Index positions save_changes() must write
        self.EMAILS_SAVED = 0    # How many EMAILS entries are on disk

    def l2m(self, line):
        """Split an encoded index line into a list of unicode fields."""
        return line.decode('utf-8').split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    #
    # Built with update() rather than dict(..., **{...}): passing integer
    # keys as keyword arguments is a CPython 2 accident and a TypeError on
    # other interpreters.
    NORM_TABLE = dict((i, None) for i in range(0, 0x20))
    NORM_TABLE.update({
        ord(u'\t'): ord(u' '),
        ord(u'\r'): ord(u' '),
        ord(u'\n'): ord(u' '),
        0x7F: None,
    })

    def m2l(self, message):
        """Join the fields of `message` into one encoded index line."""
        # Normalize the message before saving it so we can be sure that we will
        # be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """Reset the in-memory state and read the metadata index from disk.

        Lines starting with '#' are ignored, lines starting with '@' are
        e-mail address records and all other non-empty lines are message
        metadata.  Lines that fail to parse (ValueError) are skipped.  A
        missing or unreadable index file only produces a warning.
        """
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, address = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    address = unquote(address)
                    self.EMAILS[pos] = address
                    self.EMAIL_IDS[address.lower()] = pos
                elif line:
                    words = line.split('\t')
                    # FIXME: Delete this old crap.
                    if len(words) == 10:
                        # This is an old index file, reorder it!
                        pos, p, unused, msgid, d, f, s, t, r, c = words
                        ptrs = ','.join(['0' + ptr for ptr in p.split(',')])
                        line = '\t'.join(
                            [pos, ptrs, msgid, d, f, '', s, '', t, r, c])
                    else:
                        pos, ptrs, msgid = words[:3]
                    pos = int(pos, 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')
                    self.INDEX[pos] = line
                    self.MSGIDS[msgid] = pos
                    for msg_ptr in ptrs.split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Deliberate best-effort: a malformed line is dropped.
                pass

        if session:
            session.ui.mark('Loading metadata index...')
        try:
            fd = open(self.config.mailindex_file(), 'r')
            try:
                for line in fd:
                    if line.startswith(GPG_BEGIN_MESSAGE):
                        # decrypt_gpg() is handed the open file as well as
                        # the marker line that was already read from it.
                        for clear_line in decrypt_gpg([line], fd):
                            process_line(clear_line)
                    else:
                        process_line(line)
            finally:
                # Close the file even if reading or decrypting fails.
                fd.close()
        except IOError:
            if session:
                session.ui.warning(('Metadata index not found: %s')
                                   % self.config.mailindex_file())

        self.cache_sort_orders(session)
        if session:
            session.ui.mark('Loaded metadata, %d messages' % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def save_changes(self, session=None):
        """Append new e-mail records and modified lines to the index file.

        Does nothing unless there are modified index positions or e-mail
        addresses that have not been written yet.
        """
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods or len(self.EMAILS) > self.EMAILS_SAVED:
            if session:
                session.ui.mark("Saving metadata index changes...")
            fd = gpg_open(self.config.mailindex_file(),
                          self.config.prefs.gpg_recipient, 'a')
            try:
                for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
                    fd.write('@%s\t%s\n' % (b36(eid),
                                            quote(self.EMAILS[eid])))
                for pos in mods:
                    fd.write(self.INDEX[pos] + '\n')
            finally:
                # Do not leak the handle if a write fails.
                fd.close()
            flush_append_cache()
            if session:
                session.ui.mark("Saved metadata index changes")
            self.EMAILS_SAVED = len(self.EMAILS)

    def save(self, session=None):
        """Rewrite the whole index file from the in-memory state."""
        self.MODIFIED = set()
        if session:
            session.ui.mark("Saving metadata index...")
        fd = gpg_open(self.config.mailindex_file(),
                      self.config.prefs.gpg_recipient, 'w')
        try:
            fd.write('# This is the mailpile.py index file.\n')
            fd.write('# We have %d messages!\n' % len(self.INDEX))
            for eid in range(0, len(self.EMAILS)):
                fd.write('@%s\t%s\n' % (b36(eid), quote(self.EMAILS[eid])))
            for item in self.INDEX:
                fd.write(item + '\n')
        finally:
            # Do not leak the handle if a write fails.
            fd.close()
        flush_append_cache()
        if session:
            session.ui.mark("Saved metadata index")

    def update_ptrs_and_msgids(self, session):
        """Rebuild the MSGIDS and PTRS lookup tables from INDEX.

        Lines with too few fields are reported through session.ui.warning
        and otherwise ignored.
        """
        session.ui.mark('Updating high level indexes')
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) > self.MSG_CONV_MID:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # Report the offending raw line itself.
                session.ui.warning('Bogus line: %s' % self.INDEX[offset])

    def try_decode(self, text, charset):
        """Decode `text`, trying `charset`, then iso-8859-1, then utf-8.

        If every attempt fails, the characters with ordinals below 128 are
        returned and everything else is dropped.
        """
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Return header `name` of `msg` decoded to one line of text.

        If `value` is given it is decoded instead of the header found in
        `msg`.  CR, LF and TAB are replaced with spaces.  Returns '' if the
        header cannot be parsed.
        """
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            decoded = email.header.decode_header(
                (value or '').replace('"', ''))
            text = ' '.join([self.try_decode(t[0], t[1]) for t in decoded])
            return (text.replace('\r', ' ')
                        .replace('\t', ' ')
                        .replace('\n', ' '))
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record that the message at `msg_idx_pos` was seen at `msg_ptr`."""
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        # If message was seen in this mailbox before, update the location
        for i, known_ptr in enumerate(msg_ptrs):
            if known_ptr[:MBX_ID_LEN] == msg_ptr[:MBX_ID_LEN]:
                msg_ptrs[i] = msg_ptr
                break
        else:
            # Otherwise, this is a new mailbox, record this sighting as well!
            if msg_ptr:
                msg_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp.

        Returns None if the header cannot be parsed, or if the timestamp
        is more than a day in the future or smaller than 1.
        """
        try:
            if ';' in date_hdr:
                # Only the part after the last semicolon is parsed.
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers.

        The Date: header is used if it is within 31 days of the median of
        all parseable dates; otherwise the median is returned.  If nothing
        can be parsed, `last_date + 1` is returned.
        """
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])

        if nz_dates:
            # Floor division: the result is used as a list index.
            median = nz_dates[len(nz_dates) // 2]
            if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
                return msg_ts
            else:
                session.ui.warning(
                    ('=%s/%s using Recieved: instead of Date:')
                    % (msg_mid, msg_id))
                return median
        else:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning('=%s/%s has a bogus date' % (msg_mid, msg_id))
            return last_date + 1

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Index the unparsed messages of one mailbox.

        Returns the number of messages that were added to the index or had
        their location updated.  Returns 0 if the mailbox is editable,
        cannot be opened or has no unparsed messages.
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark('%s: Skipped: %s'
                                % (mailbox_idx, mailbox_fn))
                return 0
            else:
                session.ui.mark('%s: Checking: %s'
                                % (mailbox_idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark(('%s: Error opening: %s (%s)')
                            % (mailbox_idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if not self.PTRS:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)'
                            ) % (mailbox_idx, 100 * ui // len(unparsed),
                                 ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed: only report progress now and then.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                continue
            else:
                session.ui.mark(parse_status)

            # Message new or modified, let's parse it.
            msg = ParseMessage(mbox.get_file(i), pgpmime=False)
            msg_id = b64c(
                sha1b64((self.hdr(msg, 'message-id') or msg_ptr).strip()))
            if msg_id in self.MSGIDS:
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))

                msg_ts = self._extract_date_ts(session, msg_mid, msg_id,
                                               msg, msg_ts)

                keywords, snippet = self.index_message(
                    session, msg_mid, msg_id, msg, msg_ts,
                    mailbox=mailbox_idx,
                    compact=False,
                    filter_hooks=[self.filter_keywords])

                msg_subject = self.hdr(msg, 'subject')
                msg_snippet = snippet[:max(0,
                                           snippet_max - len(msg_subject))]

                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':tag')]

                msg_to = (ExtractEmails(self.hdr(msg, 'to')) +
                          ExtractEmails(self.hdr(msg, 'cc')) +
                          ExtractEmails(self.hdr(msg, 'bcc')))

                msg_idx_pos, msg_info = self.add_new_msg(
                    msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'), msg_to,
                    msg_subject, msg_snippet, tags)
                self.set_conversation_ids(msg_info[self.MSG_MID], msg)
                mbox.mark_parsed(i)

                added += 1
                if (added % 1000) == 0:
                    GlobalPostingList.Optimize(session, self, quick=True)

        if added:
            mbox.save(session)
        session.ui.mark('%s: Indexed mailbox: %s'
                        % (mailbox_idx, mailbox_fn))
        return added
class MailIndex(object):
    """This is a lazily parsing object representing a mailpile index.

    Every message is kept in ``INDEX`` as a single UTF-8 encoded,
    tab-separated line (see ``l2m`` / ``m2l``).  The ``MSG_*`` constants
    are the column numbers of the individual fields within such a line.
    """

    MSG_IDX = 0
    MSG_PTRS = 1
    MSG_UNUSED = 2  # Was size, now reserved for other fun things
    MSG_ID = 3
    MSG_DATE = 4
    MSG_FROM = 5
    MSG_SUBJECT = 6
    MSG_TAGS = 7
    MSG_REPLIES = 8
    MSG_CONV_ID = 9

    def __init__(self, config):
        """Create an empty index bound to `config`; nothing is read yet."""
        self.config = config
        self.STATS = {}
        self.INDEX = []        # Raw metadata lines, by index position
        self.PTRS = {}         # Message pointer -> index position
        self.MSGIDS = {}       # Message ID -> index position
        self.CACHE = {}
        self.MODIFIED = set()  # Index positions save_changes() must write

    def l2m(self, line):
        """Split an encoded index line into a list of unicode fields."""
        return line.decode('utf-8').split(u'\t')

    def m2l(self, message):
        """Join the fields of `message` into one encoded index line."""
        return (u'\t'.join([unicode(p) for p in message])).encode('utf-8')

    def load(self, session=None):
        """Reset INDEX, PTRS and MSGIDS and read the index file from disk.

        Empty lines and lines starting with '#' are ignored.  Lines that
        fail to parse (ValueError) are skipped.  A missing or unreadable
        index file only produces a warning.
        """
        self.INDEX = []
        self.PTRS = {}
        self.MSGIDS = {}

        def process_line(line):
            try:
                line = line.strip()
                if line and not line.startswith('#'):
                    pos, ptrs, junk, msgid, rest = line.split('\t', 4)
                    pos = int(pos, 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')
                    self.INDEX[pos] = line
                    self.MSGIDS[msgid] = pos
                    # The pointer field is a comma separated list, the same
                    # format update_ptrs_and_msgids() splits; iterating the
                    # string itself would register single characters.
                    for msg_ptr in ptrs.split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Deliberate best-effort: a malformed line is dropped.
                pass

        if session:
            session.ui.mark('Loading metadata index...')
        try:
            fd = open(self.config.mailindex_file(), 'r')
            try:
                for line in fd:
                    if line.startswith(GPG_BEGIN_MESSAGE):
                        # decrypt_gpg() is handed the open file as well as
                        # the marker line that was already read from it.
                        for clear_line in decrypt_gpg([line], fd):
                            process_line(clear_line)
                    else:
                        process_line(line)
            finally:
                # Close the file even if reading or decrypting fails.
                fd.close()
        except IOError:
            if session:
                session.ui.warning(('Metadata index not found: %s')
                                   % self.config.mailindex_file())
        if session:
            session.ui.mark('Loaded metadata for %d messages'
                            % len(self.INDEX))

    def save_changes(self, session=None):
        """Append the modified index lines to the index file, if any."""
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods:
            if session:
                session.ui.mark("Saving metadata index changes...")
            fd = gpg_open(self.config.mailindex_file(),
                          self.config.get('gpg_recipient'), 'a')
            try:
                for pos in mods:
                    fd.write(self.INDEX[pos] + '\n')
            finally:
                # Do not leak the handle if a write fails.
                fd.close()
            flush_append_cache()
            if session:
                session.ui.mark("Saved metadata index changes")

    def save(self, session=None):
        """Rewrite the whole index file from the in-memory state."""
        self.MODIFIED = set()
        if session:
            session.ui.mark("Saving metadata index...")
        fd = gpg_open(self.config.mailindex_file(),
                      self.config.get('gpg_recipient'), 'w')
        try:
            fd.write('# This is the mailpile.py index file.\n')
            fd.write('# We have %d messages!\n' % len(self.INDEX))
            for item in self.INDEX:
                fd.write(item + '\n')
        finally:
            # Do not leak the handle if a write fails.
            fd.close()
        flush_append_cache()
        if session:
            session.ui.mark("Saved metadata index")

    def update_ptrs_and_msgids(self, session):
        """Rebuild the MSGIDS and PTRS lookup tables from INDEX.

        Lines with too few fields are reported through session.ui.warning
        and otherwise ignored.
        """
        session.ui.mark('Updating high level indexes')
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) > self.MSG_CONV_ID:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # Report the offending raw line itself.
                session.ui.warning('Bogus line: %s' % self.INDEX[offset])

    def try_decode(self, text, charset):
        """Decode `text`, trying `charset`, then iso-8859-1, then utf-8.

        If every attempt fails, the characters with ordinals below 128 are
        returned and everything else is dropped.
        """
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Return header `name` of `msg` decoded to one line of text.

        If `value` is given it is decoded instead of the header found in
        `msg`.  CR, LF and TAB are replaced with spaces.  Returns '' if the
        header cannot be parsed.
        """
        try:
            if value is None and msg:
                value = msg[name]
            decoded = email.header.decode_header(value or '')
            text = ' '.join([self.try_decode(t[0], t[1]) for t in decoded])
            return (text.replace('\r', ' ')
                        .replace('\t', ' ')
                        .replace('\n', ' '))
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx, msg_ptr):
        """Record that the message at `msg_idx` was seen at `msg_ptr`."""
        msg_info = self.get_msg_by_idx(msg_idx)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx

        # If message was seen in this mailbox before, update the location
        for i, known_ptr in enumerate(msg_ptrs):
            if known_ptr[:3] == msg_ptr[:3]:
                msg_ptrs[i] = msg_ptr
                break
        else:
            # Otherwise, this is a new mailbox, record this sighting as well!
            if msg_ptr:
                msg_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_by_idx(msg_idx, msg_info)

    def scan_mailbox(self, session, idx, mailbox_fn, mailbox_opener):
        """Index the unparsed messages of one mailbox.

        Returns the number of messages that were added to the index or had
        their location updated.  Returns 0 if the mailbox is editable,
        cannot be opened or has no unparsed messages.
        """
        try:
            mbox = mailbox_opener(session, idx)
            if mbox.editable:
                session.ui.mark('%s: Skipped: %s' % (idx, mailbox_fn))
                return 0
            else:
                session.ui.mark('%s: Checking: %s' % (idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark('%s: Error opening: %s (%s)'
                            % (idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if not self.PTRS:
            self.update_ptrs_and_msgids(session)

        added = 0
        msg_date = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)'
                            ) % (idx, 100 * ui // len(unparsed),
                                 ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed: only report progress now and then.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                continue
            else:
                session.ui.mark(parse_status)

            # Message new or modified, let's parse it.
            msg = ParseMessage(mbox.get_file(i), pgpmime=False)
            msg_id = b64c(
                sha1b64((self.hdr(msg, 'message-id') or msg_ptr).strip()))
            if msg_id in self.MSGIDS:
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))

                try:
                    last_date = msg_date
                    msg_date = int(rfc822.mktime_tz(
                        rfc822.parsedate_tz(self.hdr(msg, 'date'))))
                    if msg_date > (time.time() + 24 * 3600):
                        session.ui.warning('=%s/%s is from the FUTURE!'
                                           % (msg_mid, msg_id))
                        # Messages from the future are treated as today's
                        msg_date = last_date + 1
                    elif msg_date < 1:
                        session.ui.warning('=%s/%s is PREHISTORIC!'
                                           % (msg_mid, msg_id))
                        msg_date = last_date + 1
                except (ValueError, TypeError, OverflowError):
                    session.ui.warning('=%s/%s has a bogus date.'
                                       % (msg_mid, msg_id))
                    # This is a hack: We assume the messages in the mailbox
                    # are in chronological order and just add 1 second to
                    # the date of the last message. This should be a
                    # better-than-nothing guess.
                    msg_date += 1

                keywords = self.index_message(
                    session, msg_mid, msg_id, msg, msg_date,
                    mailbox=idx,
                    compact=False,
                    filter_hooks=[self.filter_keywords])
                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':tag')]

                msg_idx = len(self.INDEX)
                self.set_msg_by_idx(msg_idx, [
                    msg_mid,                   # Our index ID
                    msg_ptr,                   # Location on disk
                    '',                        # UNUSED
                    msg_id,                    # Message-ID
                    b36(msg_date),             # Date as a UTC timestamp
                    self.hdr(msg, 'from'),     # From:
                    self.hdr(msg, 'subject'),  # Subject
                    ','.join(tags),            # Initial tags
                    '',                        # No replies for now
                    '',                        # Conversation ID
                ])
                self.set_conversation_ids(msg_mid, msg)

                added += 1
                if (added % 1000) == 0:
                    GlobalPostingList.Optimize(session, self, quick=True)

        if added:
            mbox.mark_parsed(i)
            mbox.save(session)
        session.ui.mark('%s: Indexed mailbox: %s' % (idx, mailbox_fn))
        return added
class MailIndex(object):
    """This is a lazily parsing object representing a mailpile index.

    Every message is kept in ``INDEX`` as a single UTF-8 encoded,
    tab-separated line (see ``l2m`` / ``m2l``).  The ``MSG_*`` constants
    are the column numbers of the individual fields within such a line.
    """

    MSG_IDX = 0
    MSG_PTRS = 1
    MSG_UNUSED = 2  # Was size, now reserved for other fun things
    MSG_ID = 3
    MSG_DATE = 4
    MSG_FROM = 5
    MSG_SUBJECT = 6
    MSG_TAGS = 7
    MSG_REPLIES = 8
    MSG_CONV_ID = 9

    def __init__(self, config):
        """Create an empty index bound to `config`; nothing is read yet."""
        self.config = config
        self.STATS = {}
        self.INDEX = []        # Raw metadata lines, by index position
        self.PTRS = {}         # Message pointer -> index position
        self.MSGIDS = {}       # Message ID -> index position
        self.CACHE = {}
        self.MODIFIED = set()  # Index positions save_changes() must write

    def l2m(self, line):
        """Split an encoded index line into a list of unicode fields."""
        return line.decode('utf-8').split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    #
    # Built with update() rather than dict(..., **{...}): passing integer
    # keys as keyword arguments is a CPython 2 accident and a TypeError on
    # other interpreters.
    NORM_TABLE = dict((i, None) for i in range(0, 0x20))
    NORM_TABLE.update({
        ord(u'\t'): ord(u' '),
        ord(u'\r'): ord(u' '),
        ord(u'\n'): ord(u' '),
        0x7F: None,
    })

    def m2l(self, message):
        """Join the fields of `message` into one encoded index line."""
        # Normalize the message before saving it so we can be sure that we
        # will be able to read it back later.
        parts = [unicode(p).translate(MailIndex.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """Reset INDEX, PTRS and MSGIDS and read the index file from disk.

        Empty lines and lines starting with '#' are ignored.  Lines that
        fail to parse (ValueError) are skipped.  A missing or unreadable
        index file only produces a warning.
        """
        self.INDEX = []
        self.PTRS = {}
        self.MSGIDS = {}

        def process_line(line):
            try:
                line = line.strip()
                if line and not line.startswith('#'):
                    pos, ptrs, junk, msgid, rest = line.split('\t', 4)
                    pos = int(pos, 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')
                    self.INDEX[pos] = line
                    self.MSGIDS[msgid] = pos
                    # The pointer field is a comma separated list, the same
                    # format update_ptrs_and_msgids() splits; iterating the
                    # string itself would register single characters.
                    for msg_ptr in ptrs.split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Deliberate best-effort: a malformed line is dropped.
                pass

        if session:
            session.ui.mark('Loading metadata index...')
        try:
            fd = open(self.config.mailindex_file(), 'r')
            try:
                for line in fd:
                    if line.startswith(GPG_BEGIN_MESSAGE):
                        # decrypt_gpg() is handed the open file as well as
                        # the marker line that was already read from it.
                        for clear_line in decrypt_gpg([line], fd):
                            process_line(clear_line)
                    else:
                        process_line(line)
            finally:
                # Close the file even if reading or decrypting fails.
                fd.close()
        except IOError:
            if session:
                session.ui.warning(('Metadata index not found: %s')
                                   % self.config.mailindex_file())
        if session:
            session.ui.mark('Loaded metadata for %d messages'
                            % len(self.INDEX))

    def save_changes(self, session=None):
        """Append the modified index lines to the index file, if any."""
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods:
            if session:
                session.ui.mark("Saving metadata index changes...")
            fd = gpg_open(self.config.mailindex_file(),
                          self.config.get('gpg_recipient'), 'a')
            try:
                for pos in mods:
                    fd.write(self.INDEX[pos] + '\n')
            finally:
                # Do not leak the handle if a write fails.
                fd.close()
            flush_append_cache()
            if session:
                session.ui.mark("Saved metadata index changes")

    def save(self, session=None):
        """Rewrite the whole index file from the in-memory state."""
        self.MODIFIED = set()
        if session:
            session.ui.mark("Saving metadata index...")
        fd = gpg_open(self.config.mailindex_file(),
                      self.config.get('gpg_recipient'), 'w')
        try:
            fd.write('# This is the mailpile.py index file.\n')
            fd.write('# We have %d messages!\n' % len(self.INDEX))
            for item in self.INDEX:
                fd.write(item + '\n')
        finally:
            # Do not leak the handle if a write fails.
            fd.close()
        flush_append_cache()
        if session:
            session.ui.mark("Saved metadata index")

    def update_ptrs_and_msgids(self, session):
        """Rebuild the MSGIDS and PTRS lookup tables from INDEX.

        Lines with too few fields are reported through session.ui.warning
        and otherwise ignored.
        """
        session.ui.mark('Updating high level indexes')
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) > self.MSG_CONV_ID:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # Report the offending raw line itself.
                session.ui.warning('Bogus line: %s' % self.INDEX[offset])

    def try_decode(self, text, charset):
        """Decode `text`, trying `charset`, then iso-8859-1, then utf-8.

        If every attempt fails, the characters with ordinals below 128 are
        returned and everything else is dropped.
        """
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Return header `name` of `msg` decoded to one line of text.

        If `value` is given it is decoded instead of the header found in
        `msg`.  CR, LF and TAB are replaced with spaces.  Returns '' if the
        header cannot be parsed.
        """
        try:
            if value is None and msg:
                value = msg[name]
            decoded = email.header.decode_header(value or '')
            text = ' '.join([self.try_decode(t[0], t[1]) for t in decoded])
            return (text.replace('\r', ' ')
                        .replace('\t', ' ')
                        .replace('\n', ' '))
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx, msg_ptr):
        """Record that the message at `msg_idx` was seen at `msg_ptr`."""
        msg_info = self.get_msg_by_idx(msg_idx)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx

        # If message was seen in this mailbox before, update the location
        for i, known_ptr in enumerate(msg_ptrs):
            if known_ptr[:3] == msg_ptr[:3]:
                msg_ptrs[i] = msg_ptr
                break
        else:
            # Otherwise, this is a new mailbox, record this sighting as well!
            if msg_ptr:
                msg_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_by_idx(msg_idx, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp.

        Returns None if the header cannot be parsed, or if the timestamp
        is more than a day in the future or smaller than 1.
        """
        try:
            if ';' in date_hdr:
                # Only the part after the last semicolon is parsed.
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_date = int(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            if (msg_date > (time.time() + 24 * 3600)) or (msg_date < 1):
                return None
            else:
                return msg_date
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers.

        The Date: header is used if it is within 7 days of the median of
        all parseable dates; otherwise the median is returned.  If nothing
        can be parsed, `last_date + 1` is returned.
        """
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_date = dates[0]
        nz_dates = sorted([d for d in dates if d])

        if nz_dates:
            # Floor division: the result is used as a list index.
            median = nz_dates[len(nz_dates) // 2]
            if msg_date and abs(msg_date - median) < 7 * 24 * 3600:
                return msg_date
            else:
                session.ui.warning(
                    ('=%s/%s using Recieved: instead of Date:')
                    % (msg_mid, msg_id))
                return median
        else:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning('=%s/%s has a bogus date' % (msg_mid, msg_id))
            return last_date + 1

    def scan_mailbox(self, session, idx, mailbox_fn, mailbox_opener):
        """Index the unparsed messages of one mailbox.

        Returns the number of messages that were added to the index or had
        their location updated.  Returns 0 if the mailbox is editable,
        cannot be opened or has no unparsed messages.
        """
        try:
            mbox = mailbox_opener(session, idx)
            if mbox.editable:
                session.ui.mark('%s: Skipped: %s' % (idx, mailbox_fn))
                return 0
            else:
                session.ui.mark('%s: Checking: %s' % (idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark('%s: Error opening: %s (%s)'
                            % (idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if not self.PTRS:
            self.update_ptrs_and_msgids(session)

        added = 0
        msg_date = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)'
                            ) % (idx, 100 * ui // len(unparsed),
                                 ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed: only report progress now and then.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                continue
            else:
                session.ui.mark(parse_status)

            # Message new or modified, let's parse it.
            msg = ParseMessage(mbox.get_file(i), pgpmime=False)
            msg_id = b64c(
                sha1b64((self.hdr(msg, 'message-id') or msg_ptr).strip()))
            if msg_id in self.MSGIDS:
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))

                msg_date = self._extract_date(session, msg_mid, msg_id,
                                              msg, msg_date)

                keywords = self.index_message(
                    session, msg_mid, msg_id, msg, msg_date,
                    mailbox=idx,
                    compact=False,
                    filter_hooks=[self.filter_keywords])
                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':tag')]

                msg_idx = len(self.INDEX)
                self.set_msg_by_idx(msg_idx, [
                    msg_mid,                   # Our index ID
                    msg_ptr,                   # Location on disk
                    '',                        # UNUSED
                    msg_id,                    # Message-ID
                    b36(msg_date),             # Date as a UTC timestamp
                    self.hdr(msg, 'from'),     # From:
                    self.hdr(msg, 'subject'),  # Subject
                    ','.join(tags),            # Initial tags
                    '',                        # No replies for now
                    '',                        # Conversation ID
                ])
                self.set_conversation_ids(msg_mid, msg)

                added += 1
                if (added % 1000) == 0:
                    GlobalPostingList.Optimize(session, self, quick=True)

        if added:
            mbox.mark_parsed(i)
            mbox.save(session)
        session.ui.mark('%s: Indexed mailbox: %s' % (idx, mailbox_fn))
        return added
def scan_mailbox(self, session, idx, mailbox_fn, mailbox_opener):
    """Index whatever is still unparsed in the mailbox numbered `idx`.

    Returns how many messages were newly indexed or had their location
    updated; 0 when the mailbox is editable, cannot be opened or has
    nothing unparsed.
    """
    try:
        mbox = mailbox_opener(session, idx)
        if mbox.editable:
            session.ui.mark('%s: Skipped: %s' % (idx, mailbox_fn))
            return 0
        session.ui.mark('%s: Checking: %s' % (idx, mailbox_fn))
    except (IOError, OSError, NoSuchMailboxError):
        session.ui.mark('%s: Error opening: %s' % (idx, mailbox_fn))
        return 0

    unparsed = mbox.unparsed()
    if not unparsed:
        return 0

    # The lookup tables may not have been built yet.
    if not self.PTRS:
        self.update_ptrs_and_msgids(session)

    added = 0
    msg_date = int(time.time())
    total = len(unparsed)
    for ui in range(total):
        if mailpile.util.QUITTING:
            break

        # NOTE: `i` is read again after the loop, so it must only be
        # rebound once we know we are not quitting.
        i = unparsed[ui]
        parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)') % (
            idx, 100 * ui / len(unparsed), ui, len(unparsed))

        msg_ptr = mbox.get_msg_ptr(idx, i)
        already_indexed = msg_ptr in self.PTRS
        # Known messages only trigger a progress report now and then.
        if (not already_indexed) or (ui % 317) == 0:
            session.ui.mark(parse_status)
        if already_indexed:
            continue

        # The message is new or has moved, so parse it.
        msg = ParseMessage(mbox.get_file(i), pgpmime=False)
        raw_id = (self.hdr(msg, 'message-id') or msg_ptr).strip()
        msg_id = b64c(sha1b64(raw_id))

        if msg_id in self.MSGIDS:
            self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
            added += 1
            continue

        # A message we have never seen: add it to the index.
        msg_mid = b36(len(self.INDEX))
        last_date = msg_date
        try:
            msg_date = int(rfc822.mktime_tz(
                rfc822.parsedate_tz(self.hdr(msg, 'date'))))
            if msg_date > (time.time() + 24 * 3600):
                session.ui.warning('=%s/%s is from the FUTURE!'
                                   % (msg_mid, msg_id))
                # Dates in the future are replaced by a guess.
                msg_date = last_date + 1
            elif msg_date < 1:
                session.ui.warning('=%s/%s is PREHISTORIC!'
                                   % (msg_mid, msg_id))
                msg_date = last_date + 1
        except (ValueError, TypeError, OverflowError):
            session.ui.warning('=%s/%s has a bogus date.'
                               % (msg_mid, msg_id))
            if session.config.get('debug'):
                session.ui.say(traceback.format_exc())
            # Hack: assume the mailbox is in chronological order and
            # place this message one second after the previous one,
            # which is a better-than-nothing guess.
            msg_date += 1

        keywords = self.index_message(
            session, msg_mid, msg_id, msg, msg_date,
            mailbox=idx,
            compact=False,
            filter_hooks=[self.filter_keywords])
        tags = []
        for keyword in keywords:
            if keyword.endswith(':tag'):
                tags.append(keyword.split(':')[0])

        msg_idx = len(self.INDEX)
        msg_info = [
            msg_mid,                   # Our index ID
            msg_ptr,                   # Location on disk
            '',                        # UNUSED
            msg_id,                    # Message-ID
            b36(msg_date),             # Date as a UTC timestamp
            self.hdr(msg, 'from'),     # From:
            self.hdr(msg, 'subject'),  # Subject
            ','.join(tags),            # Initial tags
            '',                        # No replies for now
            '',                        # Conversation ID
        ]
        self.set_msg_by_idx(msg_idx, msg_info)
        self.set_conversation_ids(msg_mid, msg)

        added += 1
        if (added % 1000) == 0:
            GlobalPostingList.Optimize(session, self, quick=True)

    if added:
        mbox.mark_parsed(i)
        mbox.save(session)
    session.ui.mark('%s: Indexed mailbox: %s' % (idx, mailbox_fn))
    return added
class MailIndex:
    """This is a lazily parsing object representing a mailpile index.

    Each entry of ``self.INDEX`` is one tab-separated, UTF-8 encoded line;
    the ``MSG_*`` constants below are the column positions within such a
    line once it has been split with :meth:`l2m`.
    """

    # Column positions within one metadata line.
    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_CC = 6
    MSG_KB = 7
    MSG_SUBJECT = 8
    MSG_BODY = 9
    MSG_TAGS = 10
    MSG_REPLIES = 11
    MSG_THREAD_MID = 12

    # Number of columns in each generation of the on-disk format.
    MSG_FIELDS_V1 = 11
    MSG_FIELDS_V2 = 13

    BOGUS_METADATA = [None, '', None, '0', '(no sender)', '', '', '0',
                      '(not in index)', '', '', '', '-1']

    # After this many appended (incremental) saves, save_changes() rewrites
    # the whole index file instead.
    MAX_INCREMENTAL_SAVES = 25

    def __init__(self, config):
        """Create an empty in-memory index bound to ``config``."""
        self.config = config
        self.INDEX = []
        self.INDEX_SORT = {}
        self.INDEX_THR = []
        self.PTRS = {}
        self.TAGS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        self.CACHE = {}
        self.MODIFIED = set()
        self.EMAILS_SAVED = 0
        self._saved_changes = 0
        self._lock = threading.Lock()

    def l2m(self, line):
        """Split one encoded index line into a list of unicode fields."""
        return line.decode('utf-8').split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    #
    # Built with update() rather than dict(..., **{...}): keyword expansion
    # with non-string keys only works by accident on CPython 2 and raises
    # TypeError on Python 3.
    NORM_TABLE = dict([(i, None) for i in range(0, 0x20)])
    NORM_TABLE.update({
        ord(u'\t'): ord(u' '),
        ord(u'\r'): ord(u' '),
        ord(u'\n'): ord(u' '),
        0x7F: None,
    })

    def m2l(self, message):
        """Join a list of message fields into one encoded index line."""
        # Normalize the message before saving it so we can be sure that we
        # will be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """Reset the in-memory index and reload it from the index file.

        Lines starting with ``#`` are ignored, lines starting with ``@``
        populate the e-mail address table, everything else is a metadata
        line.  Metadata lines in the V1 layout are migrated to V2 on the
        fly.  A missing/unreadable file (IOError) only produces a warning.

        Raises:
            Exception: if a metadata line has a column count that is
                neither V1 nor V2.
        """
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, quoted_addr = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    unquoted_email = unquote(quoted_addr).decode('utf-8')
                    self.EMAILS[pos] = unquoted_email
                    self.EMAIL_IDS[unquoted_email.split()[0].lower()] = pos
                elif line:
                    words = line.split('\t')

                    # Migration: converting old metadata into new!
                    if len(words) != self.MSG_FIELDS_V2:

                        # V1 -> V2 adds MSG_CC and MSG_KB
                        if len(words) == self.MSG_FIELDS_V1:
                            words[self.MSG_CC:self.MSG_CC] = ['']
                            words[self.MSG_KB:self.MSG_KB] = ['0']

                        # Add V2 -> V3 here, etc. etc.

                        if len(words) == self.MSG_FIELDS_V2:
                            line = '\t'.join(words)
                        else:
                            raise Exception(_('Your metadata index is either '
                                              'too old, too new or corrupt!'))

                    pos = int(words[self.MSG_MID], 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')

                    self.INDEX[pos] = line
                    self.MSGIDS[words[self.MSG_ID]] = pos
                    self.update_msg_tags(pos, words)
                    for msg_ptr in words[self.MSG_PTRS].split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Best effort: a line that cannot be split/parsed is skipped.
                pass

        if session:
            session.ui.mark(_('Loading metadata index...'))
        with self._lock:
            try:
                fd = open(self.config.mailindex_file(), 'r')
                try:
                    for line in fd:
                        if line.startswith(GPG_BEGIN_MESSAGE):
                            # decrypt_gpg is handed the same file object so
                            # it can read the rest of the encrypted block.
                            for plain in decrypt_gpg([line], fd):
                                process_line(plain)
                        else:
                            process_line(line)
                finally:
                    # Close the file even if a line raised.
                    fd.close()
            except IOError:
                if session:
                    session.ui.warning(_('Metadata index not found: %s'
                                         ) % self.config.mailindex_file())

        self.cache_sort_orders(session)
        if session:
            session.ui.mark(_('Loaded metadata, %d messages'
                              ) % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def update_msg_tags(self, msg_idx_pos, msg_info):
        """Make ``self.TAGS`` agree with the tag column of ``msg_info``.

        ``msg_idx_pos`` is removed from every known tag the message does not
        carry and added to every tag it does carry (creating the tag's set
        when needed).
        """
        tags = set([t for t in msg_info[self.MSG_TAGS].split(',') if t])
        for tid in (set(self.TAGS.keys()) - tags):
            self.TAGS[tid].discard(msg_idx_pos)
        for tid in tags:
            if tid not in self.TAGS:
                self.TAGS[tid] = set()
            self.TAGS[tid].add(msg_idx_pos)

    def save_changes(self, session=None):
        """Append modified entries and new e-mail addresses to the index file.

        Does nothing when there is nothing new.  Once
        ``MAX_INCREMENTAL_SAVES`` appends have been made, delegates to
        :meth:`save` (and returns its result) to rewrite the whole file.
        """
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods or len(self.EMAILS) > self.EMAILS_SAVED:
            if self._saved_changes >= self.MAX_INCREMENTAL_SAVES:
                return self.save(session=session)
            with self._lock:
                if session:
                    session.ui.mark(_("Saving metadata index changes..."))
                fd = gpg_open(self.config.mailindex_file(),
                              self.config.prefs.gpg_recipient, 'a')
                for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
                    quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                    fd.write('@%s\t%s\n' % (b36(eid), quoted_email))
                for pos in mods:
                    fd.write(self.INDEX[pos] + '\n')
                fd.close()
                flush_append_cache()
                if session:
                    session.ui.mark(_("Saved metadata index changes"))
                self.EMAILS_SAVED = len(self.EMAILS)
                self._saved_changes += 1

    def save(self, session=None):
        """Rewrite the complete index file.

        The data is written to ``<index file>.new`` first, the previous
        file is handed to ``backup_file`` and the new file is then renamed
        into place.
        """
        with self._lock:
            self.MODIFIED = set()
            if session:
                session.ui.mark(_("Saving metadata index..."))

            idxfile = self.config.mailindex_file()
            newfile = '%s.new' % idxfile

            fd = gpg_open(newfile, self.config.prefs.gpg_recipient, 'w')
            fd.write('# This is the mailpile.py index file.\n')
            fd.write('# We have %d messages!\n' % len(self.INDEX))
            for eid in range(0, len(self.EMAILS)):
                quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                fd.write('@%s\t%s\n' % (b36(eid), quoted_email))
            for item in self.INDEX:
                fd.write(item + '\n')
            fd.close()

            # Keep the last 5 index files around... just in case.
            backup_file(idxfile, backups=5, min_age_delta=10)
            os.rename(newfile, idxfile)

            flush_append_cache()
            self._saved_changes = 0
            if session:
                session.ui.mark(_("Saved metadata index"))

    def update_ptrs_and_msgids(self, session):
        """Rebuild ``self.MSGIDS`` and ``self.PTRS`` from ``self.INDEX``.

        Entries that do not split into ``MSG_FIELDS_V2`` columns are
        reported through ``session.ui.warning`` and skipped.
        """
        session.ui.mark(_('Updating high level indexes'))
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) == self.MSG_FIELDS_V2:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # Report the offending raw entry (the previous code referred
                # to an undefined name here and crashed with NameError).
                session.ui.warning(_('Bogus line: %s') % self.INDEX[offset])

    def try_decode(self, text, charset):
        """Decode ``text`` using the first charset that works.

        Tries ``charset`` (when given), then iso-8859-1, then utf-8.  If all
        of them fail, returns ``text`` with every character whose ordinal is
        128 or above removed.
        """
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Return header ``name`` of ``msg`` decoded onto a single line.

        When ``value`` is given it is decoded instead of the header read
        from ``msg``.  Double quotes are removed and CR, TAB and LF are
        replaced by spaces.  Returns '' if the header cannot be parsed.
        """
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            decoded = email.header.decode_header(
                (value or '').replace('"', ''))
            joined = ' '.join([self.try_decode(t[0], t[1]) for t in decoded])
            return (joined.replace('\r', ' ')
                          .replace('\t', ' ')
                          .replace('\n', ' '))
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record that the message at ``msg_idx_pos`` lives at ``msg_ptr``."""
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        # If message was seen in this mailbox before, update the location
        for i in range(0, len(msg_ptrs)):
            if msg_ptrs[i][:MBX_ID_LEN] == msg_ptr[:MBX_ID_LEN]:
                msg_ptrs[i] = msg_ptr
                msg_ptr = None
                break
        # Otherwise, this is a new mailbox, record this sighting as well!
        if msg_ptr:
            msg_ptrs.append(msg_ptr)

        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp.

        Returns None when the header cannot be parsed, or when the result
        is more than 24 hours in the future or smaller than 1.
        """
        try:
            if ';' in date_hdr:
                # Received: headers carry the date after the last semicolon.
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers.

        The Date: header is trusted when it lies within 31 days of the
        median of all parseable dates; otherwise that median is returned.
        If nothing is parseable, ``last_date + 1`` is returned.
        """
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])

        if nz_dates:
            # Floor division keeps the index an integer on Python 3 as well.
            median = nz_dates[len(nz_dates) // 2]
            if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
                return msg_ts
            else:
                session.ui.warning(_('=%s/%s using Received: instead of Date:'
                                     ) % (msg_mid, msg_id))
                return median
        else:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the
            # last message if date parsing fails for some reason.
            session.ui.warning(_('=%s/%s has a bogus date'
                                 ) % (msg_mid, msg_id))
            return last_date + 1

    def encode_msg_id(self, msg_id):
        """Return the encoded (hashed) form of a raw Message-ID string."""
        return b64c(sha1b64(msg_id.strip()))

    def get_msg_id(self, msg, msg_ptr):
        """Return the encoded Message-ID for ``msg``.

        Falls back to a pseudo-ID built from other headers and, if that is
        empty too, to ``msg_ptr`` (printing a warning).
        """
        raw_msg_id = self.hdr(msg, 'message-id')
        if not raw_msg_id:
            # Create a very long pseudo-msgid for messages without a
            # Message-ID. This was a very badly behaved mailer, so if
            # we create duplicates this way, we are probably only
            # losing spam. Even then the Received line should save us.
            raw_msg_id = ('\t'.join([self.hdr(msg, 'date'),
                                     self.hdr(msg, 'subject'),
                                     self.hdr(msg, 'received'),
                                     self.hdr(msg, 'from'),
                                     self.hdr(msg, 'to')])).strip()
        # Fall back to the msg_ptr if all else fails.
        if not raw_msg_id:
            print(_('WARNING: No proper Message-ID for %s') % msg_ptr)
        return self.encode_msg_id(raw_msg_id or msg_ptr)

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Index the messages of one mailbox that have not been parsed yet.

        Opens the mailbox via ``mailbox_opener(session, mailbox_idx)`` and
        walks the keys returned by ``mbox.unparsed()``.  Messages whose
        pointer is already in ``self.PTRS`` are skipped; messages that
        cannot be read are skipped with a warning; known Message-IDs get
        their location updated; everything else is added to the index.

        Returns the number of messages added or relocated; 0 when the
        mailbox is editable, cannot be opened, or has nothing to parse.
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark(_('%s: Skipped: %s'
                                  ) % (mailbox_idx, mailbox_fn))
                return 0
            else:
                session.ui.mark(_('%s: Checking: %s'
                                  ) % (mailbox_idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError) as e:
            session.ui.mark(_('%s: Error opening: %s (%s)'
                              ) % (mailbox_idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if len(self.PTRS.keys()) == 0:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = _('%s: Reading your mail: %d%% (%d/%d messages)'
                             ) % (mailbox_idx,
                                  100 * ui // len(unparsed),
                                  ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed: only report progress every 317th message.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                    play_nice_with_threads()
                continue
            else:
                session.ui.mark(parse_status)
                play_nice_with_threads()

            # Message new or modified, let's parse it.
            if 'rescan' in session.config.sys.debug:
                session.ui.debug('Reading message %s/%s' % (mailbox_idx, i))
            try:
                msg_fd = mbox.get_file(i)
                msg = ParseMessage(
                    msg_fd, pgpmime=session.config.prefs.index_encrypted)
            except (IOError, OSError, ValueError, IndexError, KeyError):
                if session.config.sys.debug:
                    traceback.print_exc()
                session.ui.warning(('Reading message %s/%s FAILED, skipping'
                                    ) % (mailbox_idx, i))
                continue

            # The file position after parsing is stored as the message size.
            msg_size = msg_fd.tell()
            msg_id = self.get_msg_id(msg, msg_ptr)
            if msg_id in self.MSGIDS:
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))

                msg_ts = self._extract_date_ts(session, msg_mid, msg_id,
                                               msg, msg_ts)

                play_nice_with_threads()
                keywords, snippet = self.index_message(
                    session, msg_mid, msg_id, msg, msg_size, msg_ts,
                    mailbox=mailbox_idx,
                    compact=False,
                    filter_hooks=plugins.filter_hooks([self.filter_keywords]))

                # Subject and snippet together stay within snippet_max.
                msg_subject = self.hdr(msg, 'subject')
                msg_snippet = snippet[:max(0, snippet_max - len(msg_subject))]

                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':in') or k.endswith(':tag')]

                msg_to = ExtractEmails(self.hdr(msg, 'to'))
                msg_cc = (ExtractEmails(self.hdr(msg, 'cc')) +
                          ExtractEmails(self.hdr(msg, 'bcc')))

                msg_idx_pos, msg_info = self.add_new_msg(
                    msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'),
                    msg_to, msg_cc, msg_size, msg_subject, msg_snippet, tags)
                self.set_conversation_ids(msg_info[self.MSG_MID], msg)
                mbox.mark_parsed(i)

                added += 1
                GlobalPostingList.Optimize(session, self,
                                           lazy=True, quick=True)

        if added:
            mbox.save(session)
        session.ui.mark(_('%s: Indexed mailbox: %s'
                          ) % (mailbox_idx, mailbox_fn))
        return added