def remove_tag(self, session, tag_id,
               msg_info=None, msg_idxs=None, conversation=False):
    """Remove tag_id from a set of messages (optionally whole threads).

    If msg_info is given (and msg_idxs is None), only that one message
    is untagged; otherwise msg_idxs selects the messages.  When
    conversation is True the selection is expanded to every reply in
    each conversation.  Updates both the metadata index and the tag's
    posting list.
    """
    # Tag changes invalidate any cached search results.
    CachedSearchResultSet.DropCaches()
    pls = GlobalPostingList(session, '%s:tag' % tag_id)
    if msg_info and msg_idxs is None:
        # Message IDs are stored base-36 in the metadata records.
        msg_idxs = set([int(msg_info[self.MSG_MID], 36)])
    else:
        msg_idxs = set(msg_idxs)
    if not msg_idxs:
        return
    session.ui.mark(_('Untagging conversations (%s)') % (tag_id, ))
    for msg_idx in list(msg_idxs):
        if conversation:
            # Pull in every reply so the whole thread gets untagged.
            for reply in self.get_conversation(msg_idx=msg_idx):
                if reply[self.MSG_MID]:
                    msg_idxs.add(int(reply[self.MSG_MID], 36))
    session.ui.mark(_('Untagging %d messages (%s)') % (len(msg_idxs),
                                                       tag_id))
    eids = []
    for msg_idx in msg_idxs:
        if msg_idx >= 0 and msg_idx < len(self.INDEX):
            msg_info = self.get_msg_at_idx_pos(msg_idx)
            tags = set([r for r in msg_info[self.MSG_TAGS].split(',')
                        if r])
            if tag_id in tags:
                tags.remove(tag_id)
                msg_info[self.MSG_TAGS] = ','.join(list(tags))
                self.INDEX[msg_idx] = self.m2l(msg_info)
                self.MODIFIED.add(msg_idx)
            # Scrub the posting list even when the metadata record did
            # not list the tag, to keep the two in sync.
            eids.append(msg_info[self.MSG_MID])
    pls.remove(eids)
    pls.save()
def update_tag_stats(self, session, config, update_tags=None):
    """Recount messages per tag, updating and returning self.STATS.

    Only the tags in update_tags are recounted if given, otherwise
    every configured tag.  The 'ALL' pseudo-tag tracks the size of the
    whole index alongside the count of messages tagged 'new'.  Each
    STATS entry is [total_hits, new_hits].
    """
    # Guarantee a usable session for the posting-list lookups below.
    session = session or Session(config)
    new_tid = config.get_tag_id('new')
    new_msgs = (new_tid and GlobalPostingList(session,
                                              '%s:tag' % new_tid).hits()
                or set([]))
    self.STATS.update({'ALL': [len(self.INDEX), len(new_msgs)]})
    for tid in (update_tags or config.tags.keys()):
        # session is always truthy here (coalesced above), so the old
        # 'if session:' guard was dead code and has been dropped.
        session.ui.mark('Counting messages in tag:%s' % tid)
        hits = GlobalPostingList(session, '%s:tag' % tid).hits()
        self.STATS[tid] = [len(hits), len(hits & new_msgs)]
    return self.STATS
def index_message(self, session, msg_mid, msg_id, msg, msg_ts,
                  mailbox=None, compact=True, filter_hooks=None):
    """Extract keywords from a message and add them to the search index.

    Returns a (keywords, snippet) tuple.  filter_hooks, if given, is a
    sequence of callables invoked as hook(session, msg_mid, msg,
    keywords); each may rewrite the keyword set before it is indexed.

    The filter_hooks default was previously a shared mutable list
    ([]); it is now the None sentinel per best practice - behavior is
    unchanged for all callers.
    """
    keywords, snippet = self.read_message(session, msg_mid, msg_id, msg,
                                          msg_ts, mailbox=mailbox)
    for hook in (filter_hooks or []):
        keywords = hook(session, msg_mid, msg, keywords)
    for word in keywords:
        try:
            GlobalPostingList.Append(session, word, [msg_mid],
                                     compact=compact)
        except UnicodeDecodeError:
            # FIXME: we just ignore garbage
            pass
    return keywords, snippet
def command(self):
    """Save the metadata index, then optimize the global posting list.

    Passing 'harder' as an argument forces a more aggressive
    optimization pass.  Returns True on success, False if the user
    interrupted the operation.
    """
    try:
        self._idx().save(self.session)
        force = ('harder' in self.args)
        GlobalPostingList.Optimize(self.session, self._idx(), force=force)
    except KeyboardInterrupt:
        self.session.ui.mark(_('Aborted'))
        return False
    return True
def hits(term):
    # Tag terms ("<tid>:in") are resolved straight from the in-memory
    # tag map; everything else is looked up in the posting lists,
    # whose entries are base-36 strings converted back to int offsets.
    if not term.endswith(':in'):
        session.ui.mark(_('Searching for %s') % term)
        posting_list = GlobalPostingList(session, term)
        return [int(hit, 36) for hit in posting_list.hits()]
    tid = term.rsplit(':', 1)[0]
    return self.TAGS.get(tid, [])
def remove_tag(self, session, tag_id,
               msg_info=None, msg_idxs=None, conversation=False):
    """Remove tag_id from the selected messages.

    Either msg_info (a single metadata record) or msg_idxs (index
    positions) selects the messages; conversation=True expands the
    selection to every reply in each conversation.  Both the metadata
    index and the tag's posting list are updated.
    """
    pls = GlobalPostingList(session, '%s:tag' % tag_id)
    if msg_info and msg_idxs is None:
        # Message IDs are stored base-36 in the metadata records.
        msg_idxs = set([int(msg_info[self.MSG_MID], 36)])
    else:
        msg_idxs = set(msg_idxs)
    if not msg_idxs:
        return
    session.ui.mark('Untagging conversations (%s)' % (tag_id, ))
    for msg_idx in list(msg_idxs):
        if conversation:
            # Pull in every reply so the whole thread gets untagged.
            for reply in self.get_conversation(msg_idx=msg_idx):
                if reply[self.MSG_MID]:
                    msg_idxs.add(int(reply[self.MSG_MID], 36))
    session.ui.mark('Untagging %d messages (%s)' % (len(msg_idxs), tag_id))
    eids = []
    for msg_idx in msg_idxs:
        if msg_idx >= 0 and msg_idx < len(self.INDEX):
            msg_info = self.get_msg_at_idx_pos(msg_idx)
            tags = set([r for r in msg_info[self.MSG_TAGS].split(',')
                        if r])
            if tag_id in tags:
                tags.remove(tag_id)
                msg_info[self.MSG_TAGS] = ','.join(list(tags))
                self.INDEX[msg_idx] = self.m2l(msg_info)
                self.MODIFIED.add(msg_idx)
            # Scrub the posting list even when the metadata record did
            # not list the tag, to keep the two in sync.
            eids.append(msg_info[self.MSG_MID])
    pls.remove(eids)
    pls.save()
def add_tag(self, session, tag_id,
            msg_info=None, msg_idxs=None, conversation=False):
    """Add tag_id to the selected messages.

    Either msg_info (a single metadata record) or msg_idxs (index
    positions) selects the messages; conversation=True expands the
    selection to every reply in each conversation.  Updates both the
    metadata index and the tag's posting list.
    """
    pls = GlobalPostingList(session, '%s:tag' % tag_id)
    if msg_info and msg_idxs is None:
        # Message IDs are stored base-36 in the metadata records.
        msg_idxs = set([int(msg_info[self.MSG_MID], 36)])
    else:
        msg_idxs = set(msg_idxs)
    # NOTE(review): this count is taken before conversation expansion
    # below, so it may understate how many messages get tagged.
    session.ui.mark(_('Tagging %d messages (%s)') % (len(msg_idxs), tag_id))
    for msg_idx in list(msg_idxs):
        if conversation:
            # Pull in every reply so the whole thread gets tagged.
            for reply in self.get_conversation(msg_idx=msg_idx):
                if reply[self.MSG_MID]:
                    msg_idxs.add(int(reply[self.MSG_MID], 36))
    for msg_idx in msg_idxs:
        if msg_idx >= 0 and msg_idx < len(self.INDEX):
            msg_info = self.get_msg_at_idx_pos(msg_idx)
            tags = set([r for r in msg_info[self.MSG_TAGS].split(',')
                        if r])
            tags.add(tag_id)
            msg_info[self.MSG_TAGS] = ','.join(list(tags))
            self.INDEX[msg_idx] = self.m2l(msg_info)
            self.MODIFIED.add(msg_idx)
            pls.append(msg_info[self.MSG_MID])
    pls.save()
def _rescan_mailboxes(self, session, config):
    """Rescan all configured mailboxes for new or changed messages.

    Runs the optional prefs.rescan_command first, then scans each
    mailbox in turn; if anything changed, sort orders and the global
    posting list are refreshed afterwards.
    """
    idx = self._idx()
    msg_count = 0
    mbox_count = 0
    rv = True  # NOTE(review): appears unused below -- confirm and remove.
    try:
        pre_command = config.prefs.rescan_command
        if pre_command:
            session.ui.mark(_('Running: %s') % pre_command)
            subprocess.check_call(pre_command, shell=True)
        # msg_count is offset by +1 while scanning; the matching -= 1
        # below restores the real message total.
        msg_count = 1
        for fid, fpath in config.get_mailboxes():
            if fpath == '/dev/null':
                continue
            if mailpile.util.QUITTING:
                break
            try:
                count = idx.scan_mailbox(session, fid, fpath,
                                         config.open_mailbox)
            except ValueError:
                # A broken mailbox should not abort the whole rescan.
                session.ui.warning(_('Failed to rescan: %s') % fpath)
                count = 0
            if count:
                msg_count += count
                mbox_count += 1
            config.clear_mbox_cache()
            session.ui.mark('\n')
        msg_count -= 1
        if msg_count:
            if not mailpile.util.QUITTING:
                idx.cache_sort_orders(session)
            if not mailpile.util.QUITTING:
                GlobalPostingList.Optimize(session, idx, quick=True)
        else:
            session.ui.mark(_('Nothing changed'))
    except (KeyboardInterrupt, subprocess.CalledProcessError), e:
        session.ui.mark(_('Aborted: %s') % e)
        self._ignore_exception()
        # NOTE(review): only this abort path returns a value; the
        # success path falls off the end and returns None.  Confirm
        # that callers expect this asymmetry.
        return {
            'aborted': True,
            'messages': msg_count,
            'mailboxes': mbox_count
        }
def _rescan_mailboxes(self, session, config):
    """Rescan all configured mailboxes for new or changed messages.

    Uses config._running['rescan'] as a re-entrancy guard so only one
    rescan runs at a time; returns True immediately if one is already
    in progress, and False if the scan was aborted.
    """
    # FIXME: Need a lock here?
    if 'rescan' in config._running:
        return True
    config._running['rescan'] = True
    idx = self._idx()
    msg_count = 0
    mbox_count = 0
    try:
        pre_command = config.prefs.rescan_command
        if pre_command:
            session.ui.mark('Running: %s' % pre_command)
            subprocess.check_call(pre_command, shell=True)
        # msg_count is offset by +1 while scanning; the matching -= 1
        # below restores the real message total.
        msg_count = 1
        for fid, fpath in config.get_mailboxes():
            if fpath == '/dev/null':
                continue
            if mailpile.util.QUITTING:
                break
            count = idx.scan_mailbox(session, fid, fpath,
                                     config.open_mailbox)
            if count:
                msg_count += count
                mbox_count += 1
            config.clear_mbox_cache()
            session.ui.mark('\n')
        msg_count -= 1
        if msg_count:
            idx.cache_sort_orders(session)
            if not mailpile.util.QUITTING:
                GlobalPostingList.Optimize(session, idx, quick=True)
        else:
            session.ui.mark('Nothing changed')
    except (KeyboardInterrupt, subprocess.CalledProcessError), e:
        session.ui.mark('Aborted: %s' % e)
        self._ignore_exception()
        return False
    finally:
        # BUG FIX: the guard flag was previously never cleared, so
        # after the first rescan every later call short-circuited at
        # the top with 'return True' forever (and any exception left
        # the flag set as well).  Always clear it on the way out.
        del config._running['rescan']
def hits(term):
    # Look the term up in the global posting list; hits are stored as
    # base-36 strings, so convert each back to an integer offset.
    session.ui.mark('Searching for %s' % term)
    posting_list = GlobalPostingList(session, term)
    results = []
    for hit in posting_list.hits():
        results.append(int(hit, 36))
    return results
class MailIndex:
    """This is a lazily parsing object representing a mailpile index."""

    # Field offsets into a metadata record (one tab-separated line of
    # the index file, split into a list by l2m()).
    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_SUBJECT = 6
    MSG_SNIPPET = 7
    MSG_TAGS = 8
    MSG_REPLIES = 9
    MSG_CONV_MID = 10

    def __init__(self, config):
        self.config = config
        self.STATS = {}
        self.INDEX = []          # Raw metadata lines, position == MID
        self.INDEX_SORT = {}
        self.INDEX_CONV = []
        self.PTRS = {}           # mailbox pointer -> index position
        self.MSGIDS = {}         # hashed Message-ID -> index position
        self.EMAILS = []
        self.EMAIL_IDS = {}
        self.CACHE = {}
        self.MODIFIED = set()    # Index positions with unsaved changes
        self.EMAILS_SAVED = 0

    def l2m(self, line):
        """Convert a raw index line into a metadata field list."""
        return line.decode('utf-8').split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    NORM_TABLE = dict([(i, None) for i in range(0, 0x20)],
                      **{ord(u'\t'): ord(u' '),
                         ord(u'\r'): ord(u' '),
                         ord(u'\n'): ord(u' '),
                         0x7F: None})

    def m2l(self, message):
        """Convert a metadata field list back into a raw index line."""
        # Normalize the message before saving it so we can be sure that we will
        # be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """(Re)load the metadata index file, resetting in-memory state."""
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            # Parse one line of the index file: comments ('#'), e-mail
            # address records ('@') or metadata records.
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, email = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    self.EMAILS[pos] = unquote(email)
                    self.EMAIL_IDS[unquote(email).lower()] = pos
                elif line:
                    words = line.split('\t')
                    # FIXME: Delete this old crap.
                    if len(words) == 10:
                        # This is an old index file, reorder it!
                        pos, p, unused, msgid, d, f, s, t, r, c = words
                        ptrs = ','.join(['0' + ptr for ptr in p.split(',')])
                        line = '\t'.join([pos, ptrs, msgid, d, f, '',
                                          s, '', t, r, c])
                    else:
                        pos, ptrs, msgid = words[:3]
                    pos = int(pos, 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')
                    self.INDEX[pos] = line
                    self.MSGIDS[msgid] = pos
                    for msg_ptr in ptrs.split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Malformed lines are silently skipped.
                pass

        if session:
            session.ui.mark('Loading metadata index...')
        try:
            fd = open(self.config.mailindex_file(), 'r')
            for line in fd:
                # Encrypted sections are decrypted before parsing.
                if line.startswith(GPG_BEGIN_MESSAGE):
                    for line in decrypt_gpg([line], fd):
                        process_line(line)
                else:
                    process_line(line)
            fd.close()
        except IOError:
            if session:
                session.ui.warning(('Metadata index not found: %s'
                                    ) % self.config.mailindex_file())
        self.cache_sort_orders(session)
        if session:
            session.ui.mark('Loaded metadata, %d messages' % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def save_changes(self, session=None):
        """Append unsaved (modified) records to the index file."""
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods or len(self.EMAILS) > self.EMAILS_SAVED:
            if session:
                session.ui.mark("Saving metadata index changes...")
            fd = gpg_open(self.config.mailindex_file(),
                          self.config.prefs.gpg_recipient, 'a')
            # Only e-mail records added since the last save are written.
            for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
                fd.write('@%s\t%s\n' % (b36(eid), quote(self.EMAILS[eid])))
            for pos in mods:
                fd.write(self.INDEX[pos] + '\n')
            fd.close()
            flush_append_cache()
            if session:
                session.ui.mark("Saved metadata index changes")
            self.EMAILS_SAVED = len(self.EMAILS)

    def save(self, session=None):
        """Write the entire metadata index to disk from scratch."""
        self.MODIFIED = set()
        if session:
            session.ui.mark("Saving metadata index...")
        fd = gpg_open(self.config.mailindex_file(),
                      self.config.prefs.gpg_recipient, 'w')
        fd.write('# This is the mailpile.py index file.\n')
        fd.write('# We have %d messages!\n' % len(self.INDEX))
        for eid in range(0, len(self.EMAILS)):
            fd.write('@%s\t%s\n' % (b36(eid), quote(self.EMAILS[eid])))
        for item in self.INDEX:
            fd.write(item + '\n')
        fd.close()
        flush_append_cache()
        if session:
            session.ui.mark("Saved metadata index")

    def update_ptrs_and_msgids(self, session):
        """Rebuild the PTRS and MSGIDS lookup maps from the raw index."""
        session.ui.mark('Updating high level indexes')
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) > self.MSG_CONV_MID:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # NOTE(review): 'line' is not defined in this scope, so
                # reaching this branch would raise NameError; probably
                # meant self.INDEX[offset] or message.
                session.ui.warning('Bogus line: %s' % line)

    def try_decode(self, text, charset):
        """Decode text using charset with fallbacks; strip non-ASCII
        as a last resort."""
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Fetch and decode a message header as a single-line string."""
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            decoded = email.header.decode_header(
                (value or '').replace('"', ''))
            return (' '.join([self.try_decode(t[0], t[1]) for t in decoded])
                    ).replace('\r', ' ').replace('\t', ' ').replace('\n', ' ')
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record a new mailbox location (pointer) for a known message."""
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        # If message was seen in this mailbox before, update the location
        for i in range(0, len(msg_ptrs)):
            if (msg_ptrs[i][:MBX_ID_LEN] == msg_ptr[:MBX_ID_LEN]):
                msg_ptrs[i] = msg_ptr
                msg_ptr = None
                break

        # Otherwise, this is a new mailbox, record this sighting as well!
        if msg_ptr:
            msg_ptrs.append(msg_ptr)
        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp."""
        try:
            if ';' in date_hdr:
                # Received: headers put the date after the last ';'.
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            # Reject timestamps more than a day in the future or before
            # the epoch.
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers."""
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])
        if nz_dates:
            median = nz_dates[len(nz_dates) / 2]
            # Trust the Date: header only if it is within a month of
            # the median of all parseable dates.
            if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
                return msg_ts
            else:
                session.ui.warning(('=%s/%s using Recieved: instead of Date:'
                                    ) % (msg_mid, msg_id))
                return median
        else:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning('=%s/%s has a bogus date' % (msg_mid, msg_id))
            return last_date + 1

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Scan one mailbox for unparsed messages, indexing each.

        Returns the number of messages added or relocated.
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark('%s: Skipped: %s' % (mailbox_idx, mailbox_fn))
                return 0
            else:
                session.ui.mark('%s: Checking: %s' % (mailbox_idx,
                                                      mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError), e:
            session.ui.mark(('%s: Error opening: %s (%s)'
                             ) % (mailbox_idx, mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if len(self.PTRS.keys()) == 0:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = ('%s: Reading your mail: %d%% (%d/%d messages)'
                            ) % (mailbox_idx, 100 * ui / len(unparsed),
                                 ui, len(unparsed))

            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed at this location; only refresh the
                # status line occasionally to keep the loop fast.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                continue
            else:
                session.ui.mark(parse_status)

            # Message new or modified, let's parse it.
            msg = ParseMessage(mbox.get_file(i), pgpmime=False)
            msg_id = b64c(sha1b64((self.hdr(msg, 'message-id')
                                   or msg_ptr).strip()))
            if msg_id in self.MSGIDS:
                # Known message seen in a new place: just update PTRS.
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))
                msg_ts = self._extract_date_ts(session, msg_mid, msg_id,
                                               msg, msg_ts)
                keywords, snippet = self.index_message(
                    session, msg_mid, msg_id, msg, msg_ts,
                    mailbox=mailbox_idx, compact=False,
                    filter_hooks=[self.filter_keywords])
                msg_subject = self.hdr(msg, 'subject')
                msg_snippet = snippet[:max(0,
                                           snippet_max - len(msg_subject))]
                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':tag')]
                msg_to = (ExtractEmails(self.hdr(msg, 'to')) +
                          ExtractEmails(self.hdr(msg, 'cc')) +
                          ExtractEmails(self.hdr(msg, 'bcc')))
                msg_idx_pos, msg_info = self.add_new_msg(
                    msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'),
                    msg_to, msg_subject, msg_snippet, tags)
                self.set_conversation_ids(msg_info[self.MSG_MID], msg)
                mbox.mark_parsed(i)
                added += 1
                # Periodically compact the posting lists mid-scan.
                if (added % 1000) == 0:
                    GlobalPostingList.Optimize(session, self, quick=True)

        if added:
            mbox.save(session)
        session.ui.mark('%s: Indexed mailbox: %s' % (mailbox_idx,
                                                     mailbox_fn))
        return added
class MailIndex:
    """This is a lazily parsing object representing a mailpile index."""

    # Field offsets into a metadata record (one tab-separated line of
    # the index file, split into a list by l2m()).
    MSG_MID = 0
    MSG_PTRS = 1
    MSG_ID = 2
    MSG_DATE = 3
    MSG_FROM = 4
    MSG_TO = 5
    MSG_CC = 6
    MSG_KB = 7
    MSG_SUBJECT = 8
    MSG_BODY = 9
    MSG_TAGS = 10
    MSG_REPLIES = 11
    MSG_THREAD_MID = 12

    # Record sizes for the metadata format versions (used to detect
    # and migrate old index files in load()).
    MSG_FIELDS_V1 = 11
    MSG_FIELDS_V2 = 13

    # Placeholder record returned for messages missing from the index.
    BOGUS_METADATA = [None, '', None, '0', '(no sender)', '', '', '0',
                      '(not in index)', '', '', '', '-1']

    # After this many incremental appends, force a full rewrite.
    MAX_INCREMENTAL_SAVES = 25

    def __init__(self, config):
        self.config = config
        self.INDEX = []          # Raw metadata lines, position == MID
        self.INDEX_SORT = {}
        self.INDEX_THR = []
        self.PTRS = {}           # mailbox pointer -> index position
        self.TAGS = {}           # tag id -> set of index positions
        self.MSGIDS = {}         # hashed Message-ID -> index position
        self.EMAILS = []
        self.EMAIL_IDS = {}
        self.CACHE = {}
        self.MODIFIED = set()    # Index positions with unsaved changes
        self.EMAILS_SAVED = 0
        self._saved_changes = 0
        self._lock = threading.Lock()  # Guards load/save of the index file

    def l2m(self, line):
        """Convert a raw index line into a metadata field list."""
        return line.decode('utf-8').split(u'\t')

    # A translation table for message parts stored in the index, consists of
    # a mapping from unicode ordinals to either another unicode ordinal or
    # None, to remove a character. By default it removes the ASCII control
    # characters and replaces tabs and newlines with spaces.
    NORM_TABLE = dict([(i, None) for i in range(0, 0x20)],
                      **{ord(u'\t'): ord(u' '),
                         ord(u'\r'): ord(u' '),
                         ord(u'\n'): ord(u' '),
                         0x7F: None})

    def m2l(self, message):
        """Convert a metadata field list back into a raw index line."""
        # Normalize the message before saving it so we can be sure that we will
        # be able to read it back later.
        parts = [unicode(p).translate(self.NORM_TABLE) for p in message]
        return (u'\t'.join(parts)).encode('utf-8')

    def load(self, session=None):
        """(Re)load the metadata index file, resetting in-memory state.

        Old-format records are migrated to the current version on the
        fly; encrypted sections are decrypted before parsing.
        """
        self.INDEX = []
        self.CACHE = {}
        self.PTRS = {}
        self.MSGIDS = {}
        self.EMAILS = []
        self.EMAIL_IDS = {}
        CachedSearchResultSet.DropCaches()

        def process_line(line):
            # Parse one line of the index file: comments ('#'), e-mail
            # address records ('@') or metadata records.
            try:
                line = line.strip()
                if line.startswith('#'):
                    pass
                elif line.startswith('@'):
                    pos, email = line[1:].split('\t', 1)
                    pos = int(pos, 36)
                    while len(self.EMAILS) < pos + 1:
                        self.EMAILS.append('')
                    unquoted_email = unquote(email).decode('utf-8')
                    self.EMAILS[pos] = unquoted_email
                    # Keyed by the first whitespace-separated token --
                    # presumably the bare address part; verify callers.
                    self.EMAIL_IDS[unquoted_email.split()[0].lower()] = pos
                elif line:
                    words = line.split('\t')

                    # Migration: converting old metadata into new!
                    if len(words) != self.MSG_FIELDS_V2:
                        # V1 -> V2 adds MSG_CC and MSG_KB
                        if len(words) == self.MSG_FIELDS_V1:
                            words[self.MSG_CC:self.MSG_CC] = ['']
                            words[self.MSG_KB:self.MSG_KB] = ['0']
                        # Add V2 -> V3 here, etc. etc.

                        if len(words) == self.MSG_FIELDS_V2:
                            line = '\t'.join(words)
                        else:
                            raise Exception(_('Your metadata index is either '
                                              'too old, too new or corrupt!'))

                    pos = int(words[self.MSG_MID], 36)
                    while len(self.INDEX) < pos + 1:
                        self.INDEX.append('')

                    self.INDEX[pos] = line
                    self.MSGIDS[words[self.MSG_ID]] = pos
                    self.update_msg_tags(pos, words)
                    for msg_ptr in words[self.MSG_PTRS].split(','):
                        self.PTRS[msg_ptr] = pos
            except ValueError:
                # Malformed lines are silently skipped.
                pass

        if session:
            session.ui.mark(_('Loading metadata index...'))
        try:
            self._lock.acquire()
            fd = open(self.config.mailindex_file(), 'r')
            for line in fd:
                if line.startswith(GPG_BEGIN_MESSAGE):
                    for line in decrypt_gpg([line], fd):
                        process_line(line)
                else:
                    process_line(line)
            fd.close()
        except IOError:
            if session:
                session.ui.warning(_('Metadata index not found: %s'
                                     ) % self.config.mailindex_file())
        finally:
            self._lock.release()
        self.cache_sort_orders(session)
        if session:
            session.ui.mark(
                _('Loaded metadata, %d messages') % len(self.INDEX))
        self.EMAILS_SAVED = len(self.EMAILS)

    def update_msg_tags(self, msg_idx_pos, msg_info):
        """Sync the TAGS reverse map with one message's tag list."""
        tags = set([t for t in msg_info[self.MSG_TAGS].split(',') if t])
        # Drop this message from tags it no longer carries...
        for tid in (set(self.TAGS.keys()) - tags):
            self.TAGS[tid] -= set([msg_idx_pos])
        # ...and add it to those it does.
        for tid in tags:
            if tid not in self.TAGS:
                self.TAGS[tid] = set()
            self.TAGS[tid].add(msg_idx_pos)

    def save_changes(self, session=None):
        """Append unsaved (modified) records to the index file.

        Falls back to a full save() after MAX_INCREMENTAL_SAVES
        incremental appends.
        """
        mods, self.MODIFIED = self.MODIFIED, set()
        if mods or len(self.EMAILS) > self.EMAILS_SAVED:
            if self._saved_changes >= self.MAX_INCREMENTAL_SAVES:
                return self.save(session=session)
            try:
                self._lock.acquire()
                if session:
                    session.ui.mark(_("Saving metadata index changes..."))
                fd = gpg_open(self.config.mailindex_file(),
                              self.config.prefs.gpg_recipient, 'a')
                # Only e-mail records added since the last save go out.
                for eid in range(self.EMAILS_SAVED, len(self.EMAILS)):
                    quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                    fd.write('@%s\t%s\n' % (b36(eid), quoted_email))
                for pos in mods:
                    fd.write(self.INDEX[pos] + '\n')
                fd.close()
                flush_append_cache()
                if session:
                    session.ui.mark(_("Saved metadata index changes"))
                self.EMAILS_SAVED = len(self.EMAILS)
                self._saved_changes += 1
            finally:
                self._lock.release()

    def save(self, session=None):
        """Write the entire metadata index to disk from scratch.

        Writes to a '.new' file first, then rotates backups and
        renames it into place.
        """
        try:
            self._lock.acquire()
            self.MODIFIED = set()
            if session:
                session.ui.mark(_("Saving metadata index..."))
            idxfile = self.config.mailindex_file()
            newfile = '%s.new' % idxfile
            fd = gpg_open(newfile, self.config.prefs.gpg_recipient, 'w')
            fd.write('# This is the mailpile.py index file.\n')
            fd.write('# We have %d messages!\n' % len(self.INDEX))
            for eid in range(0, len(self.EMAILS)):
                quoted_email = quote(self.EMAILS[eid].encode('utf-8'))
                fd.write('@%s\t%s\n' % (b36(eid), quoted_email))
            for item in self.INDEX:
                fd.write(item + '\n')
            fd.close()

            # Keep the last 5 index files around... just in case.
            backup_file(idxfile, backups=5, min_age_delta=10)
            os.rename(newfile, idxfile)

            flush_append_cache()
            self._saved_changes = 0
            if session:
                session.ui.mark(_("Saved metadata index"))
        finally:
            self._lock.release()

    def update_ptrs_and_msgids(self, session):
        """Rebuild the PTRS and MSGIDS lookup maps from the raw index."""
        session.ui.mark(_('Updating high level indexes'))
        for offset in range(0, len(self.INDEX)):
            message = self.l2m(self.INDEX[offset])
            if len(message) == self.MSG_FIELDS_V2:
                self.MSGIDS[message[self.MSG_ID]] = offset
                for msg_ptr in message[self.MSG_PTRS].split(','):
                    self.PTRS[msg_ptr] = offset
            else:
                # NOTE(review): 'line' is not defined in this scope, so
                # reaching this branch would raise NameError; probably
                # meant self.INDEX[offset] or message.
                session.ui.warning(_('Bogus line: %s') % line)

    def try_decode(self, text, charset):
        """Decode text using charset with fallbacks; strip non-ASCII
        as a last resort."""
        for cs in (charset, 'iso-8859-1', 'utf-8'):
            if cs:
                try:
                    return text.decode(cs)
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    pass
        return "".join(i for i in text if ord(i) < 128)

    def hdr(self, msg, name, value=None):
        """Fetch and decode a message header as a single-line string."""
        try:
            if value is None and msg:
                # Security: RFC822 headers are not allowed to have (unencoded)
                # non-ascii characters in them, so we just strip them all out
                # before parsing.
                # FIXME: This is "safe", but can we be smarter/gentler?
                value = CleanText(msg[name], replace='_').clean
            # Note: decode_header does the wrong thing with "quoted" data.
            decoded = email.header.decode_header(
                (value or '').replace('"', ''))
            return (' '.join([self.try_decode(t[0], t[1]) for t in decoded])
                    ).replace('\r', ' ').replace('\t', ' ').replace('\n', ' ')
        except email.errors.HeaderParseError:
            return ''

    def update_location(self, session, msg_idx_pos, msg_ptr):
        """Record a new mailbox location (pointer) for a known message."""
        msg_info = self.get_msg_at_idx_pos(msg_idx_pos)
        msg_ptrs = msg_info[self.MSG_PTRS].split(',')
        self.PTRS[msg_ptr] = msg_idx_pos

        # If message was seen in this mailbox before, update the location
        for i in range(0, len(msg_ptrs)):
            if msg_ptrs[i][:MBX_ID_LEN] == msg_ptr[:MBX_ID_LEN]:
                msg_ptrs[i] = msg_ptr
                msg_ptr = None
                break

        # Otherwise, this is a new mailbox, record this sighting as well!
        if msg_ptr:
            msg_ptrs.append(msg_ptr)
        msg_info[self.MSG_PTRS] = ','.join(msg_ptrs)
        self.set_msg_at_idx_pos(msg_idx_pos, msg_info)

    def _parse_date(self, date_hdr):
        """Parse a Date: or Received: header into a unix timestamp."""
        try:
            if ';' in date_hdr:
                # Received: headers put the date after the last ';'.
                date_hdr = date_hdr.split(';')[-1].strip()
            msg_ts = long(rfc822.mktime_tz(rfc822.parsedate_tz(date_hdr)))
            # Reject timestamps more than a day in the future or before
            # the epoch.
            if (msg_ts > (time.time() + 24 * 3600)) or (msg_ts < 1):
                return None
            else:
                return msg_ts
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_date_ts(self, session, msg_mid, msg_id, msg, last_date):
        """Extract a date, sanity checking against the Received: headers."""
        hdrs = [self.hdr(msg, 'date')] + (msg.get_all('received') or [])
        dates = [self._parse_date(date_hdr) for date_hdr in hdrs]
        msg_ts = dates[0]
        nz_dates = sorted([d for d in dates if d])
        if nz_dates:
            median = nz_dates[len(nz_dates) / 2]
            # Trust the Date: header only if it is within a month of
            # the median of all parseable dates.
            if msg_ts and abs(msg_ts - median) < 31 * 24 * 3600:
                return msg_ts
            else:
                session.ui.warning(
                    _('=%s/%s using Received: instead of Date:'
                      ) % (msg_mid, msg_id))
                return median
        else:
            # If the above fails, we assume the messages in the mailbox are in
            # chronological order and just add 1 second to the date of the last
            # message if date parsing fails for some reason.
            session.ui.warning(
                _('=%s/%s has a bogus date') % (msg_mid, msg_id))
            return last_date + 1

    def encode_msg_id(self, msg_id):
        """Hash a raw Message-ID into the compact form used as a key."""
        return b64c(sha1b64(msg_id.strip()))

    def get_msg_id(self, msg, msg_ptr):
        """Return a stable hashed ID for msg, synthesizing one if the
        message has no usable Message-ID header."""
        raw_msg_id = self.hdr(msg, 'message-id')
        if not raw_msg_id:
            # Create a very long pseudo-msgid for messages without a
            # Message-ID. This was a very badly behaved mailer, so if
            # we create duplicates this way, we are probably only
            # losing spam. Even then the Received line should save us.
            raw_msg_id = ('\t'.join([self.hdr(msg, 'date'),
                                     self.hdr(msg, 'subject'),
                                     self.hdr(msg, 'received'),
                                     self.hdr(msg, 'from'),
                                     self.hdr(msg, 'to')])).strip()
        # Fall back to the msg_ptr if all else fails.
        if not raw_msg_id:
            print _('WARNING: No proper Message-ID for %s') % msg_ptr
        return self.encode_msg_id(raw_msg_id or msg_ptr)

    def scan_mailbox(self, session, mailbox_idx, mailbox_fn, mailbox_opener):
        """Scan one mailbox for unparsed messages, indexing each.

        Returns the number of messages added or relocated.
        """
        try:
            mbox = mailbox_opener(session, mailbox_idx)
            if mbox.editable:
                session.ui.mark(
                    _('%s: Skipped: %s') % (mailbox_idx, mailbox_fn))
                return 0
            else:
                session.ui.mark(
                    _('%s: Checking: %s') % (mailbox_idx, mailbox_fn))
        except (IOError, OSError, NoSuchMailboxError), e:
            session.ui.mark(
                _('%s: Error opening: %s (%s)') % (mailbox_idx,
                                                   mailbox_fn, e))
            return 0

        unparsed = mbox.unparsed()
        if not unparsed:
            return 0

        if len(self.PTRS.keys()) == 0:
            self.update_ptrs_and_msgids(session)

        snippet_max = session.config.sys.snippet_max
        added = 0
        msg_ts = int(time.time())
        for ui in range(0, len(unparsed)):
            if mailpile.util.QUITTING:
                break

            i = unparsed[ui]
            parse_status = _('%s: Reading your mail: %d%% (%d/%d messages)'
                             ) % (mailbox_idx, 100 * ui / len(unparsed),
                                  ui, len(unparsed))
            msg_ptr = mbox.get_msg_ptr(mailbox_idx, i)
            if msg_ptr in self.PTRS:
                # Already indexed at this location; only refresh the
                # status line occasionally to keep the loop fast.
                if (ui % 317) == 0:
                    session.ui.mark(parse_status)
                    play_nice_with_threads()
                continue
            else:
                session.ui.mark(parse_status)
                play_nice_with_threads()

            # Message new or modified, let's parse it.
            if 'rescan' in session.config.sys.debug:
                session.ui.debug('Reading message %s/%s' % (mailbox_idx, i))
            try:
                msg_fd = mbox.get_file(i)
                msg = ParseMessage(
                    msg_fd, pgpmime=session.config.prefs.index_encrypted)
            except (IOError, OSError, ValueError, IndexError, KeyError):
                # A single broken message should not abort the scan.
                if session.config.sys.debug:
                    traceback.print_exc()
                session.ui.warning(('Reading message %s/%s FAILED, skipping'
                                    ) % (mailbox_idx, i))
                continue

            # File position after parsing == message size in bytes.
            msg_size = msg_fd.tell()
            msg_id = self.get_msg_id(msg, msg_ptr)
            if msg_id in self.MSGIDS:
                # Known message seen in a new place: just update PTRS.
                self.update_location(session, self.MSGIDS[msg_id], msg_ptr)
                added += 1
            else:
                # Add new message!
                msg_mid = b36(len(self.INDEX))
                msg_ts = self._extract_date_ts(session, msg_mid, msg_id,
                                               msg, msg_ts)
                play_nice_with_threads()
                keywords, snippet = self.index_message(
                    session, msg_mid, msg_id, msg, msg_size, msg_ts,
                    mailbox=mailbox_idx, compact=False,
                    filter_hooks=plugins.filter_hooks([self.filter_keywords]))

                msg_subject = self.hdr(msg, 'subject')
                msg_snippet = snippet[:max(0,
                                           snippet_max - len(msg_subject))]

                tags = [k.split(':')[0] for k in keywords
                        if k.endswith(':in') or k.endswith(':tag')]

                msg_to = ExtractEmails(self.hdr(msg, 'to'))
                msg_cc = (ExtractEmails(self.hdr(msg, 'cc')) +
                          ExtractEmails(self.hdr(msg, 'bcc')))

                msg_idx_pos, msg_info = self.add_new_msg(
                    msg_ptr, msg_id, msg_ts, self.hdr(msg, 'from'),
                    msg_to, msg_cc, msg_size, msg_subject, msg_snippet,
                    tags)
                self.set_conversation_ids(msg_info[self.MSG_MID], msg)
                mbox.mark_parsed(i)

                added += 1
                GlobalPostingList.Optimize(session, self,
                                           lazy=True, quick=True)

        if added:
            mbox.save(session)
        session.ui.mark(
            _('%s: Indexed mailbox: %s') % (mailbox_idx, mailbox_fn))
        return added