def process_message(self, peer, mailfrom, rcpttos, data):
    """Accept an incoming SMTP message: spool it to the local mailbox
    and add it to the search index.

    Returns None on success (the smtpd framework treats that as 250 OK),
    or an SMTP error string that is relayed back to the client.
    """
    # We can assume that the mailfrom and rcpttos have checked out
    # and this message is indeed intended for us. Spool it to disk
    # and add to the index!
    session, config = self.session, self.session.config
    blank_tid = config.get_tags(type='blank')[0]._key
    idx = config.index
    play_nice_with_threads()
    try:
        message = email.parser.Parser().parsestr(data)
        lid, lmbox = config.open_local_mailbox(session)
        e = Email.Create(idx, lid, lmbox, ephemeral_mid=False)
        # Tag the message "blank" while it is incomplete, so searches
        # ignore it until update_from_msg() has filled it in.
        idx.add_tag(session, blank_tid,
                    msg_idxs=[e.msg_idx_pos], conversation=False)
        e.update_from_msg(session, message)
        idx.remove_tag(session, blank_tid,
                       msg_idxs=[e.msg_idx_pos], conversation=False)
        return None
    except Exception:
        # FIX: was a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt; catch Exception instead.
        traceback.print_exc()
        return '400 Oops wtf'
def process_message(self, peer, mailfrom, rcpttos, data):
    """Accept an incoming SMTP message: spool it to the local mailbox
    and add it to the search index.

    Returns None on success (the smtpd framework treats that as 250 OK),
    or an SMTP error string that is relayed back to the client.
    """
    # We can assume that the mailfrom and rcpttos have checked out
    # and this message is indeed intended for us. Spool it to disk
    # and add to the index!
    session, config = self.session, self.session.config
    blank_tid = config.get_tags(type='blank')[0]._key
    idx = config.index
    try:
        message = email.parser.Parser().parsestr(data)
        lid, lmbox = config.open_local_mailbox(session)
        e = Email.Create(idx, lid, lmbox, ephemeral_mid=False)
        # Tag the message "blank" while it is incomplete, so searches
        # ignore it until update_from_msg() has filled it in.
        idx.add_tag(session, blank_tid,
                    msg_idxs=[e.msg_idx_pos], conversation=False)
        e.update_from_msg(session, message)
        idx.remove_tag(session, blank_tid,
                       msg_idxs=[e.msg_idx_pos], conversation=False)
        return None
    except Exception:
        # FIX: was a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt; catch Exception instead.
        traceback.print_exc()
        return '400 Oops wtf'
def migrate_mailboxes(session):
    """Migrate legacy standalone mailboxes into mail sources.

    Sorts the configured mailboxes by type (mbox, Maildir, Mac Mail.app,
    Thunderbird) and hands them over to newly created mail-source
    configurations. Mutates session.config.sources in place.
    Returns True unconditionally.
    """
    config = session.config

    def _common_path(paths):
        # Find the longest common directory prefix of a list of paths.
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and
                   head and head != '/' and
                   head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mboxes = []
    maildirs = []
    macmaildirs = []  # NOTE(review): collected but never consumed below
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes():
        if path == '/dev/null' or src is not None:
            continue
        elif os.path.exists(os.path.join(path, 'Info.plist')):
            # Presence of Info.plist marks a Mac Mail.app store
            macmaildirs.append((mbx_id, path))
        elif os.path.isdir(path):
            maildirs.append((mbx_id, path))
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mboxes.append((mbx_id, path))

    # macmail: library/mail/v2

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = MboxMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            # Map well-known folder names to the matching system tags
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        # Discovery was only needed during the take-over above
        tbird_src.my_config.discovery.policy = 'unknown'

    for name, mailboxes, proto, description, cls in (
            ('mboxes', mboxes, 'mbox', 'Unix mbox files', MboxMailSource),
            ('maildirs', maildirs, 'maildir', 'Maildirs', MaildirMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {'name': description,
                                        'protocol': proto}
                config.sources[name].discovery.create_tag = False

            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)

            config.sources[name].discovery.policy = 'unknown'

    return True
def command(self):
    """Retrain the SpamBayes autotaggers listed in self.args (or all
    configured autotaggers when no args are given).

    Returns a dict with the names of the retrained taggers and the
    total number of messages read.
    """
    session, config, idx = self.session, self.session.config, self._idx()
    tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
    tids = [config.get_tag(t)._key for t in tags if t]

    session.ui.mark(_('Retraining SpamBayes autotaggers'))
    if not hasattr(config, 'autotag'):
        config.autotag = {}

    # Find all the interesting messages! We don't look in the trash,
    # but we do look at interesting spam.
    #
    # Note: By specifically stating that we DON'T want trash, we
    #       disable the search engine's default result suppression
    #       and guarantee these results don't corrupt the somewhat
    #       lame/broken result cache.
    #
    no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
    interest = {}
    for ttype in ('replied', 'read', 'tagged'):
        interest[ttype] = set()
        for tag in config.get_tags(type=ttype):
            interest[ttype] |= idx.search(session,
                                          ['in:%s' % tag.slug] + no_trash
                                          ).as_set()
        session.ui.notify(_('Have %d interesting %s messages'
                            ) % (len(interest[ttype]), ttype))

    retrained = []
    count_all = 0
    for at_config in config.prefs.autotag:
        at_tag = config.get_tag(at_config.match_tag)
        if at_tag and at_tag._key in tids:
            session.ui.mark('Retraining: %s' % at_tag.name)

            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session, srch).as_set()
                interesting.append(etag._key)
            interesting.extend(['replied', 'read', 'tagged', None])

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # FIXME: Is this a good idea? No single data source
                    #        is allowed to be more than 50% of the corpus,
                    #        to try and encourage diversity.
                    # FIX: use floor division (//) so `want` stays an int;
                    # plain / yields a float under true division and floats
                    # are invalid slice bounds. Matches _retrain() below.
                    want = min(at_config.corpus_size // 4,
                               max(0, at_config.corpus_size // 2 - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(list(mset & interest[ttype]))
                        else:
                            adding = sorted(list(mset))
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                for msg_idx in tset:
                    e = Email(idx, msg_idx)
                    count += 1
                    count_all += 1
                    session.ui.mark(('Reading %s (%d/%d, %s=%s)'
                                     ) % (e.msg_mid(), count, len(tset),
                                          at_tag.name, which))
                    atagger.learn(at_config,
                                  e.get_msg(),
                                  self._get_keywords(e),
                                  which)

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

    session.ui.mark(_('Retrained SpamBayes auto-tagging for %s'
                      ) % ', '.join(retrained))
    return {'retrained': retrained, 'read_messages': count_all}
def _retrain(self, tags=None):
    """Retrain autotaggers.

    When `tags` is None, retrains every tagger in config.prefs.autotag;
    otherwise only the taggers whose match_tag is listed. Returns a
    success result with the retrained tagger names, the indexes of any
    unreadable messages, and the total message count read.
    """
    session, config, idx = self.session, self.session.config, self._idx()
    tags = tags or [asb.match_tag for asb in config.prefs.autotag]
    tids = [config.get_tag(t)._key for t in tags if t]

    session.ui.mark(_('Retraining SpamBayes autotaggers'))
    if not config.real_hasattr('autotag'):
        config.real_setattr('autotag', {})

    # Find all the interesting messages! We don't look in the trash,
    # but we do look at interesting spam.
    #
    # Note: By specifically stating that we DON'T want trash, we
    #       disable the search engine's default result suppression
    #       and guarantee these results don't corrupt the somewhat
    #       lame/broken result cache.
    #
    no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
    interest = {}
    for ttype in ('replied', 'fwded', 'read', 'tagged'):
        interest[ttype] = set()
        for tag in config.get_tags(type=ttype):
            interest[ttype] |= idx.search(session,
                                          ['in:%s' % tag.slug] + no_trash
                                          ).as_set()
        session.ui.notify(
            _('Have %d interesting %s messages'
              ) % (len(interest[ttype]), ttype))

    retrained, unreadable = [], []
    count_all = 0
    for at_config in config.prefs.autotag:
        at_tag = config.get_tag(at_config.match_tag)
        if at_tag and at_tag._key in tids:
            session.ui.mark('Retraining: %s' % at_tag.name)

            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session,
                                                     srch).as_set()
                interesting.append(etag._key)
            interesting.extend(
                ['replied', 'fwded', 'read', 'tagged', None])

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # False positives are really annoying, and generally
                    # speaking any autotagged subset should be a small
                    # part of the Universe. So we divide the corpus
                    # budget 33% True, 67% False.
                    full_size = int(at_config.corpus_size *
                                    (0.33 if which else 0.67))
                    want = min(full_size // 4,
                               max(0, full_size - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(list(mset & interest[ttype]))
                        else:
                            adding = sorted(list(mset))
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                # We go through the list of messages in order, to avoid
                # thrashing caches too badly.
                for msg_idx in sorted(list(tset)):
                    try:
                        e = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        session.ui.mark(
                            _('Reading %s (%d/%d, %s=%s)'
                              ) % (e.msg_mid(), count, len(tset),
                                   at_tag.name, which))
                        atagger.learn(at_config,
                                      e.get_msg(),
                                      self._get_keywords(e),
                                      which)
                    except (IndexError, TypeError, ValueError,
                            OSError, IOError):
                        # Unreadable messages are reported, not fatal
                        if session.config.sys.debug:
                            import traceback
                            traceback.print_exc()
                        unreadable.append(msg_idx)
                        session.ui.warning(
                            _('Failed to process message at =%s'
                              ) % (b36(msg_idx)))

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

    message = _('Retrained SpamBayes auto-tagging for %s'
                ) % ', '.join(retrained)
    session.ui.mark(message)
    return self._success(message, result={
        'retrained': retrained,
        'unreadable': unreadable,
        'read_messages': count_all
    })
def migrate_mailboxes(session):
    """Migrate legacy standalone mailboxes into mail sources.

    Like the earlier revision, but also skips 'src:' pseudo-paths and
    editable (drafts/outbox) mailboxes. Mutates session.config.sources
    in place and returns True unconditionally.
    """
    config = session.config

    def _common_path(paths):
        # Find the longest common directory prefix of a list of paths.
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and
                   head and head != '/' and
                   head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mboxes = []
    maildirs = []
    macmaildirs = []  # NOTE(review): collected but never consumed below
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes():
        if (path == '/dev/null' or
                path.startswith('src:') or
                src is not None or
                config.is_editable_mailbox(mbx_id)):
            continue
        elif os.path.exists(os.path.join(path, 'Info.plist')):
            # Presence of Info.plist marks a Mac Mail.app store
            macmaildirs.append((mbx_id, path))
        elif os.path.isdir(path):
            maildirs.append((mbx_id, path))
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mboxes.append((mbx_id, path))

    # macmail: library/mail/v2

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = MboxMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            # Map well-known folder names to the matching system tags
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        # Discovery was only needed during the take-over above
        tbird_src.my_config.discovery.policy = 'unknown'

    for name, mailboxes, proto, description, cls in (
            ('mboxes', mboxes, 'mbox', 'Unix mbox files', MboxMailSource),
            ('maildirs', maildirs, 'maildir', 'Maildirs', MaildirMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {
                    'name': description,
                    'protocol': proto
                }
                config.sources[name].discovery.create_tag = False

            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)

            config.sources[name].discovery.policy = 'unknown'

    return True
def migrate_mailboxes(session): config = session.config # FIXME: This should be using mailpile.vfs.FilePath # FIXME: Link new mail sources to a profile... any profile? def _common_path(paths): common_head, junk = os.path.split(paths[0]) for path in paths: head, junk = os.path.split(path) while (common_head and common_head != '/' and head and head != '/' and head != common_head): # First we try shortening the target path... while head and head != '/' and head != common_head: head, junk = os.path.split(head) # If that failed, lop one off the common path and try again if head != common_head: common_head, junk = os.path.split(common_head) head, junk = os.path.split(path) return common_head mailboxes = [] thunderbird = [] spam_tids = [tag._key for tag in config.get_tags(type='spam')] trash_tids = [tag._key for tag in config.get_tags(type='trash')] inbox_tids = [tag._key for tag in config.get_tags(type='inbox')] # Iterate through config.sys.mailbox, sort mailboxes by type for mbx_id, path, src in config.get_mailboxes(with_mail_source=False): if (path.startswith('src:') or config.is_editable_mailbox(mbx_id)): continue elif 'thunderbird' in path.lower(): thunderbird.append((mbx_id, path)) else: mailboxes.append((mbx_id, path)) if thunderbird: # Create basic mail source... if 'tbird' not in config.sources: config.sources['tbird'] = { 'name': 'Thunderbird', 'protocol': 'mbox', } config.sources.tbird.discovery.create_tag = True config.sources.tbird.discovery.policy = 'read' config.sources.tbird.discovery.process_new = True tbird_src = LocalMailSource(session, config.sources.tbird) # Configure discovery policy? root = _common_path([path for mbx_id, path in thunderbird]) if 'thunderbird' in root.lower(): # FIXME: This is wrong, we should create a mailbox entry # with the policy 'watch'. 
tbird_src.my_config.discovery.path = root # Take over all the mailboxes for mbx_id, path in thunderbird: mbx = tbird_src.take_over_mailbox(mbx_id) if 'inbox' in path.lower(): mbx.apply_tags.extend(inbox_tids) elif 'spam' in path.lower() or 'junk' in path.lower(): mbx.apply_tags.extend(spam_tids) elif 'trash' in path.lower(): mbx.apply_tags.extend(trash_tids) tbird_src.my_config.discovery.policy = 'unknown' for name, proto, description, cls in ( ('mboxes', 'local', 'Local mailboxes', LocalMailSource), ): if mailboxes: # Create basic mail source... if name not in config.sources: config.sources[name] = { 'name': description, 'protocol': proto } config.sources[name].discovery.create_tag = False config.sources[name].discovery.policy = 'read' config.sources[name].discovery.process_new = True config.sources[name].discovery.apply_tags = inbox_tids[:] src = cls(session, config.sources[name]) for mbx_id, path in mailboxes: mbx = src.take_over_mailbox(mbx_id) config.sources[name].discovery.policy = 'unknown' return True
def command(self):
    """Retrain the SpamBayes autotaggers listed in self.args (or all
    configured autotaggers when no args are given).

    Returns a dict with the names of the retrained taggers and the
    total number of messages read.
    """
    session, config, idx = self.session, self.session.config, self._idx()
    tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
    tids = [config.get_tag(t)._key for t in tags if t]

    session.ui.mark(_('Retraining SpamBayes autotaggers'))
    if not hasattr(config, 'autotag'):
        config.autotag = {}

    # Find all the interesting messages! We don't look in the trash,
    # but we do look at interesting spam.
    #
    # Note: By specifically stating that we DON'T want trash, we
    #       disable the search engine's default result suppression
    #       and guarantee these results don't corrupt the somewhat
    #       lame/broken result cache.
    #
    no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
    interest = {}
    for ttype in ('replied', 'fwded', 'read', 'tagged'):
        interest[ttype] = set()
        for tag in config.get_tags(type=ttype):
            interest[ttype] |= idx.search(session,
                                          ['in:%s' % tag.slug] + no_trash
                                          ).as_set()
        session.ui.notify(
            _('Have %d interesting %s messages'
              ) % (len(interest[ttype]), ttype))

    retrained = []
    count_all = 0
    for at_config in config.prefs.autotag:
        at_tag = config.get_tag(at_config.match_tag)
        if at_tag and at_tag._key in tids:
            session.ui.mark('Retraining: %s' % at_tag.name)

            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session,
                                                     srch).as_set()
                interesting.append(etag._key)
            interesting.extend(
                ['replied', 'fwded', 'read', 'tagged', None])

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # FIXME: Is this a good idea? No single data source
                    #        is allowed to be more than 50% of the corpus,
                    #        to try and encourage diversity.
                    # FIX: use floor division (//) so `want` stays an int;
                    # plain / yields a float under true division and floats
                    # are invalid slice bounds. Matches _retrain() below.
                    want = min(
                        at_config.corpus_size // 4,
                        max(0, at_config.corpus_size // 2 - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(list(mset & interest[ttype]))
                        else:
                            adding = sorted(list(mset))
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                for msg_idx in tset:
                    e = Email(idx, msg_idx)
                    count += 1
                    count_all += 1
                    session.ui.mark(('Reading %s (%d/%d, %s=%s)')
                                    % (e.msg_mid(), count, len(tset),
                                       at_tag.name, which))
                    atagger.learn(at_config,
                                  e.get_msg(),
                                  self._get_keywords(e),
                                  which)

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

    session.ui.mark(
        _('Retrained SpamBayes auto-tagging for %s')
        % ', '.join(retrained))
    return {'retrained': retrained, 'read_messages': count_all}
def _retrain(self, tags=None):
    """Retrain autotaggers.

    When `tags` is None, retrains every tagger in config.prefs.autotag;
    otherwise only the taggers whose match_tag is listed. Returns a
    success result with the retrained tagger names, the indexes of any
    unreadable messages, and the total message count read.
    """
    session, config, idx = self.session, self.session.config, self._idx()
    tags = tags or [asb.match_tag for asb in config.prefs.autotag]
    tids = [config.get_tag(t)._key for t in tags if t]

    session.ui.mark(_('Retraining SpamBayes autotaggers'))
    if not config.real_hasattr('autotag'):
        config.real_setattr('autotag', {})

    # Find all the interesting messages! We don't look in the trash,
    # but we do look at interesting spam.
    #
    # Note: By specifically stating that we DON'T want trash, we
    #       disable the search engine's default result suppression
    #       and guarantee these results don't corrupt the somewhat
    #       lame/broken result cache.
    #
    no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
    interest = {}
    for ttype in ('replied', 'fwded', 'read', 'tagged'):
        interest[ttype] = set()
        for tag in config.get_tags(type=ttype):
            interest[ttype] |= idx.search(session,
                                          ['in:%s' % tag.slug] + no_trash
                                          ).as_set()
        session.ui.notify(_('Have %d interesting %s messages'
                            ) % (len(interest[ttype]), ttype))

    retrained, unreadable = [], []
    count_all = 0
    for at_config in config.prefs.autotag:
        at_tag = config.get_tag(at_config.match_tag)
        if at_tag and at_tag._key in tids:
            session.ui.mark('Retraining: %s' % at_tag.name)

            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session, srch
                                                     ).as_set()
                interesting.append(etag._key)
            interesting.extend(['replied', 'fwded', 'read', 'tagged',
                                None])

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # False positives are really annoying, and generally
                    # speaking any autotagged subset should be a small
                    # part of the Universe. So we divide the corpus
                    # budget 33% True, 67% False.
                    full_size = int(at_config.corpus_size *
                                    (0.33 if which else 0.67))
                    want = min(full_size // 4,
                               max(0, full_size - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(list(mset & interest[ttype]))
                        else:
                            adding = sorted(list(mset))
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                # We go through the list of messages in order, to avoid
                # thrashing caches too badly.
                for msg_idx in sorted(list(tset)):
                    try:
                        e = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        session.ui.mark(
                            _('Reading %s (%d/%d, %s=%s)'
                              ) % (e.msg_mid(), count, len(tset),
                                   at_tag.name, which))
                        atagger.learn(at_config,
                                      e.get_msg(),
                                      self._get_keywords(e),
                                      which)
                    except (IndexError, TypeError, ValueError,
                            OSError, IOError):
                        # Unreadable messages are reported, not fatal
                        if session.config.sys.debug:
                            import traceback
                            traceback.print_exc()
                        unreadable.append(msg_idx)
                        session.ui.warning(
                            _('Failed to process message at =%s'
                              ) % (b36(msg_idx)))

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

    message = _('Retrained SpamBayes auto-tagging for %s'
                ) % ', '.join(retrained)
    session.ui.mark(message)
    return self._success(message, result={
        'retrained': retrained,
        'unreadable': unreadable,
        'read_messages': count_all
    })