Example #1
 def process_message(self, peer, mailfrom, rcpttos, data):
     # We can assume that the mailfrom and rcpttos have checked out
     # and this message is indeed intended for us. Spool it to disk
     # and add to the index!
     session, config = self.session, self.session.config
     blank_tid = config.get_tags(type='blank')[0]._key
     idx = config.index
     play_nice_with_threads()
     try:
         message = email.parser.Parser().parsestr(data)
         lid, lmbox = config.open_local_mailbox(session)
         e = Email.Create(idx, lid, lmbox, ephemeral_mid=False)
         idx.add_tag(session,
                     blank_tid,
                     msg_idxs=[e.msg_idx_pos],
                     conversation=False)
         e.update_from_msg(session, message)
         idx.remove_tag(session,
                        blank_tid,
                        msg_idxs=[e.msg_idx_pos],
                        conversation=False)
         return None
     except:
         traceback.print_exc()
         return '400 Oops wtf'
Example #2
 def process_message(self, peer, mailfrom, rcpttos, data):
     # We can assume that the mailfrom and rcpttos have checked out
     # and this message is indeed intended for us. Spool it to disk
     # and add to the index!
     session, config = self.session, self.session.config
     blank_tid = config.get_tags(type='blank')[0]._key
     idx = config.index
     try:
         message = email.parser.Parser().parsestr(data)
         lid, lmbox = config.open_local_mailbox(session)
         e = Email.Create(idx, lid, lmbox, ephemeral_mid=False)
         idx.add_tag(session, blank_tid, msg_idxs=[e.msg_idx_pos],
                     conversation=False)
         e.update_from_msg(session, message)
         idx.remove_tag(session, blank_tid, msg_idxs=[e.msg_idx_pos],
                        conversation=False)
         return None
     except:
         traceback.print_exc()
         return '400 Oops wtf'
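The process_message signature above matches the handler hook of the (legacy) standard-library smtpd.SMTPServer, which calls process_message(peer, mailfrom, rcpttos, data) for every accepted message and treats a None return value as success. A minimal sketch of how such a handler could be driven, using a hypothetical SpoolingSMTPServer wrapper that is not part of the code above:

import asyncore
import smtpd

class SpoolingSMTPServer(smtpd.SMTPServer):
    # Hypothetical wrapper: delegates each accepted message to an object
    # exposing process_message() like the examples above.
    def __init__(self, handler, localaddr=('127.0.0.1', 2525)):
        smtpd.SMTPServer.__init__(self, localaddr, None)
        self.handler = handler

    def process_message(self, peer, mailfrom, rcpttos, data):
        # None -> "250 OK"; a string (e.g. '400 Oops wtf') is sent back
        # to the client as the SMTP response.
        return self.handler.process_message(peer, mailfrom, rcpttos, data)

# asyncore.loop() would then run the server until interrupted.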
Example #3
def migrate_mailboxes(session):
    config = session.config

    def _common_path(paths):
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and head and head != '/'
                   and head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mboxes = []
    maildirs = []
    macmaildirs = []
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes():
        if path == '/dev/null' or src is not None:
            continue
        elif os.path.exists(os.path.join(path, 'Info.plist')):
            macmaildirs.append((mbx_id, path))
        elif os.path.isdir(path):
            maildirs.append((mbx_id, path))
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mboxes.append((mbx_id, path))

    # macmail: library/mail/v2

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = MboxMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        tbird_src.my_config.discovery.policy = 'unknown'

    for name, mailboxes, proto, description, cls in (
        ('mboxes', mboxes, 'mbox', 'Unix mbox files', MboxMailSource),
        ('maildirs', maildirs, 'maildir', 'Maildirs', MaildirMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {'name': description, 'protocol': proto}
                config.sources[name].discovery.create_tag = False
            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)
            config.sources[name].discovery.policy = 'unknown'

    return True
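The _common_path helper above repeatedly shortens a candidate ancestor until every mailbox path reaches it, and the result (if it still contains 'thunderbird') becomes the discovery path of the Thunderbird mail source. A quick illustration with hypothetical profile paths:

# Hypothetical Thunderbird-style paths, for illustration only.
paths = [
    '/home/user/.thunderbird/abc.default/Mail/Local Folders/Inbox',
    '/home/user/.thunderbird/abc.default/Mail/Local Folders/Trash',
    '/home/user/.thunderbird/abc.default/Mail/pop.example.com/Inbox',
]
# _common_path(paths) == '/home/user/.thunderbird/abc.default/Mail'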
Example #4
    def command(self):
        session, config, idx = self.session, self.session.config, self._idx()
        tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not hasattr(config, 'autotag'):
            config.autotag = {}

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session,
                                              ['in:%s' % tag.slug] + no_trash
                                              ).as_set()
            session.ui.notify(_('Have %d interesting %s messages'
                                ) % (len(interest[ttype]), ttype))

        retrained = []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session, srch
                                                         ).as_set()
                    interesting.append(etag._key)
                interesting.extend(['replied', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # FIXME: Is this a good idea? No single data source
                        # is allowed to be more than 50% of the corpus, to
                        # try and encourage diversity.
                        want = min(at_config.corpus_size / 4,
                                   max(0,
                                       at_config.corpus_size / 2 - len(tset)))
                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    for msg_idx in tset:
                        e = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        session.ui.mark(('Reading %s (%d/%d, %s=%s)'
                                         ) % (e.msg_mid(), count, len(tset),
                                              at_tag.name, which))
                        atagger.learn(at_config,
                                      e.get_msg(),
                                      self._get_keywords(e),
                                      which)

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        session.ui.mark(_('Retrained SpamBayes auto-tagging for %s'
                          ) % ', '.join(retrained))
        return {'retrained': retrained, 'read_messages': count_all}
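The want calculation above lets each interest type contribute at most a quarter of the corpus per pass, while each class (tagged / untagged) fills at most half of it in total. A worked example with a hypothetical corpus_size of 1000 (the original runs under Python 2, where / on integers already truncates):

corpus_size = 1000

# First pass, empty training set:
min(corpus_size // 4, max(0, corpus_size // 2 - 0))      # -> 250

# Later pass, 400 messages already collected for this class:
min(corpus_size // 4, max(0, corpus_size // 2 - 400))    # -> 100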
Example #5
    def _retrain(self, tags=None):
        "Retrain autotaggers"
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'fwded', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session, ['in:%s' % tag.slug] +
                                              no_trash).as_set()
            session.ui.notify(
                _('Have %d interesting %s messages') %
                (len(interest[ttype]), ttype))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session,
                                                         srch).as_set()
                    interesting.append(etag._key)
                interesting.extend(
                    ['replied', 'fwded', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # False positives are really annoying, and generally
                        # speaking any autotagged subset should be a small
                        # part of the Universe. So we divide the corpus
                        # budget 33% True, 67% False.
                        full_size = int(at_config.corpus_size *
                                        (0.33 if which else 0.67))
                        want = min(full_size // 4, max(0,
                                                       full_size - len(tset)))
                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    # We go through the list of messages in order, to avoid
                    # thrashing caches too badly.
                    for msg_idx in sorted(list(tset)):
                        try:
                            e = Email(idx, msg_idx)
                            count += 1
                            count_all += 1
                            session.ui.mark(
                                _('Reading %s (%d/%d, %s=%s)') %
                                (e.msg_mid(), count, len(tset), at_tag.name,
                                 which))
                            atagger.learn(at_config, e.get_msg(),
                                          self._get_keywords(e), which)
                        except (IndexError, TypeError, ValueError, OSError,
                                IOError):
                            if session.config.sys.debug:
                                import traceback
                                traceback.print_exc()
                            unreadable.append(msg_idx)
                            session.ui.warning(
                                _('Failed to process message at =%s') %
                                (b36(msg_idx)))

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s') % ', '.join(
            retrained)
        session.ui.mark(message)
        return self._success(message,
                             result={
                                 'retrained': retrained,
                                 'unreadable': unreadable,
                                 'read_messages': count_all
                             })
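Compared with Example #4, the corpus budget here is skewed towards the negative class so that false positives stay rare. With a hypothetical corpus_size of 1000:

corpus_size = 1000
for which in (True, False):
    full_size = int(corpus_size * (0.33 if which else 0.67))
    # which=True  (messages carrying the tag):  full_size == 330
    # which=False (messages without the tag):   full_size == 670
    # Each interest type then adds at most full_size // 4 messages
    # per pass, until the class reaches full_size in total.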
Example #6
def migrate_mailboxes(session):
    config = session.config

    def _common_path(paths):
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and
                   head and head != '/' and
                   head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mboxes = []
    maildirs = []
    macmaildirs = []
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes():
        if (path == '/dev/null' or
                path.startswith('src:') or
                src is not None or
                config.is_editable_mailbox(mbx_id)):
            continue
        elif os.path.exists(os.path.join(path, 'Info.plist')):
            macmaildirs.append((mbx_id, path))
        elif os.path.isdir(path):
            maildirs.append((mbx_id, path))
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mboxes.append((mbx_id, path))

    # macmail: library/mail/v2

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = MboxMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        tbird_src.my_config.discovery.policy = 'unknown'

    for name, mailboxes, proto, description, cls in (
        ('mboxes', mboxes, 'mbox', 'Unix mbox files', MboxMailSource),
        ('maildirs', maildirs, 'maildir', 'Maildirs', MaildirMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {
                    'name': description,
                    'protocol': proto
                }
                config.sources[name].discovery.create_tag = False
            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)
            config.sources[name].discovery.policy = 'unknown'

    return True
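The take-over loop above picks tags purely from keywords in the mailbox path. The same decision, pulled out into a small standalone helper (hypothetical, for illustration only):

def tags_for_path(path, inbox_tids, spam_tids, trash_tids):
    # Mirrors the if/elif chain in the Thunderbird take-over loop.
    p = path.lower()
    if 'inbox' in p:
        return list(inbox_tids)
    elif 'spam' in p or 'junk' in p:
        return list(spam_tids)
    elif 'trash' in p:
        return list(trash_tids)
    return []

# tags_for_path('/mail/Junk', ['i1'], ['s1'], ['t1']) -> ['s1']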
Example #7
def migrate_mailboxes(session):
    config = session.config

    # FIXME: This should be using mailpile.vfs.FilePath
    # FIXME: Link new mail sources to a profile... any profile?

    def _common_path(paths):
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and
                   head and head != '/' and
                   head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mailboxes = []
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes(with_mail_source=False):
        if (path.startswith('src:') or
                config.is_editable_mailbox(mbx_id)):
            continue
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mailboxes.append((mbx_id, path))

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = LocalMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        tbird_src.my_config.discovery.policy = 'unknown'

    for name, proto, description, cls in (
        ('mboxes', 'local', 'Local mailboxes', LocalMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {
                    'name': description,
                    'protocol': proto
                }
                config.sources[name].discovery.create_tag = False
            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)
            config.sources[name].discovery.policy = 'unknown'

    return True
Example #8
def migrate_mailboxes(session):
    config = session.config

    # FIXME: This should be using mailpile.vfs.FilePath
    # FIXME: Link new mail sources to a profile... any profile?

    def _common_path(paths):
        common_head, junk = os.path.split(paths[0])
        for path in paths:
            head, junk = os.path.split(path)
            while (common_head and common_head != '/' and
                   head and head != '/' and
                   head != common_head):
                # First we try shortening the target path...
                while head and head != '/' and head != common_head:
                    head, junk = os.path.split(head)
                # If that failed, lop one off the common path and try again
                if head != common_head:
                    common_head, junk = os.path.split(common_head)
                    head, junk = os.path.split(path)
        return common_head

    mailboxes = []
    thunderbird = []

    spam_tids = [tag._key for tag in config.get_tags(type='spam')]
    trash_tids = [tag._key for tag in config.get_tags(type='trash')]
    inbox_tids = [tag._key for tag in config.get_tags(type='inbox')]

    # Iterate through config.sys.mailbox, sort mailboxes by type
    for mbx_id, path, src in config.get_mailboxes(with_mail_source=False):
        if (path.startswith('src:') or
                config.is_editable_mailbox(mbx_id)):
            continue
        elif 'thunderbird' in path.lower():
            thunderbird.append((mbx_id, path))
        else:
            mailboxes.append((mbx_id, path))

    if thunderbird:
        # Create basic mail source...
        if 'tbird' not in config.sources:
            config.sources['tbird'] = {
                'name': 'Thunderbird',
                'protocol': 'mbox',
            }
            config.sources.tbird.discovery.create_tag = True

        config.sources.tbird.discovery.policy = 'read'
        config.sources.tbird.discovery.process_new = True
        tbird_src = LocalMailSource(session, config.sources.tbird)

        # Configure discovery policy?
        root = _common_path([path for mbx_id, path in thunderbird])
        if 'thunderbird' in root.lower():
            # FIXME: This is wrong, we should create a mailbox entry
            #        with the policy 'watch'.
            tbird_src.my_config.discovery.path = root

        # Take over all the mailboxes
        for mbx_id, path in thunderbird:
            mbx = tbird_src.take_over_mailbox(mbx_id)
            if 'inbox' in path.lower():
                mbx.apply_tags.extend(inbox_tids)
            elif 'spam' in path.lower() or 'junk' in path.lower():
                mbx.apply_tags.extend(spam_tids)
            elif 'trash' in path.lower():
                mbx.apply_tags.extend(trash_tids)

        tbird_src.my_config.discovery.policy = 'unknown'

    for name, proto, description, cls in (
        ('mboxes', 'local', 'Local mailboxes', LocalMailSource),
    ):
        if mailboxes:
            # Create basic mail source...
            if name not in config.sources:
                config.sources[name] = {
                    'name': description,
                    'protocol': proto
                }
                config.sources[name].discovery.create_tag = False
            config.sources[name].discovery.policy = 'read'
            config.sources[name].discovery.process_new = True
            config.sources[name].discovery.apply_tags = inbox_tids[:]
            src = cls(session, config.sources[name])
            for mbx_id, path in mailboxes:
                mbx = src.take_over_mailbox(mbx_id)
            config.sources[name].discovery.policy = 'unknown'

    return True
Example #9
    def command(self):
        session, config, idx = self.session, self.session.config, self._idx()
        tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not hasattr(config, 'autotag'):
            config.autotag = {}

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'fwded', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session, ['in:%s' % tag.slug] +
                                              no_trash).as_set()
            session.ui.notify(
                _('Have %d interesting %s messages') %
                (len(interest[ttype]), ttype))

        retrained = []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session,
                                                         srch).as_set()
                    interesting.append(etag._key)
                interesting.extend(
                    ['replied', 'fwded', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # FIXME: Is this a good idea? No single data source
                        # is allowed to be more than 50% of the corpus, to
                        # try and encourage diversity.
                        want = min(
                            at_config.corpus_size / 4,
                            max(0, at_config.corpus_size / 2 - len(tset)))
                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    for msg_idx in tset:
                        e = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        session.ui.mark(('Reading %s (%d/%d, %s=%s)') %
                                        (e.msg_mid(), count, len(tset),
                                         at_tag.name, which))
                        atagger.learn(at_config, e.get_msg(),
                                      self._get_keywords(e), which)

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        session.ui.mark(
            _('Retrained SpamBayes auto-tagging for %s') %
            ', '.join(retrained))
        return {'retrained': retrained, 'read_messages': count_all}
Example #10
    def _retrain(self, tags=None):
        "Retrain autotaggers"
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'fwded', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session,
                                              ['in:%s' % tag.slug] + no_trash
                                              ).as_set()
            session.ui.notify(_('Have %d interesting %s messages'
                                ) % (len(interest[ttype]), ttype))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session, srch
                                                         ).as_set()
                    interesting.append(etag._key)
                interesting.extend(['replied', 'fwded', 'read', 'tagged',
                                    None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # False positives are really annoying, and generally
                        # speaking any autotagged subset should be a small
                        # part of the Universe. So we divide the corpus
                        # budget 33% True, 67% False.
                        full_size = int(at_config.corpus_size *
                                        (0.33 if which else 0.67))
                        want = min(full_size // 4,
                                   max(0, full_size - len(tset)))
                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    # We go through the list of messages in order, to avoid
                    # thrashing caches too badly.
                    for msg_idx in sorted(list(tset)):
                        try:
                            e = Email(idx, msg_idx)
                            count += 1
                            count_all += 1
                            session.ui.mark(
                                _('Reading %s (%d/%d, %s=%s)'
                                  ) % (e.msg_mid(), count, len(tset),
                                       at_tag.name, which))
                            atagger.learn(at_config,
                                          e.get_msg(),
                                          self._get_keywords(e),
                                          which)
                        except (IndexError, TypeError, ValueError,
                                OSError, IOError):
                            if session.config.sys.debug:
                                import traceback
                                traceback.print_exc()
                            unreadable.append(msg_idx)
                            session.ui.warning(
                                _('Failed to process message at =%s'
                                  ) % (b36(msg_idx)))

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s'
                    ) % ', '.join(retrained)
        session.ui.mark(message)
        return self._success(message, result={
            'retrained': retrained,
            'unreadable': unreadable,
            'read_messages': count_all
        })
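Example #10 hardens the training loop of Example #9: each message is read inside a try/except, tracebacks are printed only in debug mode, and the indexes of unreadable messages are reported back to the caller. The pattern in isolation, with hypothetical names:

import traceback

def learn_safely(learn, messages, debug=False):
    # learn(msg) may raise on corrupt or missing messages; record the
    # failing index and keep going instead of aborting the retrain.
    unreadable = []
    for msg_idx, msg in sorted(messages):
        try:
            learn(msg)
        except (IndexError, TypeError, ValueError, OSError, IOError):
            if debug:
                traceback.print_exc()
            unreadable.append(msg_idx)
    return unreadable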