Exemplo n.º 1
0
    def command(self, save=True):
        """Export the selected messages into a newly created mailbox.

        Arguments come from ``self.args``. If the last one looks like
        ``<type>:<path>`` it overrides both the configured export format
        and the destination from ``self.export_path()``; the remaining
        arguments are handed to ``self._choose_messages()``.

        The ``save`` parameter is accepted but not used here.

        Returns the result of ``self._error()`` if the destination already
        exists, ``False`` if no messages were selected, and otherwise a
        dict with the ``exported`` count and the ``created`` path.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        default_type = config.prefs.export_format

        remaining = list(self.args)
        if remaining and ':' in remaining[-1]:
            mbox_type, path = remaining.pop(-1).split(':', 1)
        else:
            mbox_type = default_type
            path = self.export_path(default_type)

        # Never write into something that is already there.
        if os.path.exists(path):
            return self._error('Already exists: %s' % path)

        chosen = self._choose_messages(remaining)
        if not chosen:
            session.ui.warning('No messages selected')
            return False

        mbox = self.create_mailbox(mbox_type, path)
        for msg_idx in chosen:
            email = Email(idx, msg_idx)
            session.ui.mark('Exporting =%s ...' % email.msg_mid())
            # FIXME: Tags are not exported; an earlier attempt to store
            #        them in an X-Mailpile-Tags header did not work.
            mbox.add(email.get_msg())
        mbox.flush()

        total = len(chosen)
        session.ui.mark('Exported %d messages to %s' % (total, path))
        return {'exported': total, 'created': path}
Exemplo n.º 2
0
    def command(self, save=True):
        """Write the chosen messages to a brand new mailbox.

        ``self.args`` may end with ``<type>:<path>``, which replaces the
        configured export format and the path that ``self.export_path()``
        would otherwise supply. Whatever arguments remain are passed to
        ``self._choose_messages()``.

        The ``save`` parameter is accepted but not used here.

        Returns the result of ``self._error()`` when the target path
        already exists, ``False`` when nothing was selected, and otherwise
        a dict holding the ``exported`` count and the ``created`` path.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        configured_type = config.prefs.export_format

        words = list(self.args)
        has_target = bool(words) and ":" in words[-1]
        if has_target:
            mbox_type, path = words.pop(-1).split(":", 1)
        else:
            mbox_type, path = configured_type, self.export_path(configured_type)

        # Refuse to touch an existing file or directory.
        if os.path.exists(path):
            return self._error("Already exists: %s" % path)

        selection = self._choose_messages(words)
        if not selection:
            session.ui.warning("No messages selected")
            return False

        mbox = self.create_mailbox(mbox_type, path)
        for msg_idx in selection:
            message = Email(idx, msg_idx)
            session.ui.mark("Exporting =%s ..." % message.msg_mid())
            # FIXME: Tags are not exported; an earlier attempt to store
            #        them in an X-Mailpile-Tags header did not work.
            mbox.add(message.get_msg())
        mbox.flush()

        how_many = len(selection)
        session.ui.mark("Exported %d messages to %s" % (how_many, path))
        return {"exported": how_many, "created": path}
Exemplo n.º 3
0
    def command(self, save=True):
        """Export messages (and, by default, their threads) to a new mailbox.

        Arguments are read from ``self.args``:

        * a trailing ``<type>:<path>`` overrides the configured export
          format and the destination from ``self.export_path()``;
        * a trailing ``flat`` (before the optional ``<type>:<path>``)
          exports only the selected messages instead of expanding each
          one to its whole conversation;
        * everything else is passed to ``self._choose_messages()``.

        The ``save`` parameter is accepted but not used here.

        Returns the result of ``self._error()`` when the configuration is
        in lockdown or the destination already exists, ``False`` when no
        messages were selected, and otherwise the result of
        ``self._success()`` carrying the ``exported`` count and the
        ``created`` path.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        mbox_type = config.prefs.export_format

        if self.session.config.sys.lockdown:
            return self._error(_('In lockdown, doing nothing.'))

        args = list(self.args)
        if args and ':' in args[-1]:
            mbox_type, path = args.pop(-1).split(':', 1)
        else:
            path = self.export_path(mbox_type)

        flat = bool(args) and args[-1] == 'flat'
        if flat:
            args.pop(-1)

        if os.path.exists(path):
            return self._error('Already exists: %s' % path)

        msg_idxs = list(self._choose_messages(args))
        if not msg_idxs:
            session.ui.warning('No messages selected')
            return False

        # Exporting messages without their threads barely makes any
        # sense, so replace each message with its whole conversation.
        if not flat:
            msg_idxs = [
                int(m[idx.MSG_MID], 36)
                for mi in msg_idxs
                for m in idx.get_conversation(msg_idx=mi)
            ]

        # Let's always export in the same order. Stability is nice.
        # Overlapping conversations produce duplicates; export each
        # message only once.
        unique_idxs = sorted(set(msg_idxs))

        mbox = self.create_mailbox(mbox_type, path)
        for msg_idx in unique_idxs:
            e = Email(idx, msg_idx)
            session.ui.mark('Exporting =%s ...' % e.msg_mid())
            mbox.add(e.get_msg())
        mbox.flush()

        exported = len(unique_idxs)
        return self._success(
            _('Exported %d messages to %s') % (exported, path), {
                'exported': exported,
                'created': path
            })
Exemplo n.º 4
0
    def command(self, save=True):
        """Export messages (and, by default, their threads) to a new mailbox.

        Arguments are read from ``self.args``:

        * a trailing ``<type>:<path>`` overrides the configured export
          format and the destination from ``self.export_path()``;
        * a trailing ``flat`` (before the optional ``<type>:<path>``)
          exports only the selected messages instead of expanding each
          one to its whole conversation;
        * everything else is passed to ``self._choose_messages()``.

        The ``save`` parameter is accepted but not used here.

        Returns the result of ``self._error()`` when the configuration is
        in lockdown or the destination already exists, ``False`` when no
        messages were selected, and otherwise the result of
        ``self._success()`` carrying the ``exported`` count and the
        ``created`` path.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        mbox_type = config.prefs.export_format

        if self.session.config.sys.lockdown:
            return self._error(_('In lockdown, doing nothing.'))

        args = list(self.args)
        if args and ':' in args[-1]:
            mbox_type, path = args.pop(-1).split(':', 1)
        else:
            path = self.export_path(mbox_type)

        if args and args[-1] == 'flat':
            flat = True
            args.pop(-1)
        else:
            flat = False

        if os.path.exists(path):
            return self._error('Already exists: %s' % path)

        msg_idxs = list(self._choose_messages(args))
        if not msg_idxs:
            session.ui.warning('No messages selected')
            return False

        # Exporting messages without their threads barely makes any
        # sense, so replace each message with its whole conversation.
        if not flat:
            msg_idxs = [int(m[idx.MSG_MID], 36)
                        for mi in msg_idxs
                        for m in idx.get_conversation(msg_idx=mi)]

        # Let's always export in the same order. Stability is nice.
        # A set drops the duplicates that overlapping conversations
        # produce, so every message is written exactly once.
        to_export = sorted(set(msg_idxs))

        mbox = self.create_mailbox(mbox_type, path)
        for msg_idx in to_export:
            e = Email(idx, msg_idx)
            session.ui.mark('Exporting =%s ...' % e.msg_mid())
            mbox.add(e.get_msg())

        mbox.flush()

        return self._success(
            _('Exported %d messages to %s') % (len(to_export), path),
            {
                'exported': len(to_export),
                'created': path
            })
Exemplo n.º 5
0
    def command(self):
        """Retrain the SpamBayes autotaggers from the search index.

        The tags to retrain are taken from ``self.args``; when no
        arguments are given, every ``match_tag`` configured in
        ``config.prefs.autotag`` is retrained.

        For each selected autotagger two training sets are built, one of
        messages carrying the tag and one of messages without it. Each
        set is capped at half of ``corpus_size`` and filled from the
        exclude tags first, then from replied, read and tagged messages,
        then from anything else. The classifier is reset, taught every
        message of both sets and saved.

        Returns a dict with the names of the ``retrained`` tags and the
        number of ``read_messages``.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not hasattr(config, 'autotag'):
            config.autotag = {}

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session,
                                              ['in:%s' % tag.slug] + no_trash
                                              ).as_set()
            session.ui.notify(_('Have %d interesting %s messages'
                                ) % (len(interest[ttype]), ttype))

        retrained = []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if not at_tag or at_tag._key not in tids:
                continue
            session.ui.mark('Retraining: %s' % at_tag.name)

            # Each entry: (training set, candidate pool, search term,
            # label taught to the classifier).
            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session, srch
                                                     ).as_set()
                interesting.append(etag._key)
            interesting.extend(['replied', 'read', 'tagged', None])

            # The budget is used as a slice bound below, so it has to be
            # an integer: true division would yield a float and make the
            # slice raise TypeError.
            corpus_size = int(at_config.corpus_size)

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # FIXME: Is this a good idea? No single data source
                    # is allowed to be more than 50% of the corpus, to
                    # try and encourage diversity.
                    want = min(corpus_size // 4,
                               max(0, corpus_size // 2 - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(mset & interest[ttype])
                        else:
                            adding = sorted(mset)
                        # Prefer the highest message indexes.
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                # Go through the messages in index order so training is
                # deterministic and does not thrash caches too badly.
                for msg_idx in sorted(tset):
                    e = Email(idx, msg_idx)
                    count += 1
                    count_all += 1
                    session.ui.mark(('Reading %s (%d/%d, %s=%s)'
                                     ) % (e.msg_mid(), count, len(tset),
                                          at_tag.name, which))
                    atagger.learn(at_config,
                                  e.get_msg(),
                                  self._get_keywords(e),
                                  which)

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

        session.ui.mark(_('Retrained SpamBayes auto-tagging for %s'
                          ) % ', '.join(retrained))
        return {'retrained': retrained, 'read_messages': count_all}
Exemplo n.º 6
0
    def _retrain(self, tags=None):
        """Rebuild the training corpus of the autotaggers and retrain them.

        ``tags`` selects which autotaggers to retrain; when it is empty,
        every ``match_tag`` from ``autotag_configs(config)`` is used.

        Messages that raise one of the caught errors while being read or
        learned are skipped, reported as a warning and collected in the
        ``unreadable`` list of the result.

        Returns the result of ``self._success()`` with the ``retrained``
        tag names, the ``unreadable`` message indexes and the number of
        ``read_messages``.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in autotag_configs(config)]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Collect the interesting messages. Trash is excluded, interesting
        # spam is not.
        #
        # Note: Explicitly asking for "not trash" turns off the search
        #       engine's default result suppression and keeps these
        #       results from corrupting the somewhat lame/broken result
        #       cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for kind in ('replied', 'fwded', 'read', 'tagged'):
            interest[kind] = set()
            for tag in config.get_tags(type=kind):
                terms = ['in:%s' % tag.slug] + no_trash
                interest[kind] |= idx.search(session, terms).as_set()
            session.ui.notify(
                _('Have %d interesting %s messages') %
                (len(interest[kind]), kind))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in autotag_configs(config):
            at_tag = config.get_tag(at_config.match_tag)
            if not at_tag or at_tag._key not in tids:
                continue
            session.ui.mark('Retraining: %s' % at_tag.name)

            # One bucket per label: (training set, candidate pool,
            # search term, label).
            buckets = [(set(), set(), 'in:%s' % at_tag.slug, True),
                       (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Fill the pools with the tagged and untagged messages,
            # trash excluded.
            for picked, pool, query, label in buckets:
                pool |= idx.search(session, [query] + no_trash).as_set()

            # Messages carrying an exclude tag are the most interesting
            # ones, so those sources come first.
            sources = []
            for excluded_id in at_config.exclude_tags:
                key = config.get_tag(excluded_id)._key
                if key not in interest:
                    interest[key] = idx.search(
                        session, ['in:%s' % key] + no_trash).as_set()
                sources.append(key)
            sources += ['replied', 'fwded', 'read', 'tagged', None]

            # Walk the sources in order of preference and top up each
            # training set while it is still below its budget.
            for source in sources:
                for picked, pool, query, label in buckets:
                    # False positives are really annoying, and an
                    # autotagged subset should generally be a small part
                    # of the Universe, so the corpus budget is split
                    # 33% True, 67% False.
                    share = 0.33 if label else 0.67
                    budget = int(at_config.corpus_size * share)
                    shortfall = max(0, budget - len(picked))
                    want = min(budget // 4, shortfall)
                    if not want:
                        continue
                    if source:
                        candidates = pool & interest[source]
                    else:
                        candidates = pool
                    # Take the highest message indexes first.
                    newest = set(sorted(candidates, reverse=True)[:want])
                    picked |= newest
                    pool -= newest

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for picked, pool, query, label in buckets:
                count = 0
                total = len(picked)
                # Read the messages in index order, to avoid thrashing
                # caches too badly.
                for msg_idx in sorted(picked):
                    try:
                        email = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        progress = _('Reading %s (%d/%d, %s=%s)') % (
                            email.msg_mid(), count, total, at_tag.name,
                            label)
                        session.ui.mark(progress)
                        msg = email.get_msg()
                        keywords = self._get_keywords(email)
                        atagger.learn(at_config, msg, keywords, label)
                    except (IndexError, TypeError, ValueError, OSError,
                            IOError):
                        if session.config.sys.debug:
                            import traceback
                            traceback.print_exc()
                        unreadable.append(msg_idx)
                        session.ui.warning(
                            _('Failed to process message at =%s') %
                            (b36(msg_idx)))

            # Nothing crashed on the way here, so keep the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s') % ', '.join(
            retrained)
        session.ui.mark(message)
        outcome = {
            'retrained': retrained,
            'unreadable': unreadable,
            'read_messages': count_all
        }
        return self._success(message, result=outcome)
Exemplo n.º 7
0
    def _retrain(self, tags=None):
        """Rebuild the training corpus of the autotaggers and retrain them.

        ``tags`` selects which autotaggers to retrain; when it is empty,
        every ``match_tag`` from ``autotag_configs(config)`` is used.

        Returns the result of ``self._success()`` with the ``retrained``
        tag names, the ``unreadable`` message indexes and the number of
        ``read_messages``. If ``mailpile.util.QUITTING`` becomes true
        while messages are being learned, the result of
        ``self._error('Aborted')`` is returned instead; the autotagger
        being trained at that moment is not saved.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in autotag_configs(config)]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        # Make sure the config has an 'autotag' attribute before any
        # tagger is loaded.
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        # Maps a source name (a tag type, or later an exclude-tag key) to
        # the set of search results for that source.
        interest = {}
        for ttype in ('replied', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session,
                                              ['in:%s' % tag.slug] + no_trash
                                              ).as_set()
            session.ui.notify(_('Have %d interesting %s messages'
                                ) % (len(interest[ttype]), ttype))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in autotag_configs(config):
            at_tag = config.get_tag(at_config.match_tag)
            # Only retrain autotaggers whose tag exists and was requested.
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                # Each entry: (training set, candidate pool, search term,
                # label taught to the classifier). The sets are mutated in
                # place by the loops below.
                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session, srch
                                                         ).as_set()
                    interesting.append(etag._key)
                # The trailing None stands for "any remaining message".
                interesting.extend(['replied', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # False positives are really annoying, and generally
                        # speaking any autotagged subset should be a small
                        # part of the Universe. So we divide the corpus
                        # budget 33% True, 67% False.
                        full_size = int(at_config.corpus_size *
                                        (0.33 if which else 0.67))
                        # Each source may contribute at most an equal share
                        # of the budget, and never more than what is still
                        # missing from the training set.
                        want = min(full_size // len(interesting),
                                   max(0, full_size - len(tset)))
                        # Make sure we always fully utilize our budget
                        if full_size > len(tset) and not ttype:
                            want = full_size - len(tset)

                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            # Keep the `want` highest message indexes.
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    # We go through the list of messages in order, to avoid
                    # thrashing caches too badly.
                    for msg_idx in sorted(list(tset)):
                        try:
                            e = Email(idx, msg_idx)
                            # The counters are bumped before learning, so a
                            # message that fails below is still counted in
                            # read_messages.
                            count += 1
                            count_all += 1
                            session.ui.mark(
                                _('Reading %s (%d/%d, %s=%s)'
                                  ) % (e.msg_mid(), count, len(tset),
                                       at_tag.name, which))
                            atagger.learn(at_config,
                                          e.get_msg(),
                                          self._get_keywords(e),
                                          which)
                            # NOTE(review): presumably yields to other
                            # threads between messages - confirm against the
                            # helper's definition.
                            play_nice_with_threads()
                            # Returning here skips save_auto_tagger() for
                            # this autotagger.
                            if mailpile.util.QUITTING:
                                return self._error('Aborted')
                        except (IndexError, TypeError, ValueError,
                                OSError, IOError):
                            # Skip the message: record it, warn, and carry
                            # on. The traceback is printed only when
                            # 'autotag' is in the debug setting.
                            if 'autotag' in session.config.sys.debug:
                                import traceback
                                traceback.print_exc()
                            unreadable.append(msg_idx)
                            session.ui.warning(
                                _('Failed to process message at =%s'
                                  ) % (b36(msg_idx)))

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s'
                    ) % ', '.join(retrained)
        session.ui.mark(message)
        return self._success(message, result={
            'retrained': retrained,
            'unreadable': unreadable,
            'read_messages': count_all
        })
Exemplo n.º 8
0
    def command(self):
        """Retrain the SpamBayes autotaggers from the search index.

        The tags to retrain are taken from ``self.args``; when no
        arguments are given, every ``match_tag`` configured in
        ``config.prefs.autotag`` is retrained.

        For each selected autotagger two training sets are built, one of
        messages carrying the tag and one of messages without it. Each
        set is capped at half of ``corpus_size`` and filled from the
        exclude tags first, then from replied, forwarded, read and tagged
        messages, then from anything else. The classifier is reset,
        taught every message of both sets and saved.

        Returns a dict with the names of the ``retrained`` tags and the
        number of ``read_messages``.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = self.args or [asb.match_tag for asb in config.prefs.autotag]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not hasattr(config, 'autotag'):
            config.autotag = {}

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        interest = {}
        for ttype in ('replied', 'fwded', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session, ['in:%s' % tag.slug] +
                                              no_trash).as_set()
            session.ui.notify(
                _('Have %d interesting %s messages') %
                (len(interest[ttype]), ttype))

        retrained = []
        count_all = 0
        for at_config in config.prefs.autotag:
            at_tag = config.get_tag(at_config.match_tag)
            if not at_tag or at_tag._key not in tids:
                continue
            session.ui.mark('Retraining: %s' % at_tag.name)

            # Each entry: (training set, candidate pool, search term,
            # label taught to the classifier).
            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session,
                                                     srch).as_set()
                interesting.append(etag._key)
            interesting.extend(
                ['replied', 'fwded', 'read', 'tagged', None])

            # The budget is used as a slice bound below, so it has to be
            # an integer: true division would yield a float and make the
            # slice raise TypeError.
            corpus_size = int(at_config.corpus_size)

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # FIXME: Is this a good idea? No single data source
                    # is allowed to be more than 50% of the corpus, to
                    # try and encourage diversity.
                    want = min(
                        corpus_size // 4,
                        max(0, corpus_size // 2 - len(tset)))
                    if want:
                        if ttype:
                            adding = sorted(mset & interest[ttype])
                        else:
                            adding = sorted(mset)
                        # Prefer the highest message indexes.
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                # Go through the messages in index order so training is
                # deterministic and does not thrash caches too badly.
                for msg_idx in sorted(tset):
                    e = Email(idx, msg_idx)
                    count += 1
                    count_all += 1
                    session.ui.mark(('Reading %s (%d/%d, %s=%s)') %
                                    (e.msg_mid(), count, len(tset),
                                     at_tag.name, which))
                    atagger.learn(at_config, e.get_msg(),
                                  self._get_keywords(e), which)

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

        session.ui.mark(
            _('Retrained SpamBayes auto-tagging for %s') %
            ', '.join(retrained))
        return {'retrained': retrained, 'read_messages': count_all}