def _get_message_keys(self, messageid):
    keys = self.key_cache.get(messageid, [])
    if not keys:
        email = Email(self._idx(), messageid)

        # First we check the Autocrypt headers
        msg = email.get_msg(pgpmime='all')
        for ach in ([extract_autocrypt_header(msg)] +
                    extract_autocrypt_gossip_headers(msg)):
            if 'keydata' in ach:
                for keydata in get_keydata(ach['keydata'],
                                           autocrypt_header=ach,
                                           include_subkeys=False):
                    keys.append((keydata, ach['keydata']))

        # Then go looking at the attachments
        attachments = email.get_message_tree(
            want=["attachments"])["attachments"]
        for part in attachments:
            if len(keys) > 100:  # Just to set some limit...
                break
            if _might_be_pgp_key(part["filename"], part["mimetype"]):
                key = part["part"].get_payload(None, True)
                for keydata in get_keydata(key, include_subkeys=False):
                    keys.append((keydata, key))

        self.key_cache[messageid] = keys
    return keys
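A later revision of the same method follows: it makes each key source (the sender's Autocrypt header, gossip headers, attachments) individually optional, and wraps each key in a MailpileKeyInfo that records whether it arrived via Autocrypt and whether it was gossip.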
def _get_message_keys(self, messageid,
                      autocrypt=True, autocrypt_gossip=True,
                      attachments=True):
    keys = self.key_cache.get(messageid, [])
    if not keys:
        email = Email(self._idx(), messageid)

        # First we check the Autocrypt headers
        loop_count = 0
        msg = email.get_msg(pgpmime='all')
        ac_headers = []
        if autocrypt:
            ac_headers.append(extract_autocrypt_header(msg))
        if autocrypt_gossip:
            ac_headers.extend(extract_autocrypt_gossip_headers(msg))
        for ach in ac_headers:
            loop_count += 1
            if 'keydata' in ach:
                for keyinfo in get_keyinfo(ach['keydata'],
                                           autocrypt_header=ach,
                                           key_info_class=MailpileKeyInfo):
                    keyinfo.is_autocrypt = True
                    # Everything after the sender's own header is gossip;
                    # if the sender's header was skipped (autocrypt=False),
                    # every header in the list is gossip.
                    keyinfo.is_gossip = ((loop_count > 1) if autocrypt
                                         else True)
                    keys.append((keyinfo, ach['keydata']))

        # Then go looking at the attachments
        atts = []
        if attachments:
            atts.extend(
                email.get_message_tree(want=["attachments"])["attachments"])
        for part in atts:
            if len(keys) > 100:  # Just to set some limit...
                break
            if _might_be_pgp_key(part["filename"], part["mimetype"]):
                key = part["part"].get_payload(None, True)
                for keyinfo in get_keyinfo(key,
                                           key_info_class=MailpileKeyInfo):
                    keys.append((keyinfo, key))

        self.key_cache[messageid] = keys
    return keys
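For illustration, a minimal sketch of how a caller might consume the cached (keyinfo, keydata) tuples; the lookup handle and the fingerprint attribute are assumptions here, not confirmed Mailpile API:

# --- Illustrative sketch, not part of Mailpile ---
# Only the (keyinfo, keydata) tuple shape comes from the code above;
# the 'fingerprint' attribute is an assumption.
def gossip_fingerprints(lookup, messageid):
    """Collect fingerprints of keys learned via Autocrypt gossip."""
    found = []
    for keyinfo, _keydata in lookup._get_message_keys(messageid):
        if getattr(keyinfo, 'is_gossip', False):
            found.append(getattr(keyinfo, 'fingerprint', None))
    return found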
def _retrain(self, tags=None):
    "Retrain autotaggers"
    session, config, idx = self.session, self.session.config, self._idx()
    tags = tags or [asb.match_tag for asb in autotag_configs(config)]
    tids = [config.get_tag(t)._key for t in tags if t]

    session.ui.mark(_('Retraining SpamBayes autotaggers'))
    if not config.real_hasattr('autotag'):
        config.real_setattr('autotag', {})

    # Find all the interesting messages! We don't look in the trash,
    # but we do look at interesting spam.
    #
    # Note: By specifically stating that we DON'T want trash, we
    #       disable the search engine's default result suppression
    #       and guarantee these results don't corrupt the somewhat
    #       lame/broken result cache.
    #
    no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
    interest = {}
    for ttype in ('replied', 'read', 'tagged'):
        interest[ttype] = set()
        for tag in config.get_tags(type=ttype):
            interest[ttype] |= idx.search(
                session, ['in:%s' % tag.slug] + no_trash).as_set()
        session.ui.notify(_('Have %d interesting %s messages')
                          % (len(interest[ttype]), ttype))

    retrained, unreadable = [], []
    count_all = 0
    for at_config in autotag_configs(config):
        at_tag = config.get_tag(at_config.match_tag)
        if at_tag and at_tag._key in tids:
            session.ui.mark('Retraining: %s' % at_tag.name)

            yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                  (set(), set(), '-in:%s' % at_tag.slug, False)]

            # Get the current message sets: tagged and untagged messages
            # excluding trash.
            for tset, mset, srch, which in yn:
                mset |= idx.search(session, [srch] + no_trash).as_set()

            # If we have any exclude_tags, they are particularly
            # interesting, so we'll look at them first.
            interesting = []
            for etagid in at_config.exclude_tags:
                etag = config.get_tag(etagid)
                if etag._key not in interest:
                    srch = ['in:%s' % etag._key] + no_trash
                    interest[etag._key] = idx.search(session, srch).as_set()
                interesting.append(etag._key)
            interesting.extend(['replied', 'read', 'tagged', None])

            # Go through the interest types in order of preference and
            # while we still lack training data, add to the training set.
            for ttype in interesting:
                for tset, mset, srch, which in yn:
                    # False positives are really annoying, and generally
                    # speaking any autotagged subset should be a small
                    # part of the Universe. So we divide the corpus
                    # budget 33% True, 67% False.
                    full_size = int(at_config.corpus_size *
                                    (0.33 if which else 0.67))
                    want = min(full_size // len(interesting),
                               max(0, full_size - len(tset)))
                    # Make sure we always fully utilize our budget
                    if full_size > len(tset) and not ttype:
                        want = full_size - len(tset)

                    if want:
                        if ttype:
                            adding = sorted(list(mset & interest[ttype]))
                        else:
                            adding = sorted(list(mset))
                        adding = set(list(reversed(adding))[:want])
                        tset |= adding
                        mset -= adding

            # Load classifier, reset
            atagger = config.load_auto_tagger(at_config)
            atagger.reset(at_config)
            for tset, mset, srch, which in yn:
                count = 0
                # We go through the list of messages in order, to avoid
                # thrashing caches too badly.
                for msg_idx in sorted(list(tset)):
                    try:
                        e = Email(idx, msg_idx)
                        count += 1
                        count_all += 1
                        session.ui.mark(
                            _('Reading %s (%d/%d, %s=%s)')
                            % (e.msg_mid(), count, len(tset),
                               at_tag.name, which))
                        atagger.learn(at_config,
                                      e.get_msg(),
                                      self._get_keywords(e),
                                      which)
                        play_nice_with_threads()
                        if mailpile.util.QUITTING:
                            return self._error('Aborted')
                    except (IndexError, TypeError, ValueError,
                            OSError, IOError):
                        if 'autotag' in session.config.sys.debug:
                            import traceback
                            traceback.print_exc()
                        unreadable.append(msg_idx)
                        session.ui.warning(
                            _('Failed to process message at =%s')
                            % (b36(msg_idx)))

            # We got this far without crashing, so save the result.
            config.save_auto_tagger(at_config)
            retrained.append(at_tag.name)

    message = _('Retrained SpamBayes auto-tagging for %s'
                ) % ', '.join(retrained)
    session.ui.mark(message)
    return self._success(message, result={
        'retrained': retrained,
        'unreadable': unreadable,
        'read_messages': count_all
    })
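The corpus-budget arithmetic in _retrain() is compact, so here is a standalone sketch of the same math; the function name and the numbers are illustrative only:

# --- Illustrative sketch, not part of Mailpile ---
def training_budget(corpus_size, which, n_interesting, already_have,
                    final_pass=False):
    # Mirrors the arithmetic above: positive examples get 33% of the
    # corpus and negatives 67%; each interest type may fill an equal
    # share, and the final pass (ttype=None) tops up what remains.
    full_size = int(corpus_size * (0.33 if which else 0.67))
    want = min(full_size // n_interesting,
               max(0, full_size - already_have))
    if final_pass and full_size > already_have:
        want = full_size - already_have
    return want

# With corpus_size=600 and five interest types, each positive pass may
# add at most 198 // 5 == 39 messages until the 198-message cap fills:
print(training_budget(600, True, 5, 0))    # -> 39
print(training_budget(600, True, 5, 190))  # -> 8
print(training_budget(600, False, 5, 0))   # -> 80  (402 // 5)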