Example #1
import sys

# pywin32's MAPI bindings supply HexFromBin and the PR_* property tags.
from win32com.mapi import mapi
from win32com.mapi.mapitags import PR_ENTRYID, PR_STORE_ENTRYID, PR_SUBJECT_A


def Score(driver, manager, mapi_folder, subject, options, stream=None):
    """Score every matching item in a MAPI folder and report the results."""
    num = 0
    # Either scan every item in the folder, or only those whose subject
    # matches exactly.
    if options.all:
        getter = driver.GetAllItems
        getter_args = (mapi_folder,)
    else:
        getter = driver.GetItemsWithValue
        getter_args = (mapi_folder, PR_SUBJECT_A, subject)
    for item in getter(*getter_args):
        num += 1
        if num % 1000 == 0:
            print >> sys.stderr, "Processed", num, "items..."
        hr, props = item.GetProps((PR_ENTRYID, PR_STORE_ENTRYID, PR_SUBJECT_A),
                                  0)
        # Each prop is a (property_tag, value) pair; convert the binary
        # entry IDs to hex strings for the message store lookup below.
        (tag, eid), (tag, store_eid), (tag, sub) = props
        eid = mapi.HexFromBin(eid)
        store_eid = mapi.HexFromBin(store_eid)
        try:
            msm = manager.message_store.GetMessage((store_eid, eid))

            manager.classifier_data.message_db.load_msg(msm)
            score = manager.score(msm)
            if not options.quiet: print "Message %r scored %g" % (sub, score)
            if options.show_clues:
                clues = GetClues(manager, msm)
                if not options.quiet: print >> stream, clues
            if options.quiet:
                continue

            if options.show_image_info:
                eob = msm.GetEmailPackageObject()
                # Show what the OCR managed to extract.
                from spambayes.ImageStripper import crack_images
                from spambayes.tokenizer import imageparts
                image_text, image_toks = crack_images(imageparts(eob))
                print >> stream, "Image text:", repr(image_text)
                print >> stream, "Image tokens:", repr(image_toks)

            print >> stream  # blank lines between messages
        except:
            print >> sys.stderr, "FAILED to convert message:", sub
            raise
    print >> stream, "Scored", num, "messages."
Example #2
def tokenize_body(self, msg):
    """Generate a stream of tokens from an email Message.

    If options['Tokenizer', 'check_octets'] is True, the first few
    undecoded characters of application/octet-stream parts of the
    message body become tokens.
    """
    # octetparts, imageparts, textparts, log2 and the various crack_*
    # helpers are module-level names in spambayes.tokenizer.
    if options["Tokenizer", "check_octets"]:
        for part in octetparts(msg):
            try:
                text = part.get_payload(decode=True)
            except:
                yield "control: couldn't decode octet"
                text = part.get_payload(decode=False)
            if text is None:
                yield "control: octet payload is None"
                continue
            yield "octet:%s" % text[:options["Tokenizer",
                                             "octet_prefix_size"]]
    parts = imageparts(msg)
    if options["Tokenizer", "image_size"]:
        # Summarize the total size of all image attachments as a
        # power-of-two bucket.
        total_len = 0
        for part in parts:
            try:
                text = part.get_payload(decode=True)
            except:
                yield "control: couldn't decode image"
                text = part.get_payload(decode=False)
            total_len += len(text or "")
            if text is None:
                yield "control: image payload is None"
        if total_len:
            yield "image-size:2**%d" % round(log2(total_len))
    if options["Tokenizer", "crack_images"]:
        # Run OCR over the image parts and tokenize whatever text the
        # engine manages to extract.
        engine_name = options["Tokenizer", 'ocr_engine']
        from spambayes.ImageStripper import crack_images
        text, tokens = crack_images(engine_name, parts)
        for t in tokens:
            yield t
        for t in self.tokenize_text(text):
            yield t
    for part in textparts(msg):
        try:
            text = part.get_payload(decode=True)
        except:
            yield "control: couldn't decode"
            text = part.get_payload(decode=False)
            if text is not None:
                text = try_to_repair_damaged_base64(text)
        if text is None:
            yield 'control: payload is None'
            continue
        # Normalize the text before tokenizing: decode numeric HTML
        # entities, lowercase, and optionally strip non-ASCII bytes.
        text = numeric_entity_re.sub(numeric_entity_replacer, text)
        text = text.lower()
        if options["Tokenizer", "replace_nonascii_chars"]:
            text = text.translate(non_ascii_translate_tab)
        for t in find_html_virus_clues(text):
            yield "virus:%s" % t
        for cracker in (crack_uuencode,
                        crack_urls,
                        crack_html_style,
                        crack_html_comment,
                        crack_noframes):
            text, tokens = cracker(text)
            for t in tokens:
                yield t
        text = breaking_entity_re.sub(' ', text)
        text = html_re.sub('', text)
        for t in self.tokenize_text(text):
            yield t
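
A minimal sketch of how tokenize_body() might be exercised, assuming a spambayes installation; Tokenizer lives in spambayes.tokenizer, and the message here is an invented example, not test data from the project.

import email
from spambayes.tokenizer import Tokenizer

raw = ("Content-Type: text/plain\r\n"
       "Subject: hello\r\n"
       "\r\n"
       "Cheap medz -- click http://spam.example.com now!\r\n")
msg = email.message_from_string(raw)
for tok in Tokenizer().tokenize_body(msg):
    print tok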