def Score(driver, manager, mapi_folder, subject, options, stream=None):
    """Score the messages in a MAPI folder and print a report.

    Arguments:
        driver: object providing GetAllItems(folder) and
            GetItemsWithValue(folder, prop_tag, value).
        manager: object providing message_store.GetMessage(),
            classifier_data.message_db.load_msg() and score().
        mapi_folder: the folder whose items are scored.
        subject: value matched against PR_SUBJECT_A; ignored when
            options.all is true.
        options: object with the attributes all, quiet, show_clues and
            show_image_info.
        stream: file-like object that receives the report.  None (the
            default) means sys.stdout, because that is what the Python 2
            ``print >> None`` form writes to.

    Progress and failure notices always go to sys.stderr.  Any exception
    raised while processing a message is reported there and re-raised.
    """
    if options.all:
        items = driver.GetAllItems(mapi_folder)
    else:
        items = driver.GetItemsWithValue(mapi_folder, PR_SUBJECT_A, subject)

    num = 0
    for item in items:
        num += 1
        if num % 1000 == 0:
            print >> sys.stderr, "Processed", num, "items..."

        # GetProps returns (hresult, props); only the property values are
        # needed, the hresult and the property tags are discarded.
        props = item.GetProps(
            (PR_ENTRYID, PR_STORE_ENTRYID, PR_SUBJECT_A), 0)[1]
        (_, eid), (_, store_eid), (_, sub) = props
        eid = mapi.HexFromBin(eid)
        store_eid = mapi.HexFromBin(store_eid)

        try:
            msm = manager.message_store.GetMessage((store_eid, eid))
            manager.classifier_data.message_db.load_msg(msm)
            score = manager.score(msm)
            if not options.quiet:
                # Written to `stream` like every other report line, so a
                # caller-supplied stream receives the complete report.
                print >> stream, "Message %r scored %g" % (sub, score)
            if options.show_clues:
                clues = GetClues(manager, msm)
                if not options.quiet:
                    print >> stream, clues
            if options.quiet:
                continue

            if options.show_image_info:
                eob = msm.GetEmailPackageObject()
                # Show what the OCR managed to extract.
                from spambayes.ImageStripper import crack_images
                from spambayes.tokenizer import imageparts
                # NOTE(review): tokenize_body calls
                # crack_images(engine_name, parts); confirm this
                # one-argument call matches the ImageStripper in use.
                image_text, image_toks = crack_images(imageparts(eob))
                print >> stream, "Image text:", repr(image_text)
                print >> stream, "Image tokens:", repr(image_toks)

            print >> stream  # blank lines between messages
        except:
            # Deliberately bare: name the offending message for any kind
            # of failure, then let the original exception propagate.
            print >> sys.stderr, "FAILED to convert message:", sub
            raise

    print >> stream, "Scored", num, "messages."
def Score(driver, manager, mapi_folder, subject, options, stream=None):
    """Score every selected message in a MAPI folder and report the results.

    Items are taken from driver.GetAllItems(mapi_folder) when options.all
    is true, otherwise from driver.GetItemsWithValue() filtered on
    PR_SUBJECT_A == subject.  Each item is loaded through the manager's
    message store, scored with manager.score(), and reported.

    Arguments:
        driver: supplies GetAllItems / GetItemsWithValue.
        manager: supplies message_store, classifier_data.message_db and
            score().
        mapi_folder: folder to read items from.
        subject: subject to match when options.all is false.
        options: carries the flags all, quiet, show_clues and
            show_image_info.
        stream: destination for the report; None (default) selects
            sys.stdout via the Python 2 ``print >> None`` rule.

    A progress line is written to sys.stderr every 1000 items.  If
    processing a message raises, the subject is written to sys.stderr and
    the exception is re-raised.
    """
    num = 0
    if options.all:
        getter = driver.GetAllItems
        getter_args = (mapi_folder,)
    else:
        getter = driver.GetItemsWithValue
        getter_args = (mapi_folder, PR_SUBJECT_A, subject)

    wanted = (PR_ENTRYID, PR_STORE_ENTRYID, PR_SUBJECT_A)
    for item in getter(*getter_args):
        num += 1
        if num % 1000 == 0:
            print >> sys.stderr, "Processed", num, "items..."

        # Only the property values matter here; the hresult and the
        # per-property tags returned by GetProps are not used.
        _hr, props = item.GetProps(wanted, 0)
        (_tag, eid), (_tag, store_eid), (_tag, sub) = props
        eid = mapi.HexFromBin(eid)
        store_eid = mapi.HexFromBin(store_eid)

        try:
            msm = manager.message_store.GetMessage((store_eid, eid))
            manager.classifier_data.message_db.load_msg(msm)
            score = manager.score(msm)
            if not options.quiet:
                # Send the score to `stream` as well: all other report
                # output already goes there, and stream=None still means
                # stdout, so default callers see no change.
                print >> stream, "Message %r scored %g" % (sub, score)
            if options.show_clues:
                clues = GetClues(manager, msm)
                if not options.quiet:
                    print >> stream, clues
            if options.quiet:
                continue

            if options.show_image_info:
                eob = msm.GetEmailPackageObject()
                # Show what the OCR managed to extract.
                from spambayes.ImageStripper import crack_images
                from spambayes.tokenizer import imageparts
                # NOTE(review): tokenize_body passes an engine name as the
                # first argument to crack_images; verify this call's
                # arity against the ImageStripper version in use.
                image_text, image_toks = crack_images(imageparts(eob))
                print >> stream, "Image text:", repr(image_text)
                print >> stream, "Image tokens:", repr(image_toks)

            print >> stream  # blank lines between messages
        except:
            # Bare on purpose: identify the failing message whatever the
            # error was, then propagate it unchanged.
            print >> sys.stderr, "FAILED to convert message:", sub
            raise

    print >> stream, "Scored", num, "messages."
def tokenize_body(self, msg):
    """Yield the body tokens of an email Message, one section at a time.

    Sections run in this order, each gated by a 'Tokenizer' option:

    * check_octets: for each part from octetparts(msg) (the
      application/octet-stream parts), yield an "octet:" token made of
      the first options['Tokenizer', 'octet_prefix_size'] characters of
      its payload.
    * image_size: yield a single "image-size:2**N" token, where N is the
      rounded log2 of the combined payload length of the parts from
      imageparts(msg); nothing is yielded when that length is zero.
    * crack_images: yield the tokens returned by crack_images() for
      those same image parts, then the tokens of the text it returned.
    * always: for each part from textparts(msg), yield "virus:" clues,
      the tokens removed by each cracker, and the tokens of what is left
      once HTML has been stripped.

    A "control:" token is yielded whenever a payload cannot be decoded
    or turns out to be None.
    """
    if options["Tokenizer", "check_octets"]:
        for octet_part in octetparts(msg):
            try:
                payload = octet_part.get_payload(decode=True)
            except:
                # Best effort: flag the failure, then use the raw payload.
                yield "control: couldn't decode octet"
                payload = octet_part.get_payload(decode=False)

            if payload is None:
                yield "control: octet payload is None"
                continue

            prefix_size = options["Tokenizer", "octet_prefix_size"]
            yield "octet:%s" % payload[:prefix_size]

    image_parts = imageparts(msg)
    if options["Tokenizer", "image_size"]:
        combined_size = 0
        for image_part in image_parts:
            try:
                payload = image_part.get_payload(decode=True)
            except:
                yield "control: couldn't decode image"
                payload = image_part.get_payload(decode=False)

            # Missing or empty payloads contribute nothing to the total.
            if payload:
                combined_size += len(payload)
            if payload is None:
                yield "control: image payload is None"

        if combined_size:
            yield "image-size:2**%d" % round(log2(combined_size))

    if options["Tokenizer", "crack_images"]:
        engine_name = options["Tokenizer", 'ocr_engine']
        from spambayes.ImageStripper import crack_images
        ocr_text, ocr_tokens = crack_images(engine_name, image_parts)
        for token in ocr_tokens:
            yield token
        for token in self.tokenize_text(ocr_text):
            yield token

    for text_part in textparts(msg):
        # Decode the payload, or fall back to the undecoded form (run
        # through the base64 repair helper) when decoding raises.
        try:
            body = text_part.get_payload(decode=True)
        except:
            yield "control: couldn't decode"
            body = text_part.get_payload(decode=False)
            if body is not None:
                body = try_to_repair_damaged_base64(body)

        if body is None:
            yield 'control: payload is None'
            continue

        # Substitute numeric entities, then fold to lower case.
        body = numeric_entity_re.sub(numeric_entity_replacer, body)
        body = body.lower()

        if options["Tokenizer", "replace_nonascii_chars"]:
            body = body.translate(non_ascii_translate_tab)

        for clue in find_html_virus_clues(body):
            yield "virus:%s" % clue

        # Each cracker returns the text with its section removed plus the
        # tokens it extracted; feed the reduced text to the next cracker.
        for cracker in (crack_uuencode,
                        crack_urls,
                        crack_html_style,
                        crack_html_comment,
                        crack_noframes):
            body, extracted = cracker(body)
            for token in extracted:
                yield token

        # Matches of breaking_entity_re become a space; matches of html_re
        # are deleted outright (replaced with the empty string).
        body = breaking_entity_re.sub(' ', body)
        body = html_re.sub('', body)

        for token in self.tokenize_text(body):
            yield token