class FuzorDigest(object):
    """Fuzzy content digest of a mail message.

    The text parts of the message are normalised into a "pre-digest" string
    (markup stripped, e-mail addresses / URLs / long words replaced by
    placeholders, whitespace removed). The SHA-1 hex digest of that string is
    stored in ``self.digest``; it is ``None`` when the message does not carry
    enough text to be worth hashing.
    """

    def __init__(self, msg):
        """Compute ``self.predigest`` and ``self.digest`` for ``msg``.

        :param msg: parsed message object; it must provide ``walk()`` and its
            parts must provide ``is_multipart()``, ``get_content_maintype()``,
            ``get_content_charset()`` and ``get_payload()`` (presumably an
            ``email.message.Message`` - verify against the caller).
        """
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')

        # digest config
        # what is considered a long word
        self.LONG_WORD_THRESHOLD = 10
        # Replace long words in pre-digest with... None to disable
        self.REPLACE_LONG_WORD = '[LONG]'
        # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'
        # Replace urls in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'
        # should non-text attachment contents be included in digest
        # (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_CONTENT = False
        # should the number of non-text-attachments be included in the digest
        self.INCLUDE_ATTACHMENT_COUNT = True
        # if the predigest is smaller than this, ignore this message
        self.MINIMUM_PREDIGEST_SIZE = 27
        # minimum unmodified content after stripping, eg. [SOMETHING] removed
        # from the predigest (27 > 'von meinem Iphone gesendet', German for
        # "sent from my iPhone")
        self.MINIMUM_UNMODIFIED_CONTENT = 27
        # if the body text content is smaller than this, ignore this message
        self.MINIMUM_BODYTEXT_SIZE = 27
        # remove all whitespace from the pre-digest
        self.STRIP_WHITESPACE = True
        # remove html tags (but keep content)
        self.STRIP_HTML_MARKUP = True
        # strip tags (including content)
        self.REMOVE_HTML_TAGS = ['script', 'style']

        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)

    def _make_hash(self, predigest):
        """Return the SHA-1 hex digest of ``predigest`` or ``None``.

        ``None`` is returned when the accumulated body text, the pre-digest
        itself, or the pre-digest with all ``[PLACEHOLDER]`` tokens removed is
        shorter than the respective configured minimum.

        :param predigest: normalised text as produced by ``_make_predigest``
        :return: 40 character hex string, or ``None``
        """
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None

        predigest = predigest.strip()
        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None

        # placeholders such as [LONG] or [ATTC:1] do not count as content
        unmodified = re.sub(r'\[[A-Z0-9:]+\]', '', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None

        predigest = predigest.encode('utf-8', errors='ignore')
        return hashlib.sha1(predigest).hexdigest()

    def _handle_text_part(self, part):
        """Decode one text part and normalise it according to the config.

        :param part: a non-multipart message part with main type ``text``
        :return: the normalised text of the part
        """
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif charset.lower().replace("_", "-") in (
                "quopri-codec", "quopri", "quoted-printable",
                "quotedprintable"):
            # NOTE(review): presumably these transfer-encoding codec names
            # reject non-strict error handlers, hence "strict" plus the
            # AssertionError/LookupError fallback below - confirm.
            errors = "strict"

        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            # unknown or unusable charset: keep whatever is plain ascii
            payload = payload.decode("ascii", "ignore")

        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload, remove_tags=self.REMOVE_HTML_TAGS, use_bfs=True)

        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)

        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)

        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)

        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        return payload.strip()

    @staticmethod
    def _attachment_hash(part):
        """Return the SHA-1 hex digest of the raw payload of ``part``.

        ``get_payload()`` returns text on Python 3 while ``hashlib`` only
        accepts bytes, so text is encoded first. ``surrogateescape`` turns
        escaped undecodable bytes back into the bytes they stand for.
        """
        raw = part.get_payload()
        if raw is None:
            raw = b''
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8', 'surrogateescape')
        return hashlib.sha1(raw).hexdigest()

    def _make_predigest(self, msg):
        """Build the pre-digest string for ``msg``.

        Normalised text parts are concatenated in walk order and their total
        length is added to ``self.bodytext_size``. Every other non-multipart
        part counts as an attachment; depending on the config an
        ``[ATTH:<sha1>]`` token per attachment and a final ``[ATTC:<count>]``
        token are appended.

        :param msg: message object to walk
        :return: the pre-digest string
        """
        attachment_count = 0
        chunks = []
        for part in msg.walk():
            if part.is_multipart():
                continue

            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                except Exception as e:
                    # best effort: a broken text part must not abort the
                    # digest of the remaining parts
                    self.logger.warning(e)
                    continue
                chunks.append(normalized_text_part)
                self.bodytext_size += len(normalized_text_part)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    chunks.append("[ATTH:%s]" % self._attachment_hash(part))

        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            chunks.append("[ATTC:%s]" % attachment_count)

        predigest = ''.join(chunks)
        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)
        return predigest
class FuzorDigest(object):
    """Fuzzy content digest of a mail message.

    Text parts are normalised into a "pre-digest" string (markup stripped,
    e-mail addresses / URLs / long words replaced by placeholders, whitespace
    removed) and the SHA-1 hex digest of its UTF-8 encoding is stored in
    ``self.digest``. ``self.digest`` is ``None`` when there is not enough
    text to be worth hashing.
    """

    def __init__(self, msg):
        """Compute ``self.predigest`` and ``self.digest`` for ``msg``.

        :param msg: parsed message object; it must provide ``walk()`` and its
            parts must provide ``is_multipart()``, ``get_content_maintype()``,
            ``get_content_charset()`` and ``get_payload()`` (presumably an
            ``email.message.Message`` - verify against the caller).
        """
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')

        # digest config
        # what is considered a long word
        self.LONG_WORD_THRESHOLD = 10
        # Replace long words in pre-digest with... None to disable
        self.REPLACE_LONG_WORD = '[LONG]'
        # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'
        # Replace urls in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'
        # should non-text attachment contents be included in digest
        # (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_CONTENT = False
        # should the number of non-text-attachments be included in the digest
        self.INCLUDE_ATTACHMENT_COUNT = True
        # if the predigest is smaller than this, ignore this message
        self.MINIMUM_PREDIGEST_SIZE = 27
        # minimum unmodified content after stripping, eg. [SOMETHING] removed
        # from the predigest (27 > 'von meinem Iphone gesendet', German for
        # "sent from my iPhone")
        self.MINIMUM_UNMODIFIED_CONTENT = 27
        # if the body text content is smaller than this, ignore this message
        self.MINIMUM_BODYTEXT_SIZE = 27
        # remove all whitespace from the pre-digest
        self.STRIP_WHITESPACE = True
        # remove html tags (but keep content)
        self.STRIP_HTML_MARKUP = True
        # strip tags (including content)
        self.REMOVE_HTML_TAGS = ['script', 'style']

        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)

    def _make_hash(self, predigest):
        """Return the SHA-1 hex digest of ``predigest`` or ``None``.

        The pre-digest is encoded to UTF-8 first, so the size thresholds are
        measured in encoded bytes. ``None`` is returned when the accumulated
        body text, the encoded pre-digest, or the pre-digest with all
        ``[PLACEHOLDER]`` tokens removed is shorter than its configured
        minimum.

        :param predigest: normalised text (or already encoded bytes)
        :return: 40 character hex string, or ``None``
        """
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None

        predigest = predigest.strip()
        # ``bytes`` exists on Python 2 and 3, unlike ``unicode``
        if not isinstance(predigest, bytes):
            predigest = predigest.encode('utf-8', 'ignore')

        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None

        # placeholders such as [LONG] or [ATTC:1] do not count as content;
        # the value is bytes here, so the pattern has to be bytes as well
        unmodified = re.sub(br'\[[A-Z0-9:]+\]', b'', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None

        try:
            return hashlib.sha1(predigest).hexdigest()
        except (TypeError, ValueError):
            # keep the original best-effort contract (no digest instead of a
            # crash) without also swallowing KeyboardInterrupt/SystemExit
            return None

    def _handle_text_part(self, part):
        """Decode one text part and normalise it according to the config.

        :param part: a non-multipart message part with main type ``text``
        :return: the normalised text of the part
        """
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif charset.lower().replace("_", "-") in (
                "quopri-codec", "quopri", "quoted-printable",
                "quotedprintable"):
            # NOTE(review): presumably these transfer-encoding codec names
            # reject non-strict error handlers, hence "strict" plus the
            # AssertionError/LookupError fallback below - confirm.
            errors = "strict"

        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            # unknown or unusable charset: keep whatever is plain ascii
            payload = payload.decode("ascii", "ignore")

        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload, remove_tags=self.REMOVE_HTML_TAGS, use_bfs=True)

        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)

        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)

        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)

        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        return payload.strip()

    @staticmethod
    def _attachment_hash(part):
        """Return the SHA-1 hex digest of the raw payload of ``part``.

        ``hashlib`` only accepts bytes, so a text payload is encoded first;
        a payload that already is bytes is hashed unchanged.
        ``surrogateescape`` turns escaped undecodable bytes back into the
        bytes they stand for.
        """
        raw = part.get_payload()
        if raw is None:
            raw = b''
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8', 'surrogateescape')
        return hashlib.sha1(raw).hexdigest()

    def _make_predigest(self, msg):
        """Build the pre-digest string for ``msg``.

        Normalised text parts are concatenated in walk order and their total
        length is added to ``self.bodytext_size``. Every other non-multipart
        part counts as an attachment; depending on the config an
        ``[ATTH:<sha1>]`` token per attachment and a final ``[ATTC:<count>]``
        token are appended.

        :param msg: message object to walk
        :return: the pre-digest string
        """
        attachment_count = 0
        chunks = []
        for part in msg.walk():
            if part.is_multipart():
                continue

            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                except Exception as e:
                    # best effort: a broken text part must not abort the
                    # digest of the remaining parts
                    self.logger.warning(e)
                    continue
                chunks.append(normalized_text_part)
                self.bodytext_size += len(normalized_text_part)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    chunks.append("[ATTH:%s]" % self._attachment_hash(part))

        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            chunks.append("[ATTC:%s]" % attachment_count)

        predigest = ''.join(chunks)
        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)
        return predigest
class SuspectFilterTestCase(unittest.TestCase):
    """Test Suspectfilter

    Every test runs against a ``SuspectFilter`` loaded from
    ``headertest.regex`` in the test data directory.
    """

    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def test_sf_get_args(self):
        """Test SuspectFilter files

        Checks which rule arguments ``get_args`` reports for the
        ``helloworld.eml`` sample message carrying one extra tag.
        """
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        self.assertIn('Sent to unittest domain!', headermatches,
                      "To_domain not found in headercheck")
        self.assertIn('Envelope sender is [email protected]', headermatches,
                      "Envelope Sender not matched in header chekc")
        self.assertIn('Mime Version is 1.0', headermatches,
                      "Standard header Mime Version not found")
        self.assertIn('A tag match', headermatches,
                      "Tag match did not work")
        self.assertIn('Globbing works', headermatches,
                      "header globbing failed")
        self.assertIn('body rule works', headermatches,
                      "decoded body rule failed")
        self.assertIn('full body rule works', headermatches,
                      "full body failed")
        self.assertIn('mime rule works', headermatches,
                      "mime rule failed")
        self.assertNotIn('this should not match in a body rule',
                         headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertIn('perl-style /-notation works!', headermatches,
                      "new rule format failed: %s" % headermatches)
        self.assertIn('perl-style recipient match', headermatches,
                      "new rule format failed for to_domain: %s"
                      % headermatches)
        self.assertNotIn('this should not match', headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches

        With ``extended=True`` the second element returned by ``matches`` is
        unpacked as ``(field, matchedvalue, arg, regex)``.
        """
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')

        field, matchedvalue, arg, regex = info
        self.assertEqual(field, 'to_domain')
        self.assertEqual(matchedvalue, 'unittests.fuglu.org')
        self.assertEqual(arg, 'Sent to unittest domain!')
        # raw string: "\." is not a valid escape in a normal string literal
        self.assertEqual(regex, r'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        # additional field tests
        expected_first_values = (
            ('clienthelo', 'helo1'),
            ('clientip', '10.0.0.1'),
            ('clienthostname', 'rdns1'),
        )
        for fieldname, expected in expected_first_values:
            self.assertEqual(
                self.candidate.get_field(suspect, fieldname)[0], expected)

    def test_strip(self):
        # strip_text has to drop markup (and script/style content) for both
        # values of use_bfs
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de"> <head> <title>greetings</title> </head> <body> <font color="red">well met!</font> </body> </html> """

        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml" xmlns:o=3D"urn:schemas-microsoft-com:office:office" xmlns:w=3D"urn:schemas-microsoft-com:office:word" xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml" xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META 
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html; charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.E-MailFormatvorlage17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:70.85pt 70.85pt 2.0cm 70.85pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext=3D"edit"> <o:idmap v:ext=3D"edit" data=3D"1" /> </o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(
                declarationtest, use_bfs=use_bfs)
            self.assertEqual(
                docstripped.split(), ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(
                wordhtml, use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')
class SuspectFilterTestCase(unittest.TestCase):
    """Test Header Filter

    The filter under test is a ``SuspectFilter`` built from the
    ``headertest.regex`` rule file in the test data directory.
    """

    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def _assert_reported(self, expected_arg, reported, msg):
        """Fail with ``msg`` unless ``expected_arg`` is in ``reported``."""
        self.assertIn(expected_arg, reported, msg)

    def test_sf_get_args(self):
        """Test SuspectFilter files

        Runs ``get_args`` on the ``helloworld.eml`` sample (with an extra
        ``testtag`` tag set) and checks which rule arguments come back.
        """
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'
        headermatches = self.candidate.get_args(suspect)

        must_be_reported = [
            ('Sent to unittest domain!',
             "To_domain not found in headercheck"),
            ('Envelope sender is [email protected]',
             "Envelope Sender not matched in header chekc"),
            ('Mime Version is 1.0',
             "Standard header Mime Version not found"),
            ('A tag match', "Tag match did not work"),
            ('Globbing works', "header globbing failed"),
            ('body rule works', "decoded body rule failed"),
            ('full body rule works', "full body failed"),
            ('mime rule works', "mime rule failed"),
        ]
        for rule_arg, failure_msg in must_be_reported:
            self._assert_reported(rule_arg, headermatches, failure_msg)

        self.assertNotIn('this should not match in a body rule',
                         headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self._assert_reported(
            'perl-style /-notation works!', headermatches,
            "new rule format failed: %s" % headermatches)
        self._assert_reported(
            'perl-style recipient match', headermatches,
            "new rule format failed for to_domain: %s" % headermatches)
        self.assertNotIn('this should not match', headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches

        The ``info`` value returned alongside the match flag is unpacked as
        ``(field, matchedvalue, arg, regex)``.
        """
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')

        field, matchedvalue, arg, regex = info
        self.assertEqual(field, 'to_domain')
        self.assertEqual(matchedvalue, 'unittests.fuglu.org')
        self.assertEqual(arg, 'Sent to unittest domain!')
        # written as a raw string because "\." is an invalid escape sequence
        # in an ordinary string literal
        self.assertEqual(regex, r'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**', '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        # additional field tests
        helo = self.candidate.get_field(suspect, 'clienthelo')
        self.assertEqual(helo[0], 'helo1')

        client_ip = self.candidate.get_field(suspect, 'clientip')
        self.assertEqual(client_ip[0], '10.0.0.1')

        client_hostname = self.candidate.get_field(suspect, 'clienthostname')
        self.assertEqual(client_hostname[0], 'rdns1')

    def test_strip(self):
        # each sample is stripped once per use_bfs value and must give the
        # same visible text either way
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de"> <head> <title>greetings</title> </head> <body> <font color="red">well met!</font> </body> </html> """

        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml" xmlns:o=3D"urn:schemas-microsoft-com:office:office" xmlns:w=3D"urn:schemas-microsoft-com:office:word" xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml" xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META 
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html; charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.E-MailFormatvorlage17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:70.85pt 70.85pt 2.0cm 70.85pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext=3D"edit"> <o:idmap v:ext=3D"edit" data=3D"1" /> </o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(
                declarationtest, use_bfs=use_bfs)
            self.assertEqual(
                docstripped.split(), ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(
                wordhtml, use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')