Python SuspectFilter.strip_text 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: fuglu.shared

클래스/타입: SuspectFilter

메소드/함수: strip_text

hotexamples.com에서의 예제들: 4

Python SuspectFilter.strip_text - 4개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 fuglu.shared.SuspectFilter.strip_text에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

SuspectFilter(13)

matches(6)

get_field(3)

get_args(2)

lint(2)

strip_text(2)

예제 #1

파일 보기

class FuzorDigest(object):
    def __init__(self, msg):
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')
        
        # digest config
        self.LONG_WORD_THRESHOLD = 10  # what is considered a long word
        self.REPLACE_LONG_WORD = '[LONG]'  # Replace long words in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'  # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'  # Replace urls in pre-digest with... None to disable
        self.INCLUDE_ATTACHMENT_CONTENT = False  # should non-text attachment contents be included in digest (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_COUNT = True  # should the number of non-text-attachments be included in the digest
        self.MINIMUM_PREDIGEST_SIZE = 27  # if the predigest is smaller than this, ignore this message
        self.MINIMUM_UNMODIFIED_CONTENT = 27  # minimum unmodified content after stripping, eg. [SOMETHING] removed from the predigest (27>'von meinem Iphone gesendet')
        self.MINIMUM_BODYTEXT_SIZE = 27  # if the body text content is smaller than this, ignore this message
        self.STRIP_WHITESPACE = True  # remove all whitespace from the pre-digest
        self.STRIP_HTML_MARKUP = True  # remove html tags (but keep content)
        self.REMOVE_HTML_TAGS = [
            'script',
            'style']  # strip tags (including content)
        
        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)
    
    
    
    def _make_hash(self, predigest):
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None
        predigest = predigest.strip()
        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None
        unmodified = re.sub(r'\[[A-Z0-9:]+\]', '', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None
        
        predigest = predigest.encode('utf-8', errors='ignore')
        return hashlib.sha1(predigest).hexdigest()
    
    
    
    def _handle_text_part(self, part):
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif charset.lower().replace("_", "-") in ("quopri-codec", "quopri", "quoted-printable", "quotedprintable"):
            errors = "strict"
        
        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            payload = payload.decode("ascii", "ignore")
        
        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload,
                remove_tags=self.REMOVE_HTML_TAGS,
                use_bfs=True)
        
        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)
        
        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)
        
        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)
        
        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        payload = payload.strip()
        return payload
    
    
    
    def _make_predigest(self, msg):
        attachment_count = 0
        predigest = ''
        for part in msg.walk():
            if part.is_multipart():
                continue
            
            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                    predigest += normalized_text_part
                    self.bodytext_size += len(normalized_text_part)
                except Exception as e:
                    self.logger.warn(e)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    predigest += "[ATTH:%s]" % hashlib.sha1(
                        part.get_payload()).hexdigest()
        
        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            predigest += "[ATTC:%s]" % attachment_count
        
        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)
        
        return predigest

예제 #2

파일 보기

파일: fuzor.py 프로젝트: gryphius/fuglu-extra-plugins

class FuzorDigest(object):

    def __init__(self, msg):
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')

        # digest config
        self.LONG_WORD_THRESHOLD = 10  # what is considered a long word
        self.REPLACE_LONG_WORD = '[LONG]'  # Replace long words in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'  # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'  # Replace urls in pre-digest with... None to disable
        self.INCLUDE_ATTACHMENT_CONTENT = False  # should non-text attachment contents be included in digest (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_COUNT = True  # should the number of non-text-attachments be included in the digest
        self.MINIMUM_PREDIGEST_SIZE = 27  # if the predigest is smaller than this, ignore this message
        self.MINIMUM_UNMODIFIED_CONTENT = 27  # minimum unmodified content after stripping, eg. [SOMETHING] removed from the predigest (27>'von meinem Iphone gesendet')
        self.MINIMUM_BODYTEXT_SIZE = 27  # if the body text content is smaller than this, ignore this message
        self.STRIP_WHITESPACE = True  # remove all whitespace from the pre-digest
        self.STRIP_HTML_MARKUP = True  # remove html tags (but keep content)
        self.REMOVE_HTML_TAGS = [
            'script',
            'style']  # strip tags (including content)

        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)



    def _make_hash(self, predigest):
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None
        predigest = predigest.strip()
        if isinstance(predigest, unicode):
            predigest = predigest.encode('utf-8', 'ignore')
        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None
        unmodified = re.sub(r'\[[A-Z0-9:]+\]', '', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None
        try:
            return hashlib.sha1(predigest).hexdigest()
        except:
            return None



    def _handle_text_part(self, part):
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif (charset.lower().replace("_", "-") in ("quopri-codec",
              "quopri", "quoted-printable", "quotedprintable")):
            errors = "strict"

        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            payload = payload.decode("ascii", "ignore")

        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload,
                remove_tags=self.REMOVE_HTML_TAGS,
                use_bfs=True)

        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)

        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)

        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)

        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        payload = payload.strip()
        return payload



    def _make_predigest(self, msg):
        attachment_count = 0
        predigest = ''
        for part in msg.walk():
            if part.is_multipart():
                continue

            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                    predigest += normalized_text_part
                    self.bodytext_size += len(normalized_text_part)
                except Exception as e:
                    self.logger.warn(e)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    predigest += "[ATTH:%s]" % hashlib.sha1(
                        part.get_payload()).hexdigest()

        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            predigest += "[ATTC:%s]" % attachment_count

        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)

        return predigest

예제 #3

파일 보기

파일: shared_test.py 프로젝트: jahlives/fuglu

class SuspectFilterTestCase(unittest.TestCase):
    """Test Suspectfilter"""
    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        self.assertTrue('Sent to unittest domain!' in headermatches,
                        "To_domain not found in headercheck")
        self.assertTrue(
            'Envelope sender is [email protected]' in headermatches,
            "Envelope Sender not matched in header chekc")
        self.assertTrue('Mime Version is 1.0' in headermatches,
                        "Standard header Mime Version not found")
        self.assertTrue('A tag match' in headermatches,
                        "Tag match did not work")
        self.assertTrue('Globbing works' in headermatches,
                        "header globbing failed")
        self.assertTrue('body rule works' in headermatches,
                        "decoded body rule failed")
        self.assertTrue('full body rule works' in headermatches,
                        "full body failed")
        self.assertTrue('mime rule works' in headermatches, "mime rule failed")
        self.assertFalse(
            'this should not match in a body rule' in headermatches,
            'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertTrue('perl-style /-notation works!' in headermatches,
                        "new rule format failed: %s" % headermatches)
        self.assertTrue(
            'perl-style recipient match' in headermatches,
            "new rule format failed for to_domain: %s" % headermatches)
        self.assertFalse('this should not match' in headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertTrue(field == 'to_domain')
        self.assertTrue(matchedvalue == 'unittests.fuglu.org')
        self.assertTrue(arg == 'Sent to unittest domain!')
        self.assertTrue(regex == 'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(declarationtest,
                                                    use_bfs=use_bfs)
            self.assertEqual(docstripped.split(),
                             ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(wordhtml,
                                                        use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')

예제 #4

파일 보기

파일: shared_test.py 프로젝트: steigr/fuglu

class SuspectFilterTestCase(unittest.TestCase):

    """Test Header Filter"""

    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        self.assertTrue(
            'Sent to unittest domain!' in headermatches, "To_domain not found in headercheck")
        self.assertTrue('Envelope sender is [email protected]' in headermatches,
                        "Envelope Sender not matched in header chekc")
        self.assertTrue('Mime Version is 1.0' in headermatches,
                        "Standard header Mime Version not found")
        self.assertTrue(
            'A tag match' in headermatches, "Tag match did not work")
        self.assertTrue(
            'Globbing works' in headermatches, "header globbing failed")
        self.assertTrue(
            'body rule works' in headermatches, "decoded body rule failed")
        self.assertTrue(
            'full body rule works' in headermatches, "full body failed")
        self.assertTrue('mime rule works' in headermatches, "mime rule failed")
        self.assertFalse('this should not match in a body rule' in headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertTrue('perl-style /-notation works!' in headermatches,
                        "new rule format failed: %s" % headermatches)
        self.assertTrue('perl-style recipient match' in headermatches,
                        "new rule format failed for to_domain: %s" % headermatches)
        self.assertFalse('this should not match' in headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertTrue(field == 'to_domain')
        self.assertTrue(matchedvalue == 'unittests.fuglu.org')
        self.assertTrue(arg == 'Sent to unittest domain!')
        self.assertTrue(regex == 'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(
                declarationtest, use_bfs=use_bfs)
            self.assertEqual(
                docstripped.split(), ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(
                wordhtml, use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')