def test_html_invalid_utf8_entity_encoded(self):
    """Test for invalid entity encoded chars.

    htmldecode() must tolerate every malformed/invalid UTF-8 sample below
    without raising; any exception is reported as a test failure that names
    the offending sample.
    """
    samples = {
        "Valid ASCII": u"a",
        "Valid 2 Octet Sequence": u"쎱",
        "Invalid 2 Octet Sequence": u"쌨",
        "Invalid Sequence Identifier": u"ꂡ",
        "Valid 3 Octet Sequence": u"�",
        "Invalid 3 Octet Sequence (in 2nd Octet)": u"�",
        "Invalid 3 Octet Sequence (in 3rd Octet)": u"�",
        "Valid 4 Octet Sequence": u"�",
        "Invalid 4 Octet Sequence (in 2nd Octet)": u"�",
        "Invalid 4 Octet Sequence (in 3rd Octet)": u"�",
        "Invalid 4 Octet Sequence (in 4th Octet)": u"�",
        "Valid 5 Octet Sequence (but not Unicode!)": u" � ",
        "Valid 6 Octet Sequence (but not Unicode!)": u" � ",
        "Invalid unicode FFFE": u"",
        "Invalid unicode FFFF": u"",
    }

    for desc, sample in samples.iteritems():
        try:
            htmldecode(sample)
        except Exception as e:
            # self.fail() is the idiomatic way to flag this, instead of the
            # original assertTrue(False, ...).
            msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
            self.fail(msg % (e, desc))
def test_html_invalid_utf8_entity_encoded(self):
    '''htmldecode() is expected to survive every malformed sample below.'''
    samples = {
        'Valid ASCII': u"a",
        'Valid 2 Octet Sequence': u"쎱",
        'Invalid 2 Octet Sequence': u"쌨",
        'Invalid Sequence Identifier': u"ꂡ",
        'Valid 3 Octet Sequence': u"�",
        'Invalid 3 Octet Sequence (in 2nd Octet)': u"�",
        'Invalid 3 Octet Sequence (in 3rd Octet)': u"�",
        'Valid 4 Octet Sequence': u"�",
        'Invalid 4 Octet Sequence (in 2nd Octet)': u"�",
        'Invalid 4 Octet Sequence (in 3rd Octet)': u"�",
        'Invalid 4 Octet Sequence (in 4th Octet)': u"�",
        'Valid 5 Octet Sequence (but not Unicode!)': u" � ",
        'Valid 6 Octet Sequence (but not Unicode!)': u" � ",
        'Invalid unicode FFFE': u"",
        'Invalid unicode FFFF': u"",
    }

    # Failure message template, hoisted out of the loop.
    error_tpl = 'Exception "%s" was raised when trying to htmldecode() a "%s".'

    for description, payload in samples.iteritems():
        try:
            htmldecode(payload)
        except Exception as exc:
            self.assertTrue(False, error_tpl % (exc, description))
def html_unescape(t):
    '''Decoder doing HTML unescaping.

    Thin wrapper that delegates to encode_decode.htmldecode().

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    >>>
    '''
    return encode_decode.htmldecode(t)
def html_unescape(t):
    '''Decoder doing HTML unescaping.

    Thin wrapper that delegates to encode_decode.htmldecode().

    >>> encode_decode.htmldecode('&lt;script&gt;')
    '<script>'
    >>>
    '''
    return encode_decode.htmldecode(t)
def findEmails( self , documentString ):
    '''
    @return: A list with all mail users that are present in the documentString.

    Init,
    >>> from core.data.url.httpResponse import httpResponse as httpResponse
    >>> u = url_object('http://www.w3af.com/')
    >>> response = httpResponse( 200, '', {}, u, u )
    >>> a = abstractParser(response)

    First test, no emails.
    >>> a.findEmails( '' )
    []

    >>> a = abstractParser(response)
    >>> a.findEmails( ' [email protected] ' )
    ['*****@*****.**']

    >>> a = abstractParser(response)
    >>> a.findEmails( '<a href="mailto:[email protected]">test</a>' )
    ['*****@*****.**']

    >>> a = abstractParser(response)
    >>> a.findEmails( '<a href="mailto:[email protected]">[email protected]</a>' )
    ['*****@*****.**']

    >>> a = abstractParser(response)
    >>> a.findEmails( '<a href="mailto:[email protected]">[email protected]</a>' )
    ['*****@*****.**', '*****@*****.**']

    >>> a = abstractParser(response)
    >>> a.findEmails( 'header [email protected] footer' )
    ['*****@*****.**']

    >>> a = abstractParser(response)
    >>> a.findEmails( 'header [email protected] footer' )
    ['*****@*****.**']
    '''
    # First, we decode all chars. I have found some strange sites where they
    # encode the @... some other sites where they encode the email, or add
    # some %20 padding... strange stuff... so better be safe...
    documentString = urllib.unquote_plus( documentString )

    # Now we decode the HTML special characters...
    documentString = htmldecode( documentString )

    # Perform a fast search for the @. In w3af, if we don't have an @ we
    # don't have an email.
    # We don't support mails like myself <at> gmail !dot! com
    if documentString.find('@') != -1:
        # Raw strings make the regex escapes unambiguous (non-raw '\w' only
        # works by accident of the string-escape rules).
        documentString = re.sub( r'[^\w@\-\.]', ' ', documentString )

        # NOTE: emailRegex is also used in pks search engine.
        # Now we have a clean documentString; and we can match the mail addresses!
        emailRegex = r'([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
        for email, domain in re.findall(emailRegex, documentString, re.IGNORECASE):
            if email not in self._emails:
                self._emails.append( email )

    return self._emails
def test_bug_trigger_case01(self):
    '''Regression test for:

    u'í'.decode('utf-8')
    UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
    position 9745: ordinal not in range(128)
    '''
    html = u'Aquí encontrará'
    decoded = htmldecode(html)
    self.assertEqual(decoded, html)
def test_bug_trigger_case01(self):
    """Regression test: htmldecode() must not raise on non-ASCII unicode input.

    Original failure:
        u'í'.decode('utf-8')
        UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
        position 9745: ordinal not in range(128)
    """
    # Input already contains no HTML entities, so output must equal input.
    html = u"Aquí encontrará"
    self.assertEqual(htmldecode(html), html)
def _findEmails(self, doc_str):
    '''
    @return: A list with all mail users that are present in the doc_str.

    Init,
    >>> from core.data.url.httpResponse import httpResponse as httpResponse
    >>> u = url_object('http://www.w3af.com/')
    >>> response = httpResponse( 200, '', {}, u, u )
    >>> a = BaseParser(response)

    First test, no emails.
    >>> a._findEmails( '' )
    []

    >>> a = BaseParser(response)
    >>> a._findEmails(u' [email protected] ')
    [u'*****@*****.**']

    >>> a = BaseParser(response)
    >>> a._findEmails(u'<a href="mailto:[email protected]">test</a>')
    [u'*****@*****.**']

    >>> a = BaseParser(response)
    >>> a._findEmails(u'<a href="mailto:[email protected]">[email protected]</a>')
    [u'*****@*****.**']

    >>> a = BaseParser(response)
    >>> a._findEmails(u'<a href="mailto:[email protected]">[email protected]</a>')
    [u'*****@*****.**', u'*****@*****.**']

    >>> a = BaseParser(response)
    >>> a._findEmails(u'header [email protected] footer')
    [u'*****@*****.**']

    >>> a = BaseParser(response)
    >>> a._findEmails(u'header [email protected] footer')
    [u'*****@*****.**']
    '''
    # Revert url-encoded sub-strings
    doc_str = urllib.unquote_plus(doc_str)

    # Then html-decode HTML special characters
    doc_str = htmldecode(doc_str)

    # Perform a fast search for the @. In w3af, if we don't have an @ we
    # don't have an email
    # We don't support mails like myself <at> gmail !dot! com
    if doc_str.find('@') != -1:
        # Raw string for unambiguous escapes; call sub() on the pattern
        # object itself instead of the re.sub(compiled, ...) form.
        compiled_re = re.compile(r'[^\w@\-\.]', re.UNICODE)
        doc_str = compiled_re.sub(' ', doc_str)

        for email, domain in re.findall(self.EMAIL_RE, doc_str):
            if email not in self._emails:
                self._emails.append(email)

    return self._emails
def _extract_emails(self, doc_str):
    '''
    :param doc_str: The (unicode) document to extract email addresses from.
    :return: A set() with all mail users that are present in the doc_str.
    :see: We don't support emails like myself <at> gmail !dot! com
    '''
    # Revert url-encoded sub-strings
    doc_str = urllib.unquote_plus(doc_str)

    # Then html-decode HTML special characters
    doc_str = htmldecode(doc_str)

    # NOTE: any previously collected emails are discarded on every call.
    self._emails = set()

    # Perform a fast search for the @. In w3af, if we don't have an @ we
    # don't have an email.
    if doc_str.find('@') != -1:
        # Raw string for unambiguous escapes; call sub() on the pattern
        # object itself instead of the re.sub(compiled, ...) form.
        compiled_re = re.compile(r'[^\w@\-\.]', re.UNICODE)
        doc_str = compiled_re.sub(' ', doc_str)

        for email, domain in re.findall(self.EMAIL_RE, doc_str):
            self._emails.add(email)

    return self._emails
def findEmails( self , documentString ):
    '''
    @param documentString: The string to search through.
    @return: A list with all mail users that are present in the documentString.
    '''
    # First, we decode all chars. I have found some strange sites where they
    # encode the @... some other sites where they encode the email, or add
    # some %20 padding... strange stuff... so better be safe...
    documentString = urllib.unquote_plus( documentString )

    # Now we decode the html special characters...
    documentString = htmldecode( documentString )

    # Perform a fast search for the @. In w3af, if we don't have an @ we
    # don't have an email.
    # We don't support mails like myself <at> gmail !dot! com
    if documentString.find('@') != -1:
        # Raw strings make the regex escapes unambiguous.
        documentString = re.sub( r'[^\w@\.]', ' ', documentString )

        # NOTE: emailRegex is also used in pks search engine.
        # Now we have a clean documentString; and we can match the mail addresses!
        emailRegex = r'([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
        for email, domain in re.findall(emailRegex, documentString, re.IGNORECASE):
            if email not in self._emails:
                self._emails.append( email )

    return self._emails
def test_bug_trigger_case02(self):
    '''A raw UTF-8 byte string must decode to its unicode equivalent.'''
    raw = 'Aqu\xc3\xad encontrar\xc3\xa1'
    expected = raw.decode('utf-8')
    self.assertEqual(htmldecode(raw), expected)
def test_html_encoded(self):
    '''The accented sample must pass through htmldecode() unchanged.'''
    sample = u'á'
    self.assertEqual(htmldecode(sample), sample)
def test_charref(self):
    '''The sample must be returned exactly as given.'''
    text = u'hola mundo A'
    self.assertEqual(htmldecode(text), text)
def test_special_char(self):
    '''The special character must survive htmldecode() untouched.'''
    text = u'hola ƻ'
    self.assertEqual(htmldecode(text), text)
def test_simple(self):
    '''Plain ASCII with no entities decodes to itself.'''
    text = "hola mundo"
    self.assertEqual(htmldecode(text), text)
def test_special_char(self):
    """htmldecode() must return this special-character sample unchanged."""
    self.assertEqual(htmldecode(u"hola ƻ"), u"hola ƻ")
def test_charref(self):
    """htmldecode() must return this sample unchanged."""
    # NOTE(review): the name suggests the input originally contained a
    # character reference (e.g. &#65;) that appears already rendered here
    # as 'A' — confirm against upstream history.
    self.assertEqual(htmldecode(u"hola mundo A"), u"hola mundo A")
def test_bug_trigger_case04(self):
    """A lone Latin-1 code point (í) must come back unchanged."""
    self.assertEqual(htmldecode(u"\xed"), u"\xed")
def test_bug_trigger_case02(self):
    """htmldecode() must decode a raw UTF-8 byte string to its unicode form."""
    # "\xc3\xad" / "\xc3\xa1" are the UTF-8 encodings of í and á.
    html_utf8_raw = "Aqu\xc3\xad encontrar\xc3\xa1"
    html_unicode = "Aqu\xc3\xad encontrar\xc3\xa1".decode("utf-8")
    self.assertEqual(htmldecode(html_utf8_raw), html_unicode)
def test_bug_trigger_case04(self):
    """htmldecode() must pass the lone code point u'\\xed' (í) through unchanged."""
    html = u'\xed'
    self.assertEqual(htmldecode(html), html)
def test_tilde(self):
    """Accented text without entities must decode to itself."""
    sample = u"hólá múndó"
    self.assertEqual(htmldecode(sample), sample)
def test_simple(self):
    """htmldecode() must return plain ASCII text with no entities unchanged."""
    self.assertEqual(htmldecode('hola mundo'), 'hola mundo')
def test_tilde(self):
    """htmldecode() must leave accented text without entities unchanged."""
    self.assertEqual(htmldecode(u'hólá múndó'), u'hólá múndó')
def test_html_encoded(self):
    """htmldecode() must return this sample unchanged."""
    # NOTE(review): the name suggests the input originally contained an HTML
    # entity (e.g. &aacute;) that appears already rendered here — confirm
    # against upstream history.
    self.assertEqual(htmldecode(u"á"), u"á")