def _handle_textarea_tag_end(self, tag):
    """
    Handler for textarea end tag
    """
    SGMLParser._handle_textarea_tag_end(self, tag)

    if not self._text_area_tag_name:
        return

    if not self._text_area_data:
        return

    attrs = {'name': self._text_area_tag_name,
             'value': self._text_area_data,
             'type': INPUT_TYPE_TEXTAREA}

    if not self._forms:
        self._saved_inputs.append(attrs)
    else:
        form_params = self._forms[-1]
        form_params.add_field_by_attrs(attrs)

    self._text_area_tag_name = None
    self._text_area_data = None
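
# Note (added commentary, not original code): the `if not self._forms`
# branch above is what supports <textarea> elements that appear outside of
# any <form> tag; their attrs are buffered in self._saved_inputs and later
# re-dispatched by _handle_form_tag_start() once a form tag is opened.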
def _handle_select_tag_end(self, tag):
    """
    Handler for select end tag
    """
    SGMLParser._handle_select_tag_end(self, tag)

    if not self._forms:
        return

    if not self._select_input_name:
        return

    attrs = {'name': self._select_input_name,
             'values': list(self._select_option_values),
             'type': INPUT_TYPE_SELECT}

    # Work with the last form
    form_params = self._forms[-1]
    form_params.add_field_by_attrs(attrs)

    # Reset selects container
    self._select_option_values = set()
    self._select_input_name = None
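
# Note (added commentary, not original code): unlike the textarea handler,
# this one returns early when self._forms is empty, so a <select> found
# outside of any <form> tag is dropped instead of being queued in
# self._saved_inputs.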
def test_extract_emails_mailto(self):
    body = u'<a href="mailto:[email protected]">test</a>'
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    expected_res = {u'*****@*****.**'}
    self.assertEqual(p.get_emails(), expected_res)
def test_mailto_ignored_in_links(self):
    body = u'<a href="mailto:[email protected]">a</a>'
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    parsed, _ = p.references
    self.assertEqual(parsed, [])
def close(self):
    """
    Called by the parser when it ends
    """
    SGMLParser.close(self)

    # Don't call clear() here! That would call clear() on SGMLParser and
    # remove all the forms, references, etc.
    self._html_internals_clear()
def _handle_textarea_tag_inside_form(self, tag, tag_name, attrs):
    """
    Handler for textarea tag inside a form
    """
    SGMLParser._handle_textarea_tag_start(self, tag, tag_name, attrs)

    # Set the data and name
    self._text_area_data = tag.text
    self._text_area_tag_name = get_value_by_key(attrs, 'name', 'id')
def test_mailto_subject_body(self):
    body = u'<a href="mailto:[email protected]?subject=testing out mailto' \
           u'&body=Just testing">test</a>'
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    expected_res = {u'*****@*****.**'}
    self.assertEqual(p.get_emails(), expected_res)
def test_get_clear_text_body(self):
    html = 'header <b>ABC</b>-<b>DEF</b>-<b>XYZ</b> footer'
    clear_text = 'header ABC-DEF-XYZ footer'

    headers = Headers([('Content-Type', 'text/html')])
    r = build_http_response(self.url, html, headers)

    p = SGMLParser(r)
    p.parse()

    self.assertEquals(clear_text, p.get_clear_text_body())
def test_meta_tags(self):
    body = HTML_DOC % {'head': META_REFRESH + META_REFRESH_WITH_URL,
                       'body': ''}
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    self.assertEqual(2, len(p.meta_redirs))
    self.assertTrue("2;url=http://crawler.w3af.com/" in p.meta_redirs)
    self.assertTrue("600" in p.meta_redirs)
    self.assertEquals([URL('http://crawler.w3af.com/')], p.references[0])
def _handle_script_tag_start(self, tag, tag_name, attrs):
    """
    Handle the script tags
    """
    SGMLParser._handle_script_tag_start(self, tag, tag_name, attrs)

    if tag.text is not None:
        re_extract = ReExtract(tag.text.strip(),
                               self._base_url,
                               self._encoding)
        re_extract.parse()
        self._re_urls.update(re_extract.get_references())
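
# Note (added commentary, not original code): inline <script> bodies are not
# parsed as a DOM here; ReExtract appears to scan the raw script text for
# URL-looking strings, resolving them against self._base_url, and the results
# are accumulated in self._re_urls separately from tag/attribute references.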
def test_meta_tags_with_single_quotes(self):
    body = HTML_DOC % {'head': META_REFRESH + META_REFRESH_WITH_URL_AND_QUOTES,
                       'body': ''}
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    self.assertEqual(2, len(p.meta_redirs))
    self.assertIn("2;url='http://crawler.w3af.com/'", p.meta_redirs)
    self.assertIn("600", p.meta_redirs)
    self.assertEqual([URL('http://crawler.w3af.com/')], p.references[0])
def test_nested_with_text(self):
    body = '<html><a href="/abc">foo<div>bar</div></a></html>'
    url = URL('http://www.w3af.com/')
    headers = Headers()
    headers['content-type'] = 'text/html'
    resp = HTTPResponse(200, body, headers, url, url, charset='utf-8')

    p = SGMLParser(resp)

    tags = p.get_tags_by_filter(('a', 'b'), yield_text=True)
    tags = list(tags)

    self.assertEqual([Tag('a', {'href': '/abc'}, 'foo')], tags)
def test_none(self):
    body = '<html><a href="/abc">foo<div>bar</div></a></html>'
    url = URL('http://www.w3af.com/')
    headers = Headers()
    headers['content-type'] = 'text/html'
    resp = HTTPResponse(200, body, headers, url, url, charset='utf-8')

    p = SGMLParser(resp)

    tags = p.get_tags_by_filter(None)
    tags = list(tags)

    tag_names = [tag.name for tag in tags]
    self.assertEqual(tag_names, ['html', 'body', 'a', 'div'])
def test_reference_with_colon(self):
    body = """
    <html>
        <a href="d:url.html?id=13&subid=3">foo</a>
    </html>"""
    r = build_http_response(self.url, body)

    p = SGMLParser(r)
    p.parse()

    parsed_refs = p.references[0]

    #
    # Finding zero URLs is the correct behavior based on what
    # I've seen in Opera and Chrome.
    #
    self.assertEquals(0, len(parsed_refs))
def _handle_form_tag_start(self, tag, tag_name, attrs):
    """
    Handle the form tags.

    This method also looks if there are "pending inputs" in the
    self._saved_inputs list and parses them.
    """
    SGMLParser._handle_form_tag_start(self, tag, tag_name, attrs)

    method = attrs.get('method', 'GET').upper()
    action = attrs.get('action', None)
    form_encoding = attrs.get('enctype', DEFAULT_FORM_ENCODING)
    autocomplete = attrs.get('autocomplete', None)

    if action is None:
        action = self._source_url
    else:
        action = self._decode_url(action)
        try:
            action = self._base_url.url_join(action,
                                             encoding=self._encoding)
        except ValueError:
            # The URL in the action is invalid, the best thing we can do
            # is to guess, and our best guess is that the URL will be the
            # current one.
            action = self._source_url

    # Create the form object and store everything for later use
    form_params = FormParameters(encoding=self._encoding,
                                 method=method,
                                 action=action,
                                 form_encoding=form_encoding,
                                 attributes=attrs,
                                 hosted_at_url=self._source_url)
    form_params.set_autocomplete(autocomplete)
    self._forms.append(form_params)

    # Now I verify if there are any input tags that were found
    # outside the scope of a form tag
    for input_attrs in self._saved_inputs:
        # Parse them just like if they were found AFTER the
        # form tag opening
        self._handle_input_tag_inside_form(tag, 'input', input_attrs)

    # All parsed, remove them.
    self._saved_inputs = []
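
# Minimal, self-contained sketch (added for illustration, not w3af code) of
# the "pending inputs" flow implemented above: fields seen before any form
# tag are buffered and then flushed into the first form that opens. All names
# below (_PendingInputsSketch, on_input, on_form_start) are illustrative
# assumptions, not part of the parser's API.
class _PendingInputsSketch(object):
    def __init__(self):
        self._saved_inputs = []
        self._forms = []

    def on_input(self, attrs):
        # No form open yet: buffer the field for later
        if not self._forms:
            self._saved_inputs.append(attrs)
        else:
            self._forms[-1].append(attrs)

    def on_form_start(self):
        # Open a new form and flush any buffered fields into it
        form = []
        self._forms.append(form)
        for attrs in self._saved_inputs:
            form.append(attrs)
        self._saved_inputs = []

# Usage sketch:
#   p = _PendingInputsSketch()
#   p.on_input({'name': 'early'})   # buffered, no form open yet
#   p.on_form_start()               # flushes 'early' into the new form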
def test_parsed_references(self):
    # The *parsed* urls *must* come both from valid tags and tag attributes
    # Also invalid URLs must be ignored (like javascript instructions)
    body = """
    <html>
        <a href="/x.py?a=1" Invalid_Attr="/invalid_url.php">
        <form action="javascript:history.back(1)">
            <tagX href="/py.py"/>
        </form>
    </html>"""
    r = build_http_response(self.url, body)

    p = SGMLParser(r)
    p.parse()

    parsed_refs = p.references[0]

    self.assertEquals(1, len(parsed_refs))
    self.assertEquals('http://w3af.com/x.py?a=1',
                      parsed_refs[0].url_string)
def test_get_clear_text_body_encodings(self):
    raise SkipTest('Not sure why this one is failing :S')

    for lang_desc, (body, encoding) in TEST_RESPONSES.iteritems():
        encoding_header = 'text/html; charset=%s' % encoding
        headers = Headers([('Content-Type', encoding_header)])

        encoded_body = body.encode(encoding)
        r = build_http_response(self.url, encoded_body, headers)

        p = SGMLParser(r)
        p.parse()

        ct_body = p.get_clear_text_body()

        # These test strings don't really have tags, so they should be eq
        self.assertEqual(ct_body, body)
def __init__(self, http_resp):
    # An internal list to be used to save input tags found
    # outside of the scope of a form tag.
    self._saved_inputs = []

    # For <textarea> elems parsing
    self._text_area_tag_name = None
    self._text_area_data = None

    # Save for using in form parsing
    self._source_url = http_resp.get_url()

    self._re_urls = set()

    # For <select> and <option> parsing
    self._select_option_values = set()
    self._select_input_name = None

    # Call parent's __init__
    SGMLParser.__init__(self, http_resp)
def test_case_sensitivity(self):
    """
    Ensure handler methods are *always* called with lower-cased tag
    and attribute names
    """
    def islower(s):
        il = False
        if isinstance(s, basestring):
            il = s.islower()
        else:
            il = all(k.islower() for k in s)
        assert il, "'%s' is not lower-case" % s
        return il

    def start_wrapper(orig_start, tag):
        islower(tag.tag)
        islower(tag.attrib)
        return orig_start(tag)

    tags = (A_LINK_ABSOLUTE, INPUT_CHECKBOX_WITH_NAME, SELECT_WITH_NAME,
            TEXTAREA_WITH_ID_AND_DATA, INPUT_HIDDEN)
    ops = "lower", "upper", "title"

    for indexes in combinations(range(len(tags)), 2):

        body_elems = []

        for index, tag in enumerate(tags):
            ele = tag
            if index in indexes:
                ele = getattr(tag, choice(ops))()
            body_elems.append(ele)

        body = HTML_DOC % {'head': '', 'body': ''.join(body_elems)}

        resp = build_http_response(self.url, body)

        p = SGMLParser(resp)
        orig_start = p.start
        wrapped_start = partial(start_wrapper, orig_start)
        p.start = wrapped_start
        p.parse()
def test_get_emails_filter(self):
    resp = build_http_response(self.url, '')
    p = SGMLParser(resp)
    p._emails = {'*****@*****.**', '*****@*****.**'}

    self.assertEqual(p.get_emails(),
                     {'*****@*****.**', '*****@*****.**'})

    self.assertEqual(p.get_emails(domain='w3af.com'),
                     ['*****@*****.**'])

    self.assertEqual(p.get_emails(domain='not.com'),
                     ['*****@*****.**'])
def test_parser_attrs(self):
    body_content = HTML_DOC % {'head': '', 'body': ''}

    p = SGMLParser(build_http_response(self.url, body_content))

    # Assert parser has these attrs correctly initialized
    self.assertFalse(getattr(p, '_inside_form'))
    self.assertFalse(getattr(p, '_inside_select'))
    self.assertFalse(getattr(p, '_inside_text_area'))
    self.assertFalse(getattr(p, '_inside_script'))

    self.assertEquals(set(), getattr(p, '_tag_and_url'))
    self.assertEquals([], getattr(p, '_forms'))
    self.assertEquals([], getattr(p, '_comments_in_doc'))
    self.assertEquals([], getattr(p, '_meta_redirs'))
    self.assertEquals([], getattr(p, '_meta_tags'))
def test_get_clear_text_issue_4402(self):
    """
    :see: https://github.com/andresriancho/w3af/issues/4402
    """
    test_file_path = 'core/data/url/tests/data/encoding_4402.php'
    test_file = os.path.join(ROOT_PATH, test_file_path)
    body = file(test_file, 'rb').read()

    sample_encodings = [encoding for _, (_, encoding) in
                        TEST_RESPONSES.iteritems()]
    sample_encodings.extend(['', 'utf-8'])

    for encoding in sample_encodings:
        encoding_header = 'text/html; charset=%s' % encoding
        headers = Headers([('Content-Type', encoding_header)])

        r = build_http_response(self.url, body, headers)

        p = SGMLParser(r)
        p.parse()

        p.get_clear_text_body()
def __init__(self, http_response):
    self._select_tag_name = ''
    self._source_url = http_response.get_url()

    SGMLParser.__init__(self, http_response)
def test_baseurl(self):
    body = HTML_DOC % {'head': BASE_TAG, 'body': ''}
    resp = build_http_response(self.url, body)

    p = SGMLParser(resp)
    p.parse()

    self.assertEquals(URL('http://www.w3afbase.com/'), p._base_url)
def test_extract_emails_blank(self):
    resp = build_http_response(self.url, '')

    p = SGMLParser(resp)

    self.assertEqual(p.get_emails(), set())