def test_hash_different_encoded_and_decoded_values(self):
    """Ensure encoded and decoded values have a different hash."""
    queue = Queue(Options())

    queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
    queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))

    self.assertEqual(queue.count_total, 2)
def test_hash_different_query_order(self):
    """Ensure query parameters in different orders are treated as one queue item."""
    queue = Queue(Options())

    queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
    queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
    queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))

    self.assertEqual(queue.count_total, 1)
def test_hash_option_subdomain_must_not_match(self):
    """Ensure different subdomains are treated as one queue item if `subdomain_must_match` is False."""
    options = Options()
    options.scope.subdomain_must_match = False

    queue = Queue(options)
    queue.add_request(Request("https://www.example.ltd"))
    queue.add_request(Request("https://webmail.example.ltd"))
    queue.add_request(Request("https://subdomain.example.ltd"))

    self.assertEqual(queue.count_total, 1)
def test_hash_option_protocol_must_not_match(self):
    """Ensure different protocols are treated as one queue item if `protocol_must_match` is False."""
    options = Options()
    options.scope.protocol_must_match = False

    queue = Queue(options)
    queue.add_request(Request("https://example.ltd"))
    queue.add_request(Request("http://example.ltd"))
    queue.add_request(Request("ftp://example.ltd"))

    self.assertEqual(queue.count_total, 1)
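# Hedged companion sketch (not part of the original suite): the tests above only cover the
# case where scope matching is disabled. Assuming the inverse behaviour implied by their
# docstrings, leaving `subdomain_must_match` at True should keep different subdomains as
# separate queue items. The test name and expected count are assumptions, not source facts.
def test_hash_option_subdomain_must_match_sketch(self):
    """Hypothetical counterpart: different subdomains stay separate when subdomains must match."""
    options = Options()
    options.scope.subdomain_must_match = True

    queue = Queue(options)
    queue.add_request(Request("https://www.example.ltd"))
    queue.add_request(Request("https://webmail.example.ltd"))

    self.assertEqual(queue.count_total, 2)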
def start(self):
    """Start the crawler."""
    crawler = Crawler(self.__options)
    signal.signal(signal.SIGINT, self.__signal_handler)

    startpoint = Request(self.__args.domain)
    crawler.start_with(startpoint)
def crawl_url(self, url):
    """Crawl the given URL.

    :param url: The URL to crawl.
    :return: A list of URLs found while crawling.
    """
    self.crawled = []

    parsed = urlparse(url)
    self.crawler.start_with(Request(parsed.scheme + "://" + parsed.netloc))

    return self.crawled
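# Illustrative standalone check (not from the source) of the scheme + netloc reduction used
# above: the crawler is started from the root of the given URL's origin. Uses only the
# standard library; the example URL is made up.
from urllib.parse import urlparse

parsed = urlparse("https://example.ltd/path/page?x=1")
print(parsed.scheme + "://" + parsed.netloc)  # -> https://example.ltd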
def test_regex_url_count(self):
    """Test if the amount of URLs found complies with the expected amount."""
    html = ""

    for url in self.__urls:
        html += "\n" + url["test"]

    finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
    matches = finder.get_requests_from_content(self.__host, html)

    self.assertEqual(len(matches), 30)
def crawl(self, url):
    """Crawl the domain of the given URL and return the links that were found."""
    if self.crawler is None:
        print("Crawler is not set up")
        return

    parsedurl = urlparse(url)
    domain = parsedurl.scheme + "://" + parsedurl.netloc

    self.links = []
    self.crawler.start_with(Request(domain))

    return self.links
def start(self):
    """Start the crawler."""
    if self.__set_angular_version():
        crawler = Crawler(self.__options)
        signal.signal(signal.SIGINT, self.__signal_handler)

        startpoint = Request(self.__args.domain)
        crawler.start_with(startpoint)

    # Exit the process with the correct status code
    sys.exit(not self.__vulnerable_items)
def test_regex_url_matches(self):
    """Test if all the URLs match the found URLs."""
    for url in self.__urls:
        finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
        requests = finder.get_requests_from_content(self.__host, url["test"])

        if url["must_pass"]:
            self.assertEqual(len(requests), 1)
            self.assertEqual(requests[0].url, url["url"])
        else:
            self.assertEqual(len(requests), 0)
def test_hash_is_always_the_same(self):
    """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""
    options = Options()
    queue = Queue(options)

    for index in range(0, 100):
        request = Request("https://example.ltd?1=1#2=2")
        HTTPRequestHelper.patch_with_options(request, options)
        request.cookies.set(name="tasty_cookie{}".format(index), value="yum", domain="example.ltd")
        queue.add_request(request)

    self.assertEqual(queue.count_total, 1)
def get_requests_from_content(self, host, content):
    """Find new requests from the given content.

    Args:
        host (str): The parent request URL.
        content (obj): The HTML content.

    Returns:
        list(obj): Requests that were found.
    """
    soup = self.__queue_item.get_soup_response()

    a_elements = soup.find_all("a", href=True)
    link_elements = soup.find_all("link", href=True)
    script_elements = soup.find_all("script", src=True)

    found_requests = []

    for a_element in a_elements:
        found_url = self.__trim_grave_accent(a_element["href"])

        if not URLHelper.is_mailto(found_url):
            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    for link_element in link_elements:
        found_url = self.__trim_grave_accent(link_element["href"])

        if not URLHelper.is_mailto(found_url):
            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    for script_element in script_elements:
        found_url = self.__trim_grave_accent(script_element["src"])

        if not URLHelper.is_mailto(found_url):
            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    return found_requests
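# Minimal standalone sketch (assumes BeautifulSoup is installed; the HTML snippet is made up)
# of the extraction pattern used above: passing href=True / src=True restricts find_all()
# to elements that actually carry that attribute.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/a">x</a><a name="no-href">y</a><script src="app.js"></script>', "html.parser")
print([e["href"] for e in soup.find_all("a", href=True)])     # -> ['/a']
print([e["src"] for e in soup.find_all("script", src=True)])  # -> ['app.js']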
def test_crawl_website(self):
    """Crawl the website in `test/` and check if the count is correct."""
    if not self.travis:
        print("\n\nPlease note that the 'TestSite' unit test did not run.")
        print("It will only run in Travis since it needs a webserver.")
        return

    options = Options()
    options.callbacks.crawler_after_finish

    crawler = Crawler(options)
    crawler.start_with(Request("http://localhost/"))

    self.assertEqual(crawler.queue.count_total, 16)
def test_soup_url_count(self):
    """Test if the amount of URLs found complies with the expected amount."""
    html = ""

    for url in self.__urls:
        html += "\n" + url["test"]

    request = Request(self.__host)
    response = Response()
    response.text = html

    finder = HTMLSoupFormScraper(Options(), QueueItem(request, response))
    matches = finder.get_requests()

    self.assertEqual(len(matches), 4)
def test_soup_url_matches(self):
    """Test if all the URLs match the found URLs."""
    for url in self.__urls:
        request = Request(self.__host)
        response = Response()
        response.text = url["test"]

        finder = SoupFormScraper(Options(), QueueItem(request, response))
        requests = finder.get_requests()

        if url["must_pass"]:
            self.assertEqual(requests[0].url, url["url"])
            self.assertEqual(len(requests), 1)
        else:
            self.assertEqual(len(requests), 0)
def test_version_detect(self):
    """Check if a single (stable) AngularJS version is detected by ACSTIS."""
    server = LocalAngularServer()
    server.start(LocalAngularServer.HANDLER_VULNERABLE_TEST, {"asset": "https://code.angularjs.org/1.5.8/angular.min.js"})

    domain = "http://" + server.url + "?vulnerable=payload"

    version = BrowserHelper.javascript(
        QueueItem(Request(domain), Response(domain)),
        "return angular.version.full"
    )

    server.stop()

    self.assertEqual("1.5.8", version)
def __get_request(self, host, soup):
    """Build a request from the given soup form.

    Args:
        host (str): The URL of the current queue item.
        soup (obj): The BeautifulSoup form.

    Returns:
        :class:`nyawc.http.Request`: The new Request.
    """
    if soup.has_attr("action"):
        url = URLHelper.make_absolute(host, self.__trim_grave_accent(soup["action"]))
    else:
        url = host

    method_original = soup["method"] if soup.has_attr("method") else "get"
    method = "post" if method_original.lower() == "post" else "get"
    data = self.__get_form_data(soup)

    return Request(url, method, data)
def derived_get_requests(self):
    """Get all the new requests that were found in the response.

    Returns:
        list(:class:`nyawc.http.Request`): A list of new requests that were found.
    """
    attributes = {
        "src": True,
        "href": True,
        "link": True,
        "script": True,
        "url": True
    }

    host = self.queue_item.response.url
    soup = self.queue_item.get_soup_response()
    base_element = soup.find("base", href=True)
    elements = soup.select("[{}]".format("],[".join(attributes.keys())))

    # Always use the URL from the base element if it exists.
    # https://www.w3schools.com/tags/tag_base.asp
    if base_element:
        host = URLHelper.make_absolute(host, base_element["href"])

    found_requests = []

    for element in elements:
        for attribute in attributes.keys():
            if not element.has_attr(attribute):
                continue

            found_url = self.__trim_grave_accent(element[attribute])

            if URLHelper.is_mailto(found_url):
                continue

            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    return found_requests
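# Quick illustrative check (not from the source) of the CSS selector the scraper builds above:
# joining the attribute names with "],[" and wrapping the result in brackets yields a selector
# that matches any element carrying at least one of those attributes.
attributes = {"src": True, "href": True, "link": True, "script": True, "url": True}
print("[{}]".format("],[".join(attributes.keys())))  # e.g. [src],[href],[link],[script],[url]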
def derived_get_requests(self):
    """Get all the new requests that were found in the response.

    Returns:
        list(:class:`nyawc.http.Request`): A list of new requests that were found.
    """
    host = self.queue_item.response.url
    content = self.queue_item.response.text

    found_requests = []

    for expression in self.__expressions:
        matches = re.findall(expression["raw"], content)

        for match in matches:
            found_url = match[expression["group"]]
            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    return found_requests
def get_requests_from_content(self, host, content):
    """Find new requests from the given content.

    Args:
        host (str): The parent request URL.
        content (obj): The HTML content.

    Returns:
        list(obj): Requests that were found.
    """
    found_requests = []

    for expression in self.__expressions:
        matches = re.findall(expression["raw"], content)

        for match in matches:
            found_url = match[expression["group"]]
            absolute_url = URLHelper.make_absolute(host, found_url)
            found_requests.append(Request(absolute_url))

    return found_requests
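# Hedged illustration (not taken from the source) of the shape assumed for entries in
# `self.__expressions`, based on how they are used above: a raw pattern with multiple capture
# groups (so re.findall() yields tuples) plus the index of the group that holds the URL.
# The concrete pattern and HTML below are made up for demonstration.
import re

example_expression = {
    "raw": r"""(href|src)\s*=\s*["']([^"']+)["']""",  # two groups -> findall yields tuples
    "group": 1                                        # index of the URL group within each tuple
}

for match in re.findall(example_expression["raw"], '<a href="/about">About</a>'):
    print(match[example_expression["group"]])  # -> /about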
def __set_angular_version(self):
    """Find and set the AngularJS version as a class attribute.

    Returns:
        bool: True if found and set, False otherwise.
    """
    if self.__args.angular_version:
        self.__angular_version = self.__args.angular_version
        colorlog.getLogger().info("Found AngularJS version " + self.__angular_version + " in the arguments.")
        return True

    colorlog.getLogger().info("Looking for AngularJS version using a headless browser.")
    colorlog.getLogger().info("Waiting until DOM is completely loaded.")

    self.__angular_version = BrowserHelper.javascript(
        QueueItem(Request(self.__args.domain), Response(self.__args.domain)),
        "return angular.version.full"
    )

    if self.__angular_version:
        colorlog.getLogger().info("Found AngularJS version " + self.__angular_version + ".")
        return True

    colorlog.getLogger().error("Couldn't determine the AngularJS version (`angular.version.full` threw an exception).")
    colorlog.getLogger().error("If you are certain this URL uses AngularJS, specify the version via the `--angular-version` argument.")

    return False
    # 'http': 'http://*****:*****@host:port',
    # 'https': 'https://*****:*****@host:port',

    # SOCKS
    # 'http': 'socks5://user:pass@host:port',
    # 'https': 'socks5://user:pass@host:port'
}

options.identity.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})

# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
options.performance.max_threads = 10  # The maximum amount of simultaneous threads to use for crawling. Default is 8.
options.performance.request_timeout = 15  # The request timeout in seconds (throws an exception if exceeded). Default is 30.

# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
options.routing.minimum_threshold = 4  # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
options.routing.routes = [
    # The regular expressions that represent routes that should not be crawled more often than the minimum threshold. Default is an empty array.
    r"^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"  # Only crawl /blog/{some-blog-alias} 4 times.
]

# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
options.misc.debug = False  # If debug is enabled extra information will be logged to the console. Default is False.
options.misc.verify_ssl_certificates = True  # If verification is enabled all SSL certificates will be checked for validity. Default is True.
options.misc.trusted_certificates = None  # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
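# Quick illustrative check (not part of the original script) that the routing regex above
# matches an individual blog post URL but not the blog index, which is what limits repeated
# crawling of /blog/{some-blog-alias} pages to the minimum threshold.
import re

route = r"^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
print(bool(re.match(route, "https://finnwea.com/blog/some-blog-alias/")))  # -> True
print(bool(re.match(route, "https://finnwea.com/blog/")))                  # -> False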
            path_scripts[path].append(query)
            open(domain + "_ServerScripts_Links.json", "w").write(str(json.dumps(path_scripts)))  # JSON format was not valid
            print(" ServerScripts > {}".format(queue_item.request.url))
        else:
            if "?" in queue_item.request.url:
                path = queue_item.request.url[:queue_item.request.url.find("?")]
                query = queue_item.request.url[queue_item.request.url.find("?"):]
            else:
                path = queue_item.request.url

            path_other[path].append(query)
            open(domain + "_Others_Links.json", "w").write(str(json.dumps(path_other)))
            print(" Others > {}".format(queue_item.request.url))

    return CrawlerActions.DO_CONTINUE_CRAWLING

options.callbacks.crawler_before_start = cb_crawler_before_start  # Called before the crawler starts crawling. Default is a null route.
options.callbacks.crawler_after_finish = cb_crawler_after_finish  # Called after the crawler finished crawling. Default is a null route.
options.callbacks.request_before_start = cb_request_before_start  # Called before the crawler starts a new request. Default is a null route.
options.callbacks.request_after_finish = cb_request_after_finish  # Called after the crawler finishes a request. Default is a null route.

crawler = Crawler(options)
crawler.start_with(Request(host))