Example No. 1
    def test_hash_different_encoded_and_decoded_values(self):
        """Ensure encoded and decoded values have a different hash."""

        queue = Queue(Options())

        queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
        queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))

        self.assertEqual(queue.count_total, 2)
Example No. 2
    def test_hash_different_query_order(self):
        """Ensure query parameters in different orders are treated as one queue item."""

        queue = Queue(Options())

        queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
        queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
        queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))

        self.assertEqual(queue.count_total, 1)
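
These hash tests pass because the queue fingerprints each request in a canonical form before counting it. A rough standard-library sketch of the idea (illustrative only, not nyawc's internal hashing code):

from urllib.parse import parse_qsl, urlencode, urlparse

def canonical_url(url):
    """Sort the query parameters so that equivalent URLs compare equal."""
    parts = urlparse(url)
    sorted_query = urlencode(sorted(parse_qsl(parts.query)))
    return parts._replace(query=sorted_query).geturl()

assert canonical_url("https://www.example.ltd?b=b&c=c&a=a") == canonical_url("https://www.example.ltd?a=a&b=b&c=c")
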
Example No. 3
    def test_hash_option_subdomain_must_not_match(self):
        """Ensure different subdomains are treated as one queue item if subdomains must match is False."""

        options = Options()
        options.scope.subdomain_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://www.example.ltd"))
        queue.add_request(Request("https://webmail.example.ltd"))
        queue.add_request(Request("https://subdomain.example.ltd"))

        self.assertEqual(queue.count_total, 1)
Example No. 4
    def test_hash_option_protocol_must_not_match(self):
        """Ensure different protocols are treated as one queue item if protocols must match is False."""

        options = Options()
        options.scope.protocol_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://example.ltd"))
        queue.add_request(Request("http://example.ltd"))
        queue.add_request(Request("ftp://example.ltd"))

        self.assertEqual(queue.count_total, 1)
Example No. 5
    def start(self):
        """Start the crawler."""

        crawler = Crawler(self.__options)
        signal.signal(signal.SIGINT, self.__signal_handler)

        startpoint = Request(self.__args.domain)
        crawler.start_with(startpoint)
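
Example No. 5 wires start_with() into a larger command-line class. A minimal self-contained equivalent looks roughly like the snippet below (import paths taken from the nyawc documentation; the target URL is a placeholder):

from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request

# Start a crawl with default options from a single URL.
crawler = Crawler(Options())
crawler.start_with(Request("https://example.ltd/"))
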
Example No. 6
    def crawl_url(self, url):
        """Crawl the given URL.

        :param url: The URL to start crawling from.
        :return: The list of URLs found while crawling.
        """
        self.crawled = []
        parsed = urlparse(url)
        self.crawler.start_with(Request(parsed.scheme + "://" + parsed.netloc))
        return self.crawled
Example No. 7
    def test_regex_url_count(self):
        """Test if the amount of URLs found complies with the expected amount."""

        html = ""
        for url in self.__urls:
            html += "\n" + url["test"]

        finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
        matches = finder.get_requests_from_content(self.__host, html)

        self.assertEqual(len(matches), 30)
Example No. 8
    def crawl(self, url):
        if self.crawler is None:
            print "Cralwer is not setted up"
            return

        parsedurl = urlparse(url)
        domain = parsedurl.scheme + "://" + parsedurl.netloc

        self.links = []
        self.crawler.start_with(Request(domain))
        return self.links
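
The crawl() method above returns self.links, which is presumably populated by a callback defined elsewhere in the class (not shown in this snippet). Under that assumption, a sketch of how such a collector is typically wired up with nyawc's request_after_finish callback:

from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.CrawlerActions import CrawlerActions

class LinkCollector(object):
    """Hypothetical wrapper that records every crawled URL."""

    def __init__(self):
        self.links = []
        options = Options()
        options.callbacks.request_after_finish = self.__request_after_finish
        self.crawler = Crawler(options)

    def __request_after_finish(self, queue_item, new_queue_items):
        # Record the finished request and keep crawling.
        self.links.append(queue_item.request.url)
        return CrawlerActions.DO_CONTINUE_CRAWLING
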
Example No. 9
    def start(self):
        """Start the crawler."""

        if self.__set_angular_version():
            crawler = Crawler(self.__options)
            signal.signal(signal.SIGINT, self.__signal_handler)

            startpoint = Request(self.__args.domain)
            crawler.start_with(startpoint)

        # Exit the process with the correct status code
        sys.exit(not self.__vulnerable_items)
Example No. 10
    def test_regex_url_matches(self):
        """Test if all the URLs match the found URLs."""
        
        for url in self.__urls:
            finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
            requests = finder.get_requests_from_content(self.__host, url["test"])

            if url["must_pass"]:
                self.assertEqual(len(requests), 1)
                self.assertEqual(requests[0].url, url["url"])
            else:
                self.assertEqual(len(requests), 0)
Example No. 11
    def test_hash_is_always_the_same(self):
        """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""

        options = Options()
        queue = Queue(options)

        for index in range(0, 100):
            request = Request("https://example.ltd?1=1#2=2")
            HTTPRequestHelper.patch_with_options(request, options)
            request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd')
            queue.add_request(request)

        self.assertEqual(queue.count_total, 1)
Example No. 12
    def get_requests_from_content(self, host, content):
        """Find new requests from the given content.

        Args:
            host (str): The parent request URL.
            content (obj): The HTML content.

        Returns:
            list(obj): Requests that were found.

        """

        soup = self.__queue_item.get_soup_response()
        a_elements = soup.find_all("a", href=True)
        link_elements = soup.find_all("link", href=True)
        script_elements = soup.find_all("script", src=True)

        found_requests = []

        for a_element in a_elements:
            found_url = self.__trim_grave_accent(a_element["href"])
            if not URLHelper.is_mailto(found_url):
                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        for link_element in link_elements:
            found_url = self.__trim_grave_accent(link_element["href"])
            if not URLHelper.is_mailto(found_url):
                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        for script_element in script_elements:
            found_url = self.__trim_grave_accent(script_element["src"])
            if not URLHelper.is_mailto(found_url):
                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        return found_requests
Example No. 13
    def test_crawl_website(self):
        """Crawl the website in `test/` and check if the count is correct."""

        if not self.travis:
            print("\n\nPlease note that the 'TestSite' unit test did not run.")
            print("It will only run in Travis since it needs a webserver.")
            return

        options = Options()
        options.callbacks.crawler_after_finish
        crawler = Crawler(options)
        crawler.start_with(Request("http://localhost/"))

        self.assertEqual(crawler.queue.count_total, 16)
Example No. 14
    def test_soup_url_count(self):
        """Test if the amount of URLs found complies with the expected amount."""

        html = ""
        for url in self.__urls:
            html += "\n" + url["test"]

        request = Request(self.__host)
        response = Response()
        response.text = html

        finder = HTMLSoupFormScraper(Options(), QueueItem(request, response))
        matches = finder.get_requests()

        self.assertEqual(len(matches), 4)
Example No. 15
    def test_soup_url_matches(self):
        """Test if all the URLs match the found URLs."""

        for url in self.__urls:
            request = Request(self.__host)
            response = Response()
            response.text = url["test"]

            finder = SoupFormScraper(Options(), QueueItem(request, response))
            requests = finder.get_requests()

            if url["must_pass"]:
                self.assertEqual(requests[0].url, url["url"])
                self.assertEqual(len(requests), 1)
            else:
                self.assertEqual(len(requests), 0)
Example No. 16
    def test_version_detect(self):
        """Check if a single (stable) AngularJS version is detected by ACSTIS."""

        server = LocalAngularServer()
        server.start(LocalAngularServer.HANDLER_VULNERABLE_TEST, {"asset": "https://code.angularjs.org/1.5.8/angular.min.js"})

        domain = "http://" + server.url + "?vulnerable=payload"

        version = BrowserHelper.javascript(
            QueueItem(Request(domain), Response(domain)),
            "return angular.version.full"
        )

        server.stop()

        self.assertEqual("1.5.8", version)
Example No. 17
    def __get_request(self, host, soup):
        """Build a request from the given soup form.

        Args:
            host (str): The URL of the current queue item.
            soup (obj): The BeautifulSoup form.

        Returns:
            :class:`nyawc.http.Request`: The new Request.

        """

        url = URLHelper.make_absolute(host, self.__trim_grave_accent(soup["action"])) if soup.has_attr("action") else host
        method_original = soup["method"] if soup.has_attr("method") else "get"
        method = "post" if method_original.lower() == "post" else "get"
        data = self.__get_form_data(soup)

        return Request(url, method, data)
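
For illustration, a hypothetical form like the one below would make __get_request build a POST request to the absolute action URL, with the input fields gathered by __get_form_data as the request data:

from bs4 import BeautifulSoup

html = '<form action="/search" method="POST"><input type="text" name="q" value="test" /></form>'
form = BeautifulSoup(html, "html.parser").find("form")

# With host "https://example.ltd/page", the resulting request would roughly be:
#   Request("https://example.ltd/search", "post", {"q": "test"})
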
Example No. 18
    def derived_get_requests(self):
        """Get all the new requests that were found in the response.

        Returns:
            list(:class:`nyawc.http.Request`): A list of new requests that were found.

        """

        attributes = {
            "src": True,
            "href": True,
            "link": True,
            "script": True,
            "url": True
        }

        host = self.queue_item.response.url
        soup = self.queue_item.get_soup_response()
        base_element = soup.find("base", href=True)
        elements = soup.select("[{}]".format("],[".join(attributes.keys())))

        # Always use the URL from the base element if it exists.
        # https://www.w3schools.com/tags/tag_base.asp
        if base_element:
            host = URLHelper.make_absolute(host, base_element["href"])

        found_requests = []

        for element in elements:
            for attribute in attributes.keys():
                if not element.has_attr(attribute):
                    continue

                found_url = self.__trim_grave_accent(element[attribute])

                if URLHelper.is_mailto(found_url):
                    continue

                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        return found_requests
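
The base element handling above follows standard URL resolution: once a <base href> is present, relative URLs resolve against it rather than the page URL. The same behaviour can be reproduced with the standard library (values are illustrative):

from urllib.parse import urljoin

page_url = "https://example.ltd/blog/post"
base_href = "https://cdn.example.ltd/assets/"

# Resolve the base element against the page URL, then the link against that base.
host = urljoin(page_url, base_href)
print(urljoin(host, "app.js"))  # https://cdn.example.ltd/assets/app.js
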
Example No. 19
    def derived_get_requests(self):
        """Get all the new requests that were found in the response.

        Returns:
            list(:class:`nyawc.http.Request`): A list of new requests that were found.

        """

        host = self.queue_item.response.url
        content = self.queue_item.response.text

        found_requests = []

        for expression in self.__expressions:
            matches = re.findall(expression["raw"], content)

            for match in matches:
                found_url = match[expression["group"]]
                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        return found_requests
Example No. 20
    def get_requests_from_content(self, host, content):
        """Find new requests from the given content.

        Args:
            host (str): The parent request URL.
            content (obj): The HTML content.

        Returns:
            list(obj): Requests that were found.

        """

        found_requests = []

        for expression in self.__expressions:
            matches = re.findall(expression["raw"], content)

            for match in matches:
                found_url = match[expression["group"]]
                absolute_url = URLHelper.make_absolute(host, found_url)
                found_requests.append(Request(absolute_url))

        return found_requests
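
The shape of self.__expressions is not shown here, but the loop implies each entry holds a regular expression ("raw") and the index of the capture group containing the URL ("group"). A hypothetical entry could look like this:

# Hypothetical entry: re.findall() returns tuples because the pattern has two
# capture groups, and "group" selects the one that holds the URL.
example_expression = {
    "raw": r"""(src|href)\s*=\s*["']([^"']+)["']""",
    "group": 1,
}
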
Example No. 21
    def __set_angular_version(self):
        """Find and set the AngularJS version as class attribute

        Returns:
            str: True if found and set, False otherwise.

        """

        if self.__args.angular_version:
            self.__angular_version = self.__args.angular_version
            colorlog.getLogger().info("Found AngularJS version " +
                                      self.__angular_version +
                                      " in the arguments.")
            return True

        colorlog.getLogger().info(
            "Looking for AngularJS version using a headless browser.")
        colorlog.getLogger().info("Waiting until DOM is completely loaded.")

        self.__angular_version = BrowserHelper.javascript(
            QueueItem(Request(self.__args.domain),
                      Response(self.__args.domain)),
            "return angular.version.full")

        if self.__angular_version:
            colorlog.getLogger().info("Found AngularJS version " +
                                      self.__angular_version + ".")
            return True

        colorlog.getLogger().error(
            "Couldn't determine the AngularJS version (`angular.version.full` threw an exception)."
        )
        colorlog.getLogger().error(
            "If you are certain this URL uses AngularJS, specify the version via the `--angular-version` argument."
        )
        return False
Example No. 22
    # 'http': 'http://*****:*****@host:port',
    # 'https': 'https://*****:*****@host:port',

    # SOCKS
    # 'http': 'socks5://user:pass@host:port',
    # 'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})

# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
options.performance.max_threads = 10  # The maximum amount of simultaneous threads to use for crawling. Default is 8.
options.performance.request_timeout = 15  # The request timeout in seconds (throws an exception if exceeded). Default is 30.

# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
options.routing.minimum_threshold = 4  # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
options.routing.routes = [
    # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
    r"^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"  # Only crawl /blog/{some-blog-alias} 4 times.
]

# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
options.misc.debug = False  # If debug is enabled extra information will be logged to the console. Default is False.
options.misc.verify_ssl_certificates = True  # If verification is enabled all SSL certificates will be checked for validity. Default is True.
options.misc.trusted_certificates = None  # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
Example No. 23
        path_scripts[path].append(query)

        open(domain + "_ServerScripts_Links.json", "w").write(
            str(json.dumps(path_scripts)))  # Json format was not valid
        print(" ServerScripts > {}".format(queue_item.request.url))

    else:

        if ("?" in queue_item.request.url):
            path = queue_item.request.url[:queue_item.request.url.find("?")]
            query = queue_item.request.url[queue_item.request.url.find("?"):]
        else:
            path = queue_item.request.url

        path_other[path].append(query)

        open(domain + "_Others_Links.json",
             "w").write(str(json.dumps(path_other)))
        print(" Others> {}".format(queue_item.request.url))

    return CrawlerActions.DO_CONTINUE_CRAWLING


options.callbacks.crawler_before_start = cb_crawler_before_start  # Called before the crawler starts crawling. Default is a null route.
options.callbacks.crawler_after_finish = cb_crawler_after_finish  # Called after the crawler finished crawling. Default is a null route.
options.callbacks.request_before_start = cb_request_before_start  # Called before the crawler starts a new request. Default is a null route.
options.callbacks.request_after_finish = cb_request_after_finish  # Called after the crawler finishes a request. Default is a null route.

crawler = Crawler(options)
crawler.start_with(Request(host))
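
Only part of one callback body survives in this fragment. Minimal stubs matching the four assignments above, with signatures as documented for nyawc, might look like the following:

from nyawc.CrawlerActions import CrawlerActions

def cb_crawler_before_start():
    print("Crawler started.")

def cb_crawler_after_finish(queue):
    print("Crawler finished, {} requests in total.".format(queue.count_total))

def cb_request_before_start(queue, queue_item):
    # Return DO_SKIP_TO_NEXT to skip this request or DO_STOP_CRAWLING to abort.
    return CrawlerActions.DO_CONTINUE_CRAWLING

def cb_request_after_finish(queue_item, new_queue_items):
    return CrawlerActions.DO_CONTINUE_CRAWLING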