Example #1
0
    def __add_scraped_requests_to_queue(self, queue_item, scraped_requests):
        """Convert the scraped requests to queue items, return them and also add them to the queue.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair that finished.
            scraped_requests list(:class:`nyawc.http.Request`): All the requests that were found during this request.

        Returns:
            list(:class:`nyawc.QueueItem`): The new queue items.

        """

        queued_items = []
        max_depth = self.__options.scope.max_depth

        for scraped_request in scraped_requests:
            # Apply the crawler options (headers, cookies, auth, etc.).
            HTTPRequestHelper.patch_with_options(scraped_request,
                                                 self.__options, queue_item)

            # Drop requests that fall outside the configured scope or that
            # are already present in the queue.
            in_scope = HTTPRequestHelper.complies_with_scope(
                queue_item, scraped_request, self.__options.scope)
            if not in_scope:
                continue
            if self.queue.has_request(scraped_request):
                continue

            # Drop requests that would exceed the configured maximum depth.
            scraped_request.depth = queue_item.request.depth + 1
            if max_depth is not None and scraped_request.depth > max_depth:
                continue

            queued_items.append(self.queue.add_request(scraped_request))

        return queued_items
Example #2
0
    def start_with(self, request):
        """Start the crawler using the given request.

        Args:
            request (:class:`nyawc.http.Request`): The startpoint for the crawler.

        """

        # Apply the crawler options (headers, cookies, auth, etc.) to the
        # startpoint, queue it, and kick off the crawl loop.
        HTTPRequestHelper.patch_with_options(request, self.__options)
        self.queue.add_request(request)

        self.__crawler_start()
Example #3
0
    def test_hash_is_always_the_same(self):
        """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""

        options = Options()
        queue = Queue(options)

        # Queue 100 requests that differ only in their cookies; cookies must
        # not influence the request hash, so they should all collapse into a
        # single queue item.
        for index in range(100):
            request = Request("https://example.ltd?1=1#2=2")
            HTTPRequestHelper.patch_with_options(request, options)
            request.cookies.set(
                name="tasty_cookie{}".format(index),
                value="yum",
                domain="example.ltd"
            )
            queue.add_request(request)

        self.assertEqual(queue.count_total, 1)
Example #4
0
    def start(self):
        """Start the crawler."""

        # Build the initial request from the target domain and apply the
        # configured options (headers, cookies, auth, etc.) to it.
        start_request = Request(self.__args.domain)
        HTTPRequestHelper.patch_with_options(start_request, self.__options)

        # Only crawl if the AngularJS version could be determined/set.
        if self.__set_angular_version(start_request):
            crawler = Crawler(self.__options)
            signal.signal(signal.SIGINT, self.__signal_handler)
            crawler.start_with(start_request)

        # Exit the process with the correct status code
        # (0 when vulnerable items were found, 1 otherwise).
        sys.exit(not self.__vulnerable_items)
Example #5
0
    def __request_finish(self, queue_item, new_requests):
        """Called when the crawler finished the given queued item.

        Args:
            queue_item (obj): The request/response pair that finished.
            new_requests list(obj): All the requests that were found during this request.

        """

        new_queue_items = []
        action = None

        # Errored/cancelled items keep their current status, their scraped
        # requests are discarded, and the `request_after_finish` callback is
        # not invoked for them (so `action` stays None below).
        if queue_item.status not in [
                QueueItem.STATUS_ERRORED, QueueItem.STATUS_CANCELLED
        ]:
            for new_request in new_requests:
                # Apply the crawler options (headers, cookies, auth, etc.).
                HTTPRequestHelper.patch_with_options(new_request,
                                                     self.__options,
                                                     queue_item)

                # Skip requests that fall outside the configured scope.
                if not HTTPRequestHelper.complies_with_scope(
                        queue_item, new_request, self.__options.scope):
                    continue

                # Skip requests that are already in the queue.
                if self.__queue.has_request(new_request):
                    continue

                # Skip requests that would exceed the configured max depth.
                new_request.depth = queue_item.request.depth + 1
                if self.__options.scope.max_depth is not None:
                    if new_request.depth > self.__options.scope.max_depth:
                        continue

                new_queue_item = self.__queue.add_request(new_request)
                new_queue_items.append(new_queue_item)

            # The item is moved to FINISHED before the callback runs, so the
            # callback sees the updated queue state.
            self.__queue.move(queue_item, QueueItem.STATUS_FINISHED)
            action = self.__options.callbacks.request_after_finish(
                self.__queue, queue_item, new_queue_items)

        # When a stop is already in progress, ignore the callback's verdict.
        if self.__stopping:
            return

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__crawler_stop()
            return

        # A missing action (including the errored/cancelled path above)
        # defaults to continuing the crawl.
        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__spawn_new_requests()
            return
Example #6
0
    def __get_browser(queue_item=None):
        """Get the PhantomJS browser.

        Args:
            queue_item (:class:`nyawc.QueueItem`): Use authentication/headers/cookies etc from this queue item (if given).

        Returns:
            obj: The PhantomJS Selenium object.

        """

        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        service = []

        if queue_item:

            # Add authentication header to request
            if queue_item.request.auth:
                queue_item.request.auth(queue_item.request)

            # Add cookie header to request
            if queue_item.request.cookies:
                cookie_string = HTTPRequestHelper.get_cookie_header(queue_item)
                queue_item.request.headers["Cookie"] = cookie_string

            # Add headers to PhantomJS. HTTP header names are case-insensitive
            # (RFC 7230), so compare them lowercased consistently.
            if queue_item.request.headers:
                for (key, value) in queue_item.request.headers.items():
                    if key.lower() == "user-agent":
                        capabilities["phantomjs.page.settings.userAgent"] = value
                    else:

                        # PhantomJS has issues with executing JavaScript on pages with GZIP encoding.
                        # See link for more information (https://github.com/detro/ghostdriver/issues/489).
                        if key.lower() == "accept-encoding" and "gzip" in value.lower():
                            continue

                        capabilities["phantomjs.page.customHeaders." + key] = value

            # Proxies
            if queue_item.request.proxies:
                service.extend(BrowserHelper.__proxies_to_service_args(queue_item.request.proxies))

        driver_path = BrowserHelper.__get_phantomjs_driver()
        return webdriver.PhantomJS(
            executable_path=driver_path,
            desired_capabilities=capabilities,
            service_args=service
        )