Example #1
    def start(self):
        """Start the crawler."""

        crawler = Crawler(self.__options)
        signal.signal(signal.SIGINT, self.__signal_handler)

        startpoint = Request(self.__args.domain)
        crawler.start_with(startpoint)
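Examples #1 and #2 are excerpted from a larger class, so their imports and surrounding context are not shown. Below is a minimal self-contained sketch of the same pattern; the module paths follow nyawc's README and should be treated as assumptions for your installed version.

import signal
import sys

from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request

def signal_handler(signum, frame):
    # Let Ctrl+C stop the crawl cleanly instead of leaving threads behind.
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

crawler = Crawler(Options())
crawler.start_with(Request("https://example.com/"))  # Placeholder start URL.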
Example #2
    def start(self):
        """Start the crawler."""

        if self.__set_angular_version():
            crawler = Crawler(self.__options)
            signal.signal(signal.SIGINT, self.__signal_handler)

            startpoint = Request(self.__args.domain)
            crawler.start_with(startpoint)

        # Exit the process with the correct status code:
        # 0 if vulnerable items were found, 1 otherwise.
        sys.exit(not self.__vulnerable_items)
Example #3
    def test_crawl_website(self):
        """Crawl the website in `test/` and check if the count is correct."""

        if not self.travis:
            print("\n\nPlease note that the 'TestSite' unit test did not run.")
            print("It will only run in Travis since it needs a webserver.")
            return

        options = Options()
        crawler = Crawler(options)
        crawler.start_with(Request("http://localhost/"))

        self.assertEqual(crawler.queue.count_total, 16)
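The test above only counts requests, but nyawc's behaviour is customised through callbacks (Example #5 below assigns four of them). Here is a hedged sketch of the callback signatures as documented in the nyawc README; verify them against your installed version.

from nyawc.CrawlerActions import CrawlerActions

def cb_crawler_before_start():
    print("Crawler started.")

def cb_crawler_after_finish(queue):
    # The queue exposes counters such as count_total (used in the test above).
    print("Crawler finished, {} requests in total.".format(queue.count_total))

def cb_request_before_start(queue, queue_item):
    # DO_SKIP_TO_NEXT and DO_STOP_CRAWLING are the other documented actions.
    return CrawlerActions.DO_CONTINUE_CRAWLING

def cb_request_after_finish(queue, queue_item, new_queue_items):
    print("Crawled {}.".format(queue_item.request.url))
    return CrawlerActions.DO_CONTINUE_CRAWLING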
Example #4
options.identity.proxies = {
    # Basic authentication
    # 'http': 'http://user:pass@host:port',
    # 'https': 'https://user:pass@host:port',

    # SOCKS
    # 'http': 'socks5://user:pass@host:port',
    # 'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})

# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
options.performance.max_threads = 10  # The maximum amount of simultaneous threads to use for crawling. Default is 8.
options.performance.request_timeout = 15  # The request timeout in seconds (throws an exception if exceeded). Default is 30.

# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
options.routing.minimum_threshold = 4  # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
options.routing.routes = [
    # Regular expressions for routes that should not be crawled more often than the minimum threshold. Default is an empty array.
    r"^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"  # Only crawl /blog/{some-blog-alias} 4 times.
]

# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
options.misc.debug = False  # If debug is enabled extra information will be logged to the console. Default is False.
options.misc.verify_ssl_certificates = True  # If verification is enabled all SSL certificates will be checked for validity. Default is True.
options.misc.trusted_certificates = None  # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
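One caveat on the commented-out proxy settings in this example: nyawc sends its traffic through the requests library, so the socks5:// scheme only works if the optional SOCKS dependency is installed. This is an assumption based on requests' behaviour, not something the example states.

pip install "requests[socks]"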
Example #5
        path_scripts[path].append(query)

        # Rewrite the whole file on every request so the JSON stays valid.
        with open(domain + "_ServerScripts_Links.json", "w") as json_file:
            json_file.write(json.dumps(path_scripts))
        print(" ServerScripts > {}".format(queue_item.request.url))

    else:

        if ("?" in queue_item.request.url):
            path = queue_item.request.url[:queue_item.request.url.find("?")]
            query = queue_item.request.url[queue_item.request.url.find("?"):]
        else:
            path = queue_item.request.url

        path_other[path].append(query)

        open(domain + "_Others_Links.json",
             "w").write(str(json.dumps(path_other)))
        print(" Others> {}".format(queue_item.request.url))

    return CrawlerActions.DO_CONTINUE_CRAWLING


options.callbacks.crawler_before_start = cb_crawler_before_start  # Called before the crawler starts crawling. Default is a null route.
options.callbacks.crawler_after_finish = cb_crawler_after_finish  # Called after the crawler finished crawling. Default is a null route.
options.callbacks.request_before_start = cb_request_before_start  # Called before the crawler starts a new request. Default is a null route.
options.callbacks.request_after_finish = cb_request_after_finish  # Called after the crawler finishes a request. Default is a null route.

crawler = Crawler(options)
crawler.start_with(Request(host))
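Example #5 is truncated at the top: it starts inside what appears to be the cb_request_after_finish callback and relies on names (domain, path_scripts, path_other, query) defined earlier. A hypothetical reconstruction of the missing setup, so the .append(...) calls work on first access; everything below is assumed rather than taken from the source.

import json
from collections import defaultdict

domain = "example.com"  # Placeholder; the original value is not shown.

# defaultdict(list) lets path_scripts[path].append(query) work even for
# paths that have not been seen before.
path_scripts = defaultdict(list)
path_other = defaultdict(list)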