def start(self):
    """Start the crawler."""

    crawler = Crawler(self.__options)
    signal.signal(signal.SIGINT, self.__signal_handler)

    startpoint = Request(self.__args.domain)
    crawler.start_with(startpoint)
def start(self):
    """Start the crawler."""

    if self.__set_angular_version():
        crawler = Crawler(self.__options)
        signal.signal(signal.SIGINT, self.__signal_handler)

        startpoint = Request(self.__args.domain)
        crawler.start_with(startpoint)

    # Exit the process with the correct status code
    sys.exit(not self.__vulnerable_items)
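Both `start()` methods above hand SIGINT off to `self.__signal_handler`, which is not shown in these excerpts. A minimal sketch of such a handler, assuming the standard `signal` module handler signature (the body is illustrative, not the project's actual implementation):

def __signal_handler(self, signum, frame):
    """Handle Ctrl+C: report the interrupt and exit with a non-zero status."""
    print("Received SIGINT, exiting...")
    sys.exit(1)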
def test_crawl_website(self):
    """Crawl the website in `test/` and check if the count is correct."""

    if not self.travis:
        print("\n\nPlease note that the 'TestSite' unit test did not run.")
        print("It will only run in Travis since it needs a webserver.")
        return

    options = Options()

    crawler = Crawler(options)
    crawler.start_with(Request("http://localhost/"))

    self.assertEqual(crawler.queue.count_total, 16)
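nyawc callbacks are assigned as plain attributes on `options.callbacks`. If this test needed a `crawler_after_finish` hook, it could be wired as in the sketch below; the callback body is an assumption for illustration (nyawc passes the finished queue, which exposes the same `count_total` counter the test asserts on):

def cb_crawler_after_finish(queue):
    # Invoked once crawling completes; `queue` holds all processed items.
    print("Crawling finished, {} items in total.".format(queue.count_total))

options.callbacks.crawler_after_finish = cb_crawler_after_finish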
options.identity.proxies = {
    # 'http': 'http://*****:*****@host:port',
    # 'https': 'https://*****:*****@host:port',

    # SOCKS
    # 'http': 'socks5://user:pass@host:port',
    # 'https': 'socks5://user:pass@host:port'
}

options.identity.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})

# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.

# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
options.routing.routes = [ # The regular expressions that represent routes that should not be crawled more often than the minimum threshold. Default is an empty array.
    r"^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
]

# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
options.misc.debug = False # If debug is enabled, extra information will be logged to the console. Default is False.
options.misc.verify_ssl_certificates = True # If verification is enabled, all SSL certificates will be checked for validity. Default is True.
options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or a directory containing certificates of trusted CAs. Default is None.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
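This excerpt starts below the import section of the full script; the nyawc modules it relies on are imported with the package's documented module paths:

from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request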
        path_scripts[path].append(query)

        # Json format was not valid
        with open(domain + "_ServerScripts_Links.json", "w") as output_file:
            output_file.write(json.dumps(path_scripts))

        print(" ServerScripts > {}".format(queue_item.request.url))
    else:
        if "?" in queue_item.request.url:
            path = queue_item.request.url[:queue_item.request.url.find("?")]
            query = queue_item.request.url[queue_item.request.url.find("?"):]
        else:
            path = queue_item.request.url
            query = "" # No query string; avoid reusing a stale value from a previous iteration.

        path_other[path].append(query)

        with open(domain + "_Others_Links.json", "w") as output_file:
            output_file.write(json.dumps(path_other))

        print(" Others > {}".format(queue_item.request.url))

    return CrawlerActions.DO_CONTINUE_CRAWLING

options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.

crawler = Crawler(options)
crawler.start_with(Request(host))
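The fragment above only shows the tail of `cb_request_after_finish`; the other three callbacks it assigns are not included in the excerpt. A sketch of how they could be declared, assuming the callback signatures described in the nyawc documentation (the bodies are placeholders, not the original code):

def cb_crawler_before_start():
    # Takes no arguments; runs once before the first request is made.
    print("Crawler started.")

def cb_crawler_after_finish(queue):
    # Receives the finished queue with all processed items.
    print("Crawler finished, {} items crawled.".format(queue.count_total))

def cb_request_before_start(queue, queue_item):
    # Return CrawlerActions.DO_SKIP_TO_NEXT or DO_STOP_CRAWLING to steer the crawl.
    return CrawlerActions.DO_CONTINUE_CRAWLING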