def __init__(self, options):
    """Constructs a Crawler instance.

    Args:
        options (obj): The options to use for the current crawling runtime.

    """

    self.__options = options
    self.__queue = Queue(self.__options)
def test_hash_is_always_the_same(self):
    """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""

    options = Options()
    queue = Queue(options)

    for index in range(0, 100):
        request = Request("https://example.ltd?1=1#2=2")
        HTTPRequestHelper.patch_with_options(request, options)
        request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd')
        queue.add_request(request)

    self.assertEqual(queue.count_total, 1)
def __init__(self, options):
    """Constructs a Crawler instance.

    Args:
        options (obj): The options to use for the current crawling runtime.

    """

    self.__options = options
    self.queue = Queue(self.__options)
    self.__stopping = False
    self.__stopped = False
    self.__lock = threading.Lock()
def __init__(self, options):
    """Constructs a Crawler instance.

    Args:
        options (:class:`nyawc.Options`): The options to use for the current crawling runtime.

    """

    signal.signal(signal.SIGINT, self.__signal_handler)

    self.queue = Queue(options)

    self.__options = options
    self.__should_stop = False
    self.__stopping = False
    self.__stopped = False
    self.__threads = {}
    self.__lock = threading.Lock()
def test_hash_option_subdomain_must_not_match(self):
    """Ensure different subdomains are treated as one queue item if subdomains must match is False."""

    options = Options()
    options.scope.subdomain_must_match = False

    queue = Queue(options)
    queue.add_request(Request("https://www.example.ltd"))
    queue.add_request(Request("https://webmail.example.ltd"))
    queue.add_request(Request("https://subdomain.example.ltd"))

    self.assertEqual(queue.count_total, 1)
def test_hash_option_protocol_must_not_match(self):
    """Ensure different protocols are treated as one queue item if protocols must match is False."""

    options = Options()
    options.scope.protocol_must_match = False

    queue = Queue(options)
    queue.add_request(Request("https://example.ltd"))
    queue.add_request(Request("http://example.ltd"))
    queue.add_request(Request("ftp://example.ltd"))

    self.assertEqual(queue.count_total, 1)
def test_hash_different_encoded_and_decoded_values(self):
    """Ensure encoded and decoded values have a different hash."""

    queue = Queue(Options())
    queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
    queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))

    self.assertEqual(queue.count_total, 2)
def test_hash_different_query_order(self):
    """Ensure query parameters in different orders are treated as one queue item."""

    queue = Queue(Options())
    queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
    queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
    queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))

    self.assertEqual(queue.count_total, 1)
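# Illustrative counterpart to the tests above (a sketch, not part of the original
# test suite). The encoded/decoded test implies that query *values* are part of
# the request hash, so two URLs that differ only in a value should stay separate
# queue items. It reuses only the Options/Queue/Request API already used above.
def test_hash_different_query_values(self):
    """Sketch: ensure different query values are treated as separate queue items."""

    queue = Queue(Options())
    queue.add_request(Request("https://www.example.ltd?a=1"))
    queue.add_request(Request("https://www.example.ltd?a=2"))

    self.assertEqual(queue.count_total, 2)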
class Crawler(object):
    """The main Crawler class which handles the crawling recursion, queue and processes.

    Attributes:
        queue (:class:`nyawc.Queue`): The request/response pair queue containing everything to crawl.
        __options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
        __should_stop (bool): If the crawler should stop the crawling process.
        __stopping (bool): If the crawler is stopping the crawling process.
        __stopped (bool): If the crawler finished stopping the crawling process.
        __threads (obj): All currently running threads, as queue item hash => :class:`nyawc.CrawlerThread`.
        __lock (obj): The callback lock to prevent race conditions.

    """

    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (:class:`nyawc.Options`): The options to use for the current crawling runtime.

        """

        signal.signal(signal.SIGINT, self.__signal_handler)

        self.queue = Queue(options)

        self.__options = options
        self.__should_stop = False
        self.__stopping = False
        self.__stopped = False
        self.__threads = {}
        self.__lock = threading.Lock()

    def __signal_handler(self, signum, frame):
        """On sigint (e.g. CTRL+C) stop the crawler.

        Args:
            signum (int): The signal number.
            frame (obj): The current stack frame.

        """

        self.__crawler_stop()

    def start_with(self, request):
        """Start the crawler using the given request.

        Args:
            request (:class:`nyawc.http.Request`): The startpoint for the crawler.

        """

        HTTPRequestHelper.patch_with_options(request, self.__options)
        self.queue.add_request(request)

        self.__crawler_start()

    def __spawn_new_requests(self):
        """Spawn new requests until the max threads option value is reached.

        Note:
            If no new requests were spawned and there are no requests in progress
            the crawler will stop crawling.

        """

        in_progress_count = len(self.queue.get_all(QueueItem.STATUS_IN_PROGRESS))

        while in_progress_count < self.__options.performance.max_threads:
            if self.__spawn_new_request():
                in_progress_count += 1
            else:
                break

        if in_progress_count == 0:
            self.__crawler_stop()

    def __spawn_new_request(self):
        """Spawn the first queued request if there is one available.

        Returns:
            bool: True if a new request was spawned, False otherwise.

        """

        first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
        if first_in_line is None:
            return False

        self.__request_start(first_in_line)
        return True

    def __wait_for_current_threads(self):
        """Wait until all the current threads are finished."""

        for thread in list(self.__threads.values()):
            thread.join()

    def __crawler_start(self):
        """Spawn the first X queued requests, where X is the max threads option.

        Note:
            The main thread will sleep until the crawler is finished. This enables
            quitting the application using sigints (see http://stackoverflow.com/a/11816038/2491049).

        """

        try:
            self.__options.callbacks.crawler_before_start()
        except Exception as e:
            print(e)
            print(traceback.format_exc())

        self.__spawn_new_requests()

        while not self.__stopped:
            if self.__should_stop:
                self.__crawler_stop()

            time.sleep(1)

    def __crawler_stop(self):
        """Mark the crawler as stopped.

        Note:
            If :attr:`__stopped` is True, the main thread will be stopped. Every piece of
            code that gets executed after :attr:`__stopped` is True could cause Thread
            exceptions and/or race conditions.

        """

        if self.__stopping:
            return

        self.__stopping = True

        self.__wait_for_current_threads()

        self.queue.move_bulk(
            [QueueItem.STATUS_QUEUED, QueueItem.STATUS_IN_PROGRESS],
            QueueItem.STATUS_CANCELLED)

        self.__crawler_finish()

        self.__stopped = True

    def __crawler_finish(self):
        """Called when the crawler is finished because there are no queued requests left or it was stopped."""

        try:
            self.__options.callbacks.crawler_after_finish(self.queue)
        except Exception as e:
            print(e)
            print(traceback.format_exc())

    def __request_start(self, queue_item):
        """Execute the request in given queue item.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair to scrape.

        """

        try:
            action = self.__options.callbacks.request_before_start(self.queue, queue_item)
        except Exception as e:
            action = None
            print(e)
            print(traceback.format_exc())

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__should_stop = True

        if action == CrawlerActions.DO_SKIP_TO_NEXT:
            self.queue.move(queue_item, QueueItem.STATUS_FINISHED)
            self.__spawn_new_requests()

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.queue.move(queue_item, QueueItem.STATUS_IN_PROGRESS)
            thread = CrawlerThread(self.__request_finish, self.__lock, self.__options, queue_item)
            self.__threads[queue_item.get_hash()] = thread
            thread.daemon = True
            thread.start()

    def __request_finish(self, queue_item, new_requests, request_failed=False):
        """Called when the crawler finished the given queue item.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair that finished.
            new_requests list(:class:`nyawc.http.Request`): All the requests that were found during this request.
            request_failed (bool): True if the request failed (if it needs to be moved to errored).

        """

        if self.__stopping:
            return

        del self.__threads[queue_item.get_hash()]

        if request_failed:
            new_queue_items = []
            self.queue.move(queue_item, QueueItem.STATUS_ERRORED)
        else:
            new_queue_items = self.__add_scraped_requests_to_queue(queue_item, new_requests)
            self.queue.move(queue_item, QueueItem.STATUS_FINISHED)

        try:
            action = self.__options.callbacks.request_after_finish(self.queue, queue_item, new_queue_items)
        except Exception as e:
            action = None
            print(e)
            print(traceback.format_exc())

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__should_stop = True

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__spawn_new_requests()

    def __add_scraped_requests_to_queue(self, queue_item, scraped_requests):
        """Convert the scraped requests to queue items, return them and also add them to the queue.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair that finished.
            scraped_requests list(:class:`nyawc.http.Request`): All the requests that were found during this request.

        Returns:
            list(:class:`nyawc.QueueItem`): The new queue items.

        """

        new_queue_items = []

        for scraped_request in scraped_requests:
            HTTPRequestHelper.patch_with_options(scraped_request, self.__options, queue_item)

            if not HTTPRequestHelper.complies_with_scope(queue_item, scraped_request, self.__options.scope):
                continue

            if self.queue.has_request(scraped_request):
                continue

            scraped_request.depth = queue_item.request.depth + 1

            if self.__options.scope.max_depth is not None:
                if scraped_request.depth > self.__options.scope.max_depth:
                    continue

            new_queue_item = self.queue.add_request(scraped_request)
            new_queue_items.append(new_queue_item)

        return new_queue_items
class Crawler:
    """The main Crawler class which handles the crawling recursion, queue and processes.

    Attributes:
        __options (obj): The options to use for the current crawling runtime.
        __queue (obj): The request/response pair queue containing everything to crawl.
        __stopping (bool): If the crawler is stopping the crawling process.
        __stopped (bool): If the crawler finished stopping the crawling process.
        __lock (obj): The callback lock to prevent race conditions.

    """

    __options = None
    __queue = None
    __stopping = False
    __stopped = False
    __lock = threading.Lock()

    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (obj): The options to use for the current crawling runtime.

        """

        self.__options = options
        self.__queue = Queue(self.__options)

    def start_with(self, request):
        """Start the crawler using the given request.

        Args:
            request (obj): The startpoint for the crawler.

        """

        HTTPRequestHelper.patch_with_options(request, self.__options)
        self.__queue.add_request(request)

        self.__crawler_start()

    def __spawn_new_requests(self):
        """Spawn new requests until the max processes option value is reached.

        Note:
            If no new requests were spawned and there are no requests in progress
            the crawler will stop crawling.

        """

        concurrent_requests_count = self.__queue.count_in_progress
        new_requests_spawned = False

        while concurrent_requests_count < self.__options.performance.max_threads:
            if self.__spawn_new_request():
                new_requests_spawned = True
                concurrent_requests_count += 1
            else:
                break

        if concurrent_requests_count == 0 and not new_requests_spawned and not self.__stopping:
            self.__crawler_stop()

    def __spawn_new_request(self):
        """Spawn the first queued request if there is one available.

        Returns:
            bool: If a new request was spawned.

        """

        first_in_line = self.__queue.get_first(QueueItem.STATUS_QUEUED)
        if first_in_line is None:
            return False

        self.__request_start(first_in_line)
        return True

    def __crawler_start(self):
        """Spawn the first X queued requests, where X is the max threads option.

        Note:
            The main thread will sleep until the crawler stopped or on keyboard
            interruption. This prevents race conditions where sub threads will
            callback to the main thread while the main thread is already finished.

        """

        self.__options.callbacks.crawler_before_start()

        try:
            self.__spawn_new_requests()

            while not self.__stopped:
                time.sleep(1)
        except (KeyboardInterrupt, SystemExit):
            pass

    def __crawler_stop(self, force_quit=False):
        """Mark the crawler as stopped.

        Note:
            If self.__stopped is True, the main thread will be stopped. Every piece of
            code that gets executed after self.__stopped is True could cause Thread
            exceptions and/or race conditions.

        Args:
            force_quit (bool): Also cancel any ongoing requests.

        """

        self.__stopping = True

        for status in [QueueItem.STATUS_QUEUED, QueueItem.STATUS_IN_PROGRESS]:
            for queue_item in self.__queue.get_all(status).values():
                self.__queue.move(queue_item, QueueItem.STATUS_CANCELLED)

        self.__crawler_finish()
        self.__stopped = True

    def __crawler_finish(self):
        """Called when the crawler is finished because there are no queued requests left or it was stopped."""

        self.__options.callbacks.crawler_after_finish(self.__queue)

    def __request_start(self, queue_item):
        """Execute the request in given queue item.

        Args:
            queue_item (obj): The request/response pair to scrape.

        """

        action = self.__options.callbacks.request_before_start(self.__queue, queue_item)

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__crawler_stop(True)
            return

        if action == CrawlerActions.DO_SKIP_TO_NEXT:
            self.__queue.move(queue_item, QueueItem.STATUS_FINISHED)
            return

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__queue.move(queue_item, QueueItem.STATUS_IN_PROGRESS)
            thread = CrawlerThread(self.__request_finish, self.__lock, self.__options, queue_item)
            thread.daemon = True
            thread.start()

    def __request_finish(self, queue_item, new_requests):
        """Called when the crawler finished the given queued item.

        Args:
            queue_item (obj): The request/response pair that finished.
            new_requests list(obj): All the requests that were found during this request.

        """

        new_queue_items = []
        action = None

        if queue_item.status not in [QueueItem.STATUS_ERRORED, QueueItem.STATUS_CANCELLED]:
            for new_request in new_requests:
                HTTPRequestHelper.patch_with_options(new_request, self.__options, queue_item)

                if not HTTPRequestHelper.complies_with_scope(queue_item, new_request, self.__options.scope):
                    continue

                if self.__queue.has_request(new_request):
                    continue

                new_request.depth = queue_item.request.depth + 1

                if self.__options.scope.max_depth is not None:
                    if new_request.depth > self.__options.scope.max_depth:
                        continue

                new_queue_item = self.__queue.add_request(new_request)
                new_queue_items.append(new_queue_item)

            self.__queue.move(queue_item, QueueItem.STATUS_FINISHED)

            action = self.__options.callbacks.request_after_finish(self.__queue, queue_item, new_queue_items)

        if self.__stopping:
            return

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__crawler_stop()
            return

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__spawn_new_requests()
            return