def start(self):
    # Seed the queue with the configured start URLs.
    for url in CRAWLER_INITIAL_URLS:
        self.enqueue(resource_from_url(url))
    # Crawl until the queue drains or a crawl limit trips.
    while not self.q.empty() and not self.limits_reached():
        resource = self.dequeue()
        for link in download_and_get_links(resource):
            self.visit(link)
def visit(self, url_or_resource):
    # Accept either a Resource or a raw URL string.
    if isinstance(url_or_resource, Resource):
        resource = url_or_resource
    else:
        resource = resource_from_url(url_or_resource)
    # Check for None and duplicates before logging, so an unparseable
    # URL (resource is None) cannot raise on resource.url.
    if resource is None or self.is_seen(resource):
        return
    logging.info('visiting {}'.format(resource.url))
    self.enqueue(resource)
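These two methods lean on plumbing the surrounding class is assumed to provide. Below is a minimal sketch of that plumbing so the excerpt can run end to end: the class name Crawler, the Resource dataclass, the max_visits limit, and the stubbed download_and_get_links are illustrative assumptions, not the original implementation.

import logging
import queue
from dataclasses import dataclass
from urllib.parse import urlparse

# Placeholder seed list; the original presumably configures this elsewhere.
CRAWLER_INITIAL_URLS = ['https://example.com']

@dataclass(frozen=True)
class Resource:
    url: str

def resource_from_url(url):
    # Return a Resource for an http(s) URL, or None for anything else.
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https') or not parsed.netloc:
        return None
    return Resource(url=url)

def download_and_get_links(resource):
    # Stub: the real version would fetch resource.url and return the
    # outgoing links extracted from the response body.
    return []

class Crawler:
    def __init__(self, max_visits=100):
        self.q = queue.Queue()
        self.seen = set()
        self.visit_count = 0
        self.max_visits = max_visits

    def enqueue(self, resource):
        # Mark as seen on enqueue so duplicates are filtered early.
        self.seen.add(resource.url)
        self.q.put(resource)

    def dequeue(self):
        self.visit_count += 1
        return self.q.get()

    def is_seen(self, resource):
        return resource.url in self.seen

    def limits_reached(self):
        return self.visit_count >= self.max_visits

With start and visit attached to this class, Crawler().start() would walk the seed URLs breadth-first until the queue drains or max_visits is reached.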