class CrawlerDB(Thread):
    """Multi-threaded crawler that keeps its URL queue and pages in a database.

    The crawler itself is a thread: ``run()`` starts ``no_workers`` worker
    threads, polls until ``_are_jobs_done()`` reports completion, joins the
    workers and finally notifies every listener with a ``"done"`` message.
    Every database access goes through ``self.delegate`` while holding
    ``self.condition`` (a re-entrant lock).
    """

    # Seconds an idle worker waits before polling again (paused, link budget
    # reached, or nothing left to fetch) instead of spinning on the database.
    _IDLE_SLEEP_SECONDS = 0.1

    def __init__(self, delegate, initialLink=None, max_links=0, no_workers=10, id=None):
        """Create the crawler and, when given, queue the first URL.

        Args:
            delegate: Accepted for interface compatibility; see the note on
                ``self.delegate`` below.
            initialLink: First URL to crawl; stored as an unvisited internal URL.
            max_links: Maximum number of resources to store; ``0`` disables
                the limit.
            no_workers: Number of worker threads started by ``run()``.
            id: Crawl id attached to every Url/Resource created by this
                crawler. When omitted, a fresh UUID string is generated for
                each instance.
        """
        Thread.__init__(self)
        self.noOfWorkers = no_workers
        self.workers = []
        self.running = True
        self.paused = False
        self.condition = RLock()
        # NOTE(review): the ``delegate`` argument is not used; a new Delegate is
        # always created here. Presumably intentional (own DB session) — confirm.
        self.delegate = Delegate()
        # Callbacks that receive progress messages (see ``notify``).
        self.listeners = []
        # The default must be computed per instance: a ``uuid4()`` call in the
        # signature would run only once and be shared by every crawler.
        self.id = str(uuid.uuid4()) if id is None else id
        self.initialLink = initialLink
        if initialLink is not None:
            self.add_initial_url(initialLink)
        self.max_links = max_links
        # Always define the attribute, even when the pattern cannot be built.
        self.domain_regex = None
        try:
            self.domain_regex = re.compile(get_domain(initialLink))
        except Exception as ex:
            logger.error("Exception {}".format(ex))

    def add_initial_url(self, address):
        """Store ``address`` as an internal, not-yet-visited URL of this crawl."""
        logger.info("Add initial URL")
        with self.condition:
            url = Url(url=address,
                      absolute_url=address,
                      type=Url.TYPE_INTERNAL,
                      crawl_id=self.id,
                      job_status=Url.JOB_STATUS_NOT_VISITED)
            self.delegate.url_create(url)

    def no_unvisited_urls(self):
        """Return the delegate's count of unvisited URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_unvisited(self.id)

    def no_pending_urls(self):
        """Return the delegate's count of pending URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_pending(self.id)

    def all_unvisited_urls(self):
        """Return all unvisited URLs of this crawl as provided by the delegate."""
        with self.condition:
            return self.delegate.url_get_all_unvisited(self.id)

    def no_visited_urls(self):
        """Return the delegate's count of visited URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_visited(self.id)

    def no_visited_resources(self):
        """Return the delegate's count of visited resources for this crawl."""
        with self.condition:
            return self.delegate.resource_count_visited(self.id)

    def no_external_urls(self):
        """Return the delegate's count of external URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_external(self.id)

    def next_unvisited_link_id(self):
        """Claim the next unvisited URL and return its id.

        The URL is flagged as in progress before its id is returned.
        Returns ``-1`` when the delegate has no unvisited URL.
        """
        with self.condition:
            url = self.delegate.url_get_first_unvisited(self.id)
            if url is None:
                return -1
            url.job_status = Url.JOB_STATUS_IN_PROGRESS
            self.delegate.url_update(url)
            return url.id

    def mark_url_as_visited(self, url_id):
        """Set the job status of the URL with ``url_id`` to visited."""
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.job_status = Url.JOB_STATUS_VISITED
            self.delegate.url_update(url)

    def _type_links(self, links):
        """Tag each link dict in place with ``'internal'`` or ``'external'``."""
        for link in links:
            if is_internal(get_domain(self.initialLink), link['absolute']):
                link['type'] = 'internal'
            else:
                link['type'] = 'external'

    def _get_links(self, link_id):
        """Fetch the page behind ``link_id`` and return ``(page, links)``.

        The discovered links are classified via ``_type_links`` before
        being returned.
        """
        with self.condition:
            link = self.delegate.url_get_by_id(link_id)
            # NOTE(review): the fetch runs while the crawler lock is held,
            # which serialises downloads across workers — confirm this is
            # intended.
            (page, links) = get_links(link.absolute_url)
            self._type_links(links)
            return page, links

    def link2url(self, link):
        """Build a ``Url`` for this crawl from a link dict.

        Only the keys present in ``link`` (``href``, ``absolute``, ``type``,
        ``content``) are copied.
        """
        url = Url(crawl_id=self.id)
        if 'href' in link:
            url.url = link['href']
        if 'absolute' in link:
            url.absolute_url = link['absolute']
        if 'type' in link:
            url.type = link['type']
        if 'content' in link:
            content = str(link['content'])
            url.raw_content = content
            # TODO: Parse the raw content and keep only the text, without
            # HTML tags or other markup.
            url.text = content
        return url

    def page2resource(self, page):
        """Build a ``Resource`` for this crawl from a page dict.

        Only the keys present in ``page`` (``url``, ``content``, ``elapsed``)
        are copied.
        """
        resource = Resource(crawl_id=self.id)
        if 'url' in page:
            resource.absolute_url = page['url']
        if 'content' in page:
            resource.content = page['content']
        if 'elapsed' in page:
            resource.elapsed = page['elapsed']
        return resource

    def add_links(self, links, src_resource_id=None, status_code=200):
        """Add a bunch of URLs using the resource id as source (page where found).

        When the destination of a link is already stored as a resource of the
        same crawl, the new URL is created as visited and connected to that
        resource, with ``status_code`` recorded on it.
        """
        with self.condition:
            for link in links:
                url = self.link2url(link)
                if src_resource_id is not None:
                    url.src_resource_id = src_resource_id
                    # Check if the destination resource exists and, if it
                    # does, mark the URL as visited. Best effort: a failed
                    # lookup must not prevent the URL from being stored.
                    try:
                        src_resource = self.delegate.resource_get_by_id(
                            src_resource_id)
                        dest_resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                            url.absolute_url, src_resource.crawl_id)
                        if dest_resource is not None:
                            url.job_status = Url.JOB_STATUS_VISITED
                            url.dst_resource_id = dest_resource.id
                            url.status_code = status_code
                    except Exception as e:
                        logger.warning("Exception {}".format(e))
                self.delegate.url_create(url)

    def add_resource(self, page):
        """Store ``page`` as a resource unless the delegate reports it present."""
        with self.condition:
            # NOTE(review): other callers in this code base pass
            # (absolute_url, crawl_id) positionally to resource_is_present();
            # verify this keyword-only call against the Delegate signature.
            if not self.delegate.resource_is_present(crawlId=self.id):
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)

    def connect_url_to_destination(self, url_id, resource_id):
        """Point the URL ``url_id`` at the resource it resolved to."""
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.dst_resource_id = resource_id
            self.delegate.url_update(url)

    def resource_get_by_absolute_url_and_crawl_id(self, address, crawler_id):
        """Return the delegate's lookup result for ``address`` within ``crawler_id``."""
        with self.condition:
            return self.delegate.resource_get_by_absolute_url_and_crawl_id(
                address, crawler_id)

    def resource_create(self, page):
        """Convert ``page`` to a resource and store it.

        Returns:
            The created resource, or ``None`` when conversion or storage
            failed (the failure is logged as a warning).
        """
        with self.condition:
            try:
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)
            except Exception as e:
                logger.warning("%s Exception %s.",
                               currentThread().getName(), e)
                return None
            return resource

    def run(self):
        """Start the workers, wait until the crawl is finished, then report.

        The loop wakes up once a second; while paused it only sleeps. After
        the workers are joined a final ``"done"`` message is sent to the
        listeners.
        """
        name = currentThread().getName()

        # Initialize workers
        for i in range(self.noOfWorkers):
            self.workers.append(
                Thread(target=self.workerJob,
                       kwargs={"crawlId": self.id},
                       name="Thread-{}".format(i)))

        # Start workers
        self._start_all_workers()

        while self.running:
            logger.debug("[%s] Crawler thread cycle started.", name)
            if self.paused:
                logger.debug("[%s] Crawler paused.", name)
                # Sleep here as well, otherwise a paused crawler spins.
                time.sleep(1)
                continue

            logger.debug("[%s] Crawler check if jobs are done.", name)
            if self._are_jobs_done():
                logger.debug("Crawler is shutting down")
                self.setRunning(False)
                break
            logger.debug("[%s] Crawler's jos are NOT done.", name)

            logger.debug("[%s] Crawler sleep.", name)
            time.sleep(1)

        # Join them
        self._join_all_workers()

        msg = {
            "status": "done",
            "visited": self.no_visited_urls(),
            "to_visit": self.no_unvisited_urls(),
            "max_links": self.max_links,
            "crawlId": self.id
        }
        self.notify(msg)

    def workerJob(self, crawlId):
        """Worker loop: claim unvisited URLs and process them until stopped.

        Args:
            crawlId: Crawl id reported in the progress messages.
        """
        name = currentThread().getName()
        while self.running:
            logger.debug("[%s] Worker thread cycle started.", name)

            # Nothing to do while paused or once the page budget is used up;
            # wait a little instead of busy-looping.
            if self.paused or self._max_links_reached():
                time.sleep(self._IDLE_SLEEP_SECONDS)
                continue

            # Grab next job
            link_id = self.next_unvisited_link_id()
            logger.debug("[%s] Next link [%d].", name, link_id)
            if link_id == -1:
                # Queue is momentarily empty; other workers may still add links.
                time.sleep(self._IDLE_SLEEP_SECONDS)
                continue

            logger.debug("[%s] Current link : %d", name, link_id)
            try:
                self._process_link(link_id, crawlId)
            except Exception as e:
                # Keep the worker alive; one bad page must not stop the crawl.
                # NOTE(review): the URL stays "in progress" after a failure —
                # confirm whether it should be marked visited or re-queued.
                logger.exception("[%s] Error %s", name, e)
            logger.debug("[%s] cycle ended.", name)
        else:
            logger.debug("[%s] is shutting down.", name)

    def _process_link(self, link_id, crawlId):
        """Fetch one claimed URL, store its page and links, and report progress."""
        name = currentThread().getName()
        page, links = self._get_links(link_id)
        logger.debug("[%s] Discovered [%d] links.", name, len(links))

        with self.condition:
            # Update the link's status code
            url = self.delegate.url_get_by_id(link_id)
            url.status_code = page['status-code']
            self.delegate.url_update(url)

            if page['status-code'] == 200:
                # 1. Add Resource  2. Link URLs to (new | existing) Resources
                resource = self.resource_get_by_absolute_url_and_crawl_id(
                    page['url'], self.id)
                if resource is not None:
                    # Resource already added, only make the end connection
                    self.connect_url_to_destination(link_id, resource.id)
                elif not self._max_links_reached():
                    # Add it only if the page budget is not exhausted
                    resource = self.resource_create(page)
                    if resource is not None:
                        self.connect_url_to_destination(link_id, resource.id)
                        logger.debug(
                            "[%s] Adding links to DB linked to resource [%d]",
                            name, resource.id)
                        self.add_links(links, resource.id, page['status-code'])

            self.mark_url_as_visited(link_id)
            msg = {
                "status": "in_progress",
                "visited": self.no_visited_urls(),
                "to_visit": self.no_unvisited_urls(),
                "max_links": self.max_links,
                "crawlId": crawlId,
                "currentWorker": name
            }
            self.notify(msg)

    def _max_links_reached(self):
        """Return True when a page limit is set and it has been reached."""
        return (self.max_links > 0
                and self.no_visited_resources() >= self.max_links)

    def stop(self):
        """Ask the crawler and its workers to finish their loops."""
        self.setRunning(False)

    def pause(self):
        """Pause the crawler and worker loops."""
        self.paused = True

    def resume(self):
        """Resume after ``pause()``; does nothing when not paused."""
        if self.paused:
            self.paused = False

    def _start_all_workers(self):
        """Start every worker thread."""
        for w in self.workers:
            w.start()

    def _are_jobs_done(self):
        """Return True when no URL is pending or the page limit is reached."""
        # FIXME: If a thread grabs the initial link at this very moment, the
        # counters read below may look as if nothing is left to do and the
        # crawler will initiate shutdown too early.
        no_pending_urls = self.no_pending_urls()
        logger.debug("Crawler: _are_jobs_done(...) : no_pendind_urls = %d ",
                     no_pending_urls)

        if no_pending_urls == 0:
            return True

        # Test if we have reached the max no of pages
        return self._max_links_reached()

    def _join_all_workers(self):
        """Wait for every worker thread to finish."""
        for w in self.workers:
            w.join()

    def setRunning(self, status):
        """Set the flag that keeps the crawler and worker loops running."""
        self.running = status

    def addListener(self, callback):
        """Register ``callback`` to receive messages sent through ``notify``."""
        self.listeners.append(callback)

    def removeListener(self, callback):
        """Unregister ``callback``; raises ``ValueError`` if it is not registered."""
        self.listeners.remove(callback)

    def notify(self, msg):
        """Call every registered listener with ``msg``."""
        for callback in self.listeners:
            callback(msg)
def test_page(self):
    """Exercise the delegate's Resource ("page") API end to end.

    Creates a Site and a Crawl, checks that the new crawl has no resources,
    stores one large Resource, verifies that every lookup helper finds it,
    then checks that deleting all crawls also removes the resource (cascade
    delete) before emptying the tables.
    """
    delegate = XDelegate()

    print("test_page started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # A brand-new crawl has no visited pages and no resources.
    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 0, "No of pages is {}".format(no_pages)

    # Page
    crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(crawl_resources) == 0

    # test resource_get_by_absolute_url_and_crawl_id()
    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        "no such url :p", crawl.id)
    assert r1 is None

    # test resource_is_present()
    present = delegate.resource_is_present('no such url :p', crawl.id)
    assert not present

    # Store a page whose content is a little over 1 MiB.
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "A long content " + "a" * 1024 * 1024
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)
    assert page.id > 0

    # test resource_get_by_id()
    r2 = delegate.resource_get_by_id(page.id)
    assert r2.id == page.id

    # test resource_is_present()
    present = delegate.resource_is_present(page.absolute_url, crawl.id)
    assert present

    pages = delegate.resource_get_all()
    assert len(pages) > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 1, "No of pages is {}".format(no_pages)

    crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(crawl_resources) > 0

    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        page.absolute_url, crawl.id)
    assert r1.id == page.id

    # Test cascade delete: removing the crawls must remove their resources.
    delegate.crawl_delete_all()
    pages = delegate.resource_get_all()
    assert len(pages) == 0, "It should be {} but we found {}".format(
        0, len(pages))

    # Clean up
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()

    print("test_page done")