def consume(self, process_id):
    # Worker loop: pull (image_id, link) pairs off the shared queue and fetch each image.
    while True:
        self.logger.log("Consumer process:" + str(process_id) +
                        " fetch new image from queue")
        if not self.queue.empty():
            image_id, link = self.queue.get()
            self.logger.log("Consumer process:" + str(process_id) +
                            " start crawling " + str(link))
            image = common_utils.page_crawl(link)
            if image is not None:
                self.logger.log(link + " crawled successfully")
                self.adapter.store_image(image_id, image)
            else:
                self.logger.log(link + " failed at crawling")
                self.adapter.update_image_status(
                    image_id, ImageIndexStatus.DOWNLOAD_FAILED)
            self.queue.task_done()
            # Throttle between downloads.
            time.sleep(1)
        else:
            # Nothing queued yet; back off before polling again.
            self.logger.log("Queue empty")
            time.sleep(10)
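# Hypothetical wiring sketch (not from the original source): consume() above is
# written to run as one of several worker processes draining a shared queue that
# a producer fills with (image_id, link) pairs. The crawler object and worker
# count here are assumptions for illustration; the queue is assumed to be a
# multiprocessing.JoinableQueue so that task_done() is available.
import multiprocessing

def start_consumers(crawler, num_workers=4):
    # crawler is assumed to expose consume(process_id) and crawler.queue.
    workers = []
    for pid in range(num_workers):
        p = multiprocessing.Process(target=crawler.consume, args=(pid,))
        p.daemon = True  # workers loop forever; let the parent exit cleanly
        p.start()
        workers.append(p)
    return workers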
def run(self):
    while True:
        count = 0
        try:
            # Fetch the next batch of uncrawled documents from storage.
            for url_hash, url in self.adapter.load_uncrawled_docs(BatchCrawler.MAX_DOCS_NUM):
                count += 1
                self.logger.log("crawling url %s" % url, 2)
                page = common_utils.page_crawl(url)
                if page is None:
                    self.adapter.update_doc_raw_as_crawled_failed(url_hash)
                    continue
                # Normalize the raw page bytes to UTF-8 before persisting.
                if self.encode != "utf-8":
                    page = page.decode(self.encode).encode("utf-8")
                self.adapter.update_doc_raw_with_crawled_page(url_hash, "utf-8", page)
                # Pause between requests to respect the configured interval.
                time.sleep(float(self.request_interval))
            # A short batch means the backlog is exhausted; stop crawling.
            if count < BatchCrawler.MAX_DOCS_NUM:
                break
        except Exception as e:
            self.logger.log("mongo error: %s" % e)
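# Minimal sketch (an assumption, not the original common_utils implementation)
# of a page_crawl helper with the contract both methods above rely on: return
# the raw response body on success, or None on any failure. The use of the
# requests library and the timeout value are illustrative choices.
import requests

def page_crawl(url, timeout=10):
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.content  # raw bytes; callers handle decoding
    except requests.RequestException:
        return None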