def parse(self, url, html): """ 解析html页面 """ parser = NormalParser(url, html, self.job) item = parser.parse() # 分片写入 item['ram'] = random.random() new_urls = item.get('links') # 抓去的新链接判重后加入队列 for new_url in new_urls: if not self.duplicate_filter.exists(new_url): self.spider_queue.push({ "url": new_url, "life": 5 }) # url原始解析结果持久化 item = self.storage_pipline.insert(self.config.get("page_table"), item) self.processer_queue.push(item.get('_id')) self._update_status(True) log("[SUCCESS] %s." % url)
def parse(self, url, html): parser = NormalParser(url, html, self.job) return parser.parse()
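# Illustrative sketch, not part of the original module: run() below consumes the
# dict returned by parse(), reading item['links'], writing item['ram'], and
# expecting an '_id' key after the storage insert. A minimal stand-in parser that
# honors that contract could look like this; the name StubParser and its fields
# are assumptions for illustration, not the real NormalParser.
class StubParser(object):
    def __init__(self, url, html, job):
        self.url = url
        self.html = html
        self.job = job

    def parse(self):
        # return at least the keys run() relies on
        return {"url": self.url, "links": []}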
def run(self, job): """ 执行方法 :param job: 任务信息 :return: """ # 注册爬虫 crawler = PhantomCrawler() parser = NormalParser(job) if len(self.spider_queue) > 0: task = eval(self.spider_queue.pop()) self.task = task # 若该任务失败次数过多,不再处理该任务 if task['life'] == 0: return response = crawler.fetch(task['url']) # success, result = crawler.fetch(task['url']) # 若爬虫成功爬取 if response['status_code'] == 200: try: item = parser.parse(task['url'], response['content']) # 分片写入 item['ram'] = random.random() new_urls = item['links'] # 抓去的新链接判重后加入队列 for new_url in new_urls: if not self.duplicate_filter.exists(new_url): self.spider_queue.push({ "url": new_url, "life": 5 }) # url原始解析结果持久化 item = self.storage_pipline.insert(self.config.get("page_table"), item) self.processer_queue.push(item.get('_id')) # 更新任务状态 self._update_status(True) log("[SUCCESS] %s." % task['url']) except Exception, e: # 将失败的url再次放入队列 self.spider_queue.push({ "url": task['url'], "life": task['life'] - 1 }) log("[FAILED] %s %s" % (task['url'], e)) else: # 更新任务状态 self._update_status(False) # 将失败的url再次放入队列 self.spider_queue.push({ "url": task['url'], "life": task['life'] - 1 }) log("[FAILED] %s %s" % (task['url'], response['status_code']))