# NOTE(review): fragment — the statement below is the tail of an enclosing
# method (presumably add_url; its `def` line is outside this chunk). It
# enqueues a not-yet-seen URL.  Indentation reconstructed; tokens unchanged.
        self.q.put_nowait(url)

    async def crawl(self):
        """Spawn worker tasks, wait until the URL queue is drained, then
        cancel the workers.  Wall-clock start/end are stored in t0/t1.
        """
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        # join() returns once every enqueued URL has been task_done()'d.
        await self.q.join()
        self.t1 = time.time()
        # Workers loop forever; cancel them once the queue is exhausted.
        for w in workers:
            w.cancel()


if __name__ == '__main__':
    logger = create_logging('链家URL', 'logurl.log')
    write_header_csv()
    # Cities skipped because they return no listings ("没有信息"):
    # Qionghai (琼海), Suzhou (苏州), Shijiazhuang (石家庄), Shenyang (沈阳),
    # Sanya (三亚), Wenchang (文昌), Wanning (万宁), Haikou (海口),
    # Xi'an (西安), Lingshui (陵水), Langfang-Yanjiao (廊坊燕郊)
    # Lianjia rental-listing index pages; '{}' is filled with a page number.
    # NOTE(review): fragment — this list literal is cut off at the end of
    # this chunk; its closing bracket lies outside this view.
    URLs = [
        'http://bj.lianjia.com/zufang/pg{}/',
        'http://nj.lianjia.com/zufang/pg{}/',
# NOTE(review): fragment — the statement below is the tail of the worker
# coroutine (its `def` line precedes this chunk); it marks the current queue
# item as finished.  Indentation reconstructed; tokens unchanged.
        self.q.task_done()

    def add_url(self, url):
        """Enqueue *url* for crawling unless it was already seen."""
        if url not in self.seen_urls:
            self.seen_urls.add(url)
            self.q.put_nowait(url)

    async def crawl(self):
        """Spawn worker tasks, wait until the URL queue is drained, then
        cancel the workers.  Wall-clock start/end are stored in t0/t1.
        """
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        # join() returns once every enqueued URL has been task_done()'d.
        await self.q.join()
        self.t1 = time.time()
        # Workers loop forever; cancel them once the queue is exhausted.
        for w in workers:
            w.cancel()


if __name__ == '__main__':
    logger = create_logging('豆瓣list', 'loggerlist.log')
    # Seed URL: Douban book-tags index page.
    URL = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    loop = asyncio.get_event_loop()
    crawler = Crawler(max_tasks=10)
    crawler.add_url(URL)
    loop.run_until_complete(crawler.crawl())
    print('Finished in {:.3f} seconds'.format(crawler.t1 - crawler.t0))
    # '一共抓取网页--->' means "total pages crawled --->" (runtime string
    # kept byte-identical).
    print('一共抓取网页--->', len(crawler.seen_urls))
    crawler.close()
    loop.close()
await self.fetch(msg.body) msg.ack() self.queue.task_done() except asyncio.CancelledError: pass except Exception as e: self.queue.task_done() raise def run(self): reconnect_task = self.loop.create_task(reconnector(self.queue, 'tags')) process_task = [ self.loop.create_task(self.process_msgs(self.queue)) for _ in range(self.max_tasks) ] try: self.loop.run_forever() except KeyboardInterrupt: process_task.cancel() reconnect_task.cancel() self.loop.run_until_complete(process_task) self.loop.run_until_complete(reconnect_task) self.loop.close() if __name__ == "__main__": crawl = Crawler(max_tasks=1) logger = create_logging('豆瓣tag', 'loggertag.log') crawl.run()
msg = await queue.get() await self.fetch(msg.body) msg.ack() # asyncio.sleep(5) except asyncio.CancelledError: pass except Exception as e: self.q.task_done() def run(self): reconnect_task = self.loop.create_task(reconnector(self.queue)) process_task = [ self.loop.create_task(self.process_msgs(self.queue)) for _ in range(self.max_tasks) ] try: self.loop.run_forever() except KeyboardInterrupt: process_task.cancel() reconnect_task.cancel() self.loop.run_until_complete(process_task) self.loop.run_until_complete(reconnect_task) self.loop.close() if __name__ == "__main__": logger = create_logging('链家ITEM', 'logitem.log') crawl = CrawlItem(max_tasks=10) crawl.run()
msg.ack() self.queue.task_done() except asyncio.CancelledError: pass except Exception as e: self.queue.task_done() raise def run(self): reconnect_task = self.loop.create_task(reconnector( self.queue, 'items')) process_task = [ self.loop.create_task(self.process_msgs(self.queue)) for _ in range(self.max_tasks) ] try: self.loop.run_forever() except KeyboardInterrupt: process_task.cancel() reconnect_task.cancel() self.loop.run_until_complete(process_task) self.loop.run_until_complete(reconnect_task) self.loop.close() if __name__ == "__main__": crawl = Crawler(max_tasks=10) logger = create_logging('豆瓣item', 'loggeritem.log') crawl.run()