Пример #1
0
 def crawl(self):
     """Crawl pages starting at ``self.start_url`` until no unvisited URL is left.

     For every page that downloads successfully the method:

     * records it in ``urlmanager`` with status 1 (presumably "visited" --
       confirm against ``urlmanager.update_url``),
     * downloads each picture found by ``parser.get_pic_link`` and saves a
       description that points back at the page,
     * registers the page's "next" link, if any, with ``urlmanager``.

     The next page is then picked from ``urlmanager.get_non_visited_urls()``.
     The traversal is an explicit loop instead of recursion, so a long chain
     of pages cannot hit Python's maximum recursion depth, and the list of
     unvisited URLs is re-read after every page so the loop ends once nothing
     is left to crawl.

     Side effect: ``self.start_url`` is overwritten with each URL as it is
     crawled.

     Returns:
         1 if the initial ``self.start_url`` page could not be downloaded,
         otherwise 0.
     """
     # URLs whose download failed during this run. They are still reported as
     # non-visited by the URL manager, so they must be skipped explicitly or
     # the loop would retry them forever.
     failed_urls = set()
     current_url = self.start_url
     is_first_page = True
     while True:
         self.start_url = current_url
         print("start crawling url: %s" % current_url)
         raw_html_data = downloader.read_page(url=current_url)
         if raw_html_data:
             urlmanager.update_url(url=current_url, status=1)
             pic_links = parser.get_pic_link(raw_html_data)
             next_link = parser.get_next_link(raw_html_data)
             desc = "查看描述请前往:%s" % current_url
             for pic_link in pic_links:
                 print("get picture link: %s" % pic_link)
                 self.PicMgrObj.download_picture(pic_link)
                 file_name, _ = self.PicMgrObj.extract_pic_name(pic_link)
                 self.PicMgrObj.save_desc(file_name=file_name, desc=desc)
             # Only register a real link; an empty value would otherwise be
             # queued as if it were a page to crawl.
             if next_link:
                 urlmanager.update_url(url=next_link)
         elif is_first_page:
             # Keep the original contract: a failing start page aborts the crawl.
             return 1
         else:
             failed_urls.add(current_url)
         is_first_page = False

         # Re-read the queue each time: the page just handled is now marked,
         # and its next link (if any) has been added.
         remaining = [
             url for url in (urlmanager.get_non_visited_urls() or [])
             if url not in failed_urls
         ]
         if not remaining:
             return 0
         current_url = remaining[0]
Пример #2
0
 def crawl(self):
     """Crawl pages starting at ``self.start_url`` until no unvisited URL is left.

     For every page that downloads successfully the method:

     * records it in ``urlmanager`` with status 1 (presumably "visited" --
       confirm against ``urlmanager.update_url``),
     * downloads each picture found by ``parser.get_pic_link`` and saves a
       description that points back at the page,
     * registers the page's "next" link, if any, with ``urlmanager``.

     The next page is then picked from ``urlmanager.get_non_visited_urls()``.
     The traversal is an explicit loop instead of recursion, so a long chain
     of pages cannot hit Python's maximum recursion depth, and the list of
     unvisited URLs is re-read after every page so the loop ends once nothing
     is left to crawl.

     Side effect: ``self.start_url`` is overwritten with each URL as it is
     crawled.

     Returns:
         1 if the initial ``self.start_url`` page could not be downloaded,
         otherwise 0.
     """
     # URLs whose download failed during this run. They are still reported as
     # non-visited by the URL manager, so they must be skipped explicitly or
     # the loop would retry them forever.
     failed_urls = set()
     current_url = self.start_url
     is_first_page = True
     while True:
         self.start_url = current_url
         print("start crawling url: %s" % current_url)
         raw_html_data = downloader.read_page(url=current_url)
         if raw_html_data:
             urlmanager.update_url(url=current_url, status=1)
             pic_links = parser.get_pic_link(raw_html_data)
             next_link = parser.get_next_link(raw_html_data)
             desc = "查看描述请前往:%s" % current_url
             for pic_link in pic_links:
                 print("get picture link: %s" % pic_link)
                 self.PicMgrObj.download_picture(pic_link)
                 file_name, _ = self.PicMgrObj.extract_pic_name(pic_link)
                 self.PicMgrObj.save_desc(file_name=file_name, desc=desc)
             # Only register a real link; an empty value would otherwise be
             # queued as if it were a page to crawl.
             if next_link:
                 urlmanager.update_url(url=next_link)
         elif is_first_page:
             # Keep the original contract: a failing start page aborts the crawl.
             return 1
         else:
             failed_urls.add(current_url)
         is_first_page = False

         # Re-read the queue each time: the page just handled is now marked,
         # and its next link (if any) has been added.
         remaining = [
             url for url in (urlmanager.get_non_visited_urls() or [])
             if url not in failed_urls
         ]
         if not remaining:
             return 0
         current_url = remaining[0]
Пример #3
0
 def test_update_url(self):
     """Call ``update_url`` with a sample thread URL and status 0, then print the result.

     This is a manual smoke check: nothing is asserted, the returned value is
     only written to stdout for inspection.
     """
     sample_url = "http://bbs.fudan.edu.cn/bbs/tcon?new=1&bid=120&f=3046325721854116981"
     print(update_url(sample_url, 0))