示例#1
0
文件: atlas.py 项目: pk13610/spiders
 def _get_metadata(self, arg):
     """Download ``arg['url']`` into a file below ``arg['path']``.

     ``arg`` is a mapping that must provide the keys ``'url'``,
     ``'referer'`` and ``'path'``.  The target file name is built from
     ``CONFIG.name_format % hash(url)`` and its directory is created via
     ``util.asure_path``.  The fetch is attempted up to three times; every
     failure is logged and followed by a 3-second pause.  Nothing is
     downloaded when ``self.event`` is already set.

     Returns ``None``.  Fetch errors are logged, not raised; errors from
     opening the target file still propagate to the caller.
     """
     if self.event.isSet():
         return
     url = arg['url']
     referer = arg['referer']
     path = arg['path']
     fp = os.path.join(path, CONFIG.name_format % (hash(url)))
     util.asure_path(os.path.dirname(fp))
     for _attempt in range(3):
         # Reopen the file on every attempt: 'wb' truncates whatever a failed
         # attempt left behind and keeps the payload byte-exact, and the
         # ``with`` block guarantees the handle is closed on every path.
         # NOTE(review): assumes fetch() has finished writing to the handler
         # by the time it returns -- confirm against http.HttpUtil.
         with open(fp, 'wb') as out:
             handle = http.DownloadStreamHandler(out)
             try:
                 client = http.HttpUtil(proxy=CONFIG.proxy)
                 client.add_header('Referer', referer)
                 client.fetch(url, handle)
                 break
             except Exception as e:
                 LOG.exception(e)
         # Only reached after a failed attempt; back off before retrying.
         time.sleep(3)
示例#2
0
文件: atlas.py 项目: pk13610/spiders
 def get_all(self):
     """Walk the index pages and fetch every item not yet saved.

     Iterates ``CONFIG.deapth`` times.  The first iteration calls
     ``self._get_index_page()`` with no arguments; each later iteration
     passes the iteration number and the first field of the first item of
     the previous page.  Every item whose first field is not already a
     directory entry of ``CONFIG.save_path`` is handed to
     ``self.get_data(self._get_metadata_url(item[1], item[0]))``.

     The walk stops early when ``self.event`` is set, or when the previous
     page produced no items (there is then nothing to derive the next page
     from).  Any exception raised while processing a level is logged and
     the walk moves on to the next level.  Returns ``None``.
     """
     LOG.info("== start all ==")
     curr_items = []
     util.asure_path(CONFIG.save_path)
     # A set gives O(1) membership tests in the per-item check below.
     achived_tasks = set(os.listdir(CONFIG.save_path))
     for depth in xrange(CONFIG.deapth):
         try:
             if self.event.isSet():
                 break
             LOG.info("==> deapth %d", depth)
             if depth == 0:
                 curr_items = self._get_index_page()
             else:
                 if not curr_items:
                     # Without a previous item every remaining level would
                     # fail the same way; stop once with a clear message.
                     LOG.warning(
                         "no items from previous index page, "
                         "stopping at deapth %d", depth)
                     break
                 curr_items = self._get_index_page(depth, curr_items[0][0])
             for item in curr_items:
                 if item[0] not in achived_tasks:
                     self.get_data(self._get_metadata_url(item[1], item[0]))
         except Exception as e:
             LOG.exception(e)
         LOG.info("<== deapth %d", depth)
     LOG.info("== end all ==")