def _get_metadata(self, arg):
    """Download the resource described by ``arg`` into a local file.

    ``arg`` is a mapping that must provide three keys:

    * ``'url'``     -- the address passed to ``fetch``;
    * ``'referer'`` -- the value sent as the ``Referer`` request header;
    * ``'path'``    -- the directory the output file is created in.

    The output file name is ``CONFIG.name_format % hash(url)``.

    Returns ``None`` immediately when ``self.event`` is set.  Otherwise the
    fetch is attempted up to three times with a 3 second pause between
    attempts.  Exceptions raised while fetching are logged, not propagated;
    a missing key in ``arg`` or a failure to open the output file still
    propagates to the caller.
    """
    if self.event.isSet():
        return

    url = arg['url']
    referer = arg['referer']
    path = arg['path']

    fp = os.path.join(path, CONFIG.name_format % (hash(url)))
    util.asure_path(os.path.dirname(fp))

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        # Reopen the file for every attempt: mode 'w' truncates it, so a
        # retry never appends to the partial output of a failed attempt,
        # and the ``with`` block guarantees the descriptor is released.
        with open(fp, 'w') as out:
            try:
                html = http.HttpUtil(proxy=CONFIG.proxy)
                html.add_header('Referer', referer)
                html.fetch(url, http.DownloadStreamHandler(out))
                break
            except Exception as e:
                LOG.exception(e)
        # Back off only when another attempt will actually follow.
        if attempt < max_attempts:
            time.sleep(3)
    else:
        # The loop ended without ``break``: every attempt failed.
        LOG.error("giving up on %s after %d attempts", url, max_attempts)
def get_all(self):
    """Walk the index pages and request data for every unsaved item.

    Up to ``CONFIG.deapth`` index pages are visited.  The first page is
    requested with no arguments; each later page is requested with the
    page number and the first field of the first item of the previous
    page.  For every item whose first field is not already an entry of
    ``CONFIG.save_path``, ``self.get_data`` is called with the result of
    ``self._get_metadata_url(item[1], item[0])``.

    The walk stops early when ``self.event`` is set or when the previous
    page produced no items.  Any exception raised while handling a page
    is logged and the walk moves on to the next page.
    """
    LOG.info("== start all ==")
    curr_items = []
    util.asure_path(CONFIG.save_path)
    # A set gives constant-time membership tests for the per-item check.
    achived_tasks = set(os.listdir(CONFIG.save_path))

    for depth in xrange(CONFIG.deapth):
        try:
            if self.event.isSet():
                break
            LOG.info("==> deapth %d", depth)
            if depth == 0:
                curr_items = self._get_index_page()
            else:
                if not curr_items:
                    # Without a previous item there is nothing to page
                    # from; indexing would raise on every remaining depth.
                    LOG.info("no items to continue from, stop at deapth %d",
                             depth)
                    break
                curr_items = self._get_index_page(depth, curr_items[0][0])
            for item in curr_items:
                if item[0] not in achived_tasks:
                    self.get_data(self._get_metadata_url(item[1], item[0]))
        except Exception as e:
            LOG.exception(e)
        LOG.info("<== deapth %d", depth)

    LOG.info("== end all ==")