Exemplo n.º 1
0
 def preprocess(self, filepath):
     """Configure ``self.conf`` from the context stored for ``filepath``.

     The context is looked up in ``URLStorage`` by the file name derived
     from ``filepath``.  When one exists, its channel is applied to the
     configuration, the query is set (the context's query for the S2
     channel, an empty string for any other channel) and
     ``URLStorage.updaterecycle()`` is called.

     :param filepath: path of the downloaded file about to be processed.
     :return: True when a context was found and applied, False otherwise.
     """
     filename = FileUtility.getfilename(filepath)
     context = URLStorage.getfilecontext(filename)
     if not context:
         # Nothing is known about this file; the caller should skip it.
         return False
     self.conf.setchannel(context.channel)
     # Only the S2 channel carries a query; every other channel clears it.
     query = context.query if context.channel == SPIDER_CHANNEL_S2 else ''
     self.conf.setquery(query)
     URLStorage.updaterecycle()
     return True
Exemplo n.º 2
0
 def processfile(self, jsonfile):
     """Process every record of a downloaded result file.

     Returns immediately when ``preprocess`` finds no stored context for
     the file.  Otherwise each line is parsed with ``self.analysis``, any
     URL context held by ``URLStorage`` is copied onto the parsed
     parameter, and the parameter is handed to the site object obtained
     from ``self.factory``.  URLs that were processed successfully are
     counted down in the mapping returned by ``self.backupfile``; whatever
     is left in that mapping afterwards is passed to ``retrydownload``.

     :param jsonfile: path of the result file to process.
     """
     if not self.preprocess(jsonfile):
         return
     is_post = constant.POST_FILE_SUFFIX in jsonfile
     pending = self.backupfile(jsonfile)
     filecontext = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
     with open(jsonfile, 'r') as handle:
         records = handle.readlines()
     for record in records:
         param = self.analysis(record, is_post)
         if param is None:
             # Nothing usable was parsed from this line.
             continue
         key = param.url
         if filecontext.retry >= 2:
             param.lastretry = True
         if is_post:
             # POST results are keyed by URL and payload together.
             key = json.dumps({'url': param.url, 'data': param.data})
         else:
             Logger.getlogging().warning(key)
         stored = None
         if not URLStorage.hasurl(key):
             param.originalurl = param.url
         else:
             # Carry the stored context over to the parsed parameter.
             stored = URLStorage.geturlcontext(key)
             param.originalurl = stored.originalurl
             param.step = stored.step
             param.customized = stored.customized
         # A referring URL in the customized data takes precedence over
         # the original URL when choosing the site handler.
         if SiteS2Query.REFER_URL in param.customized:
             target = param.customized[SiteS2Query.REFER_URL]
         else:
             target = param.originalurl
         handled = self.factory.getsite(target).process(param)
         if handled:
             if key in pending:
                 pending[key] -= 1
                 if pending[key] == 0:
                     pending.pop(key)
         elif stored:
             # NOTE(review): the context was looked up under ``key`` but is
             # restored under ``param.url``; these differ for POST files --
             # confirm this is intended.
             URLStorage.seturlcontext(param.url, stored)
     # Anything still pending was not processed successfully.
     if pending:
         self.retrydownload(jsonfile, pending)
Exemplo n.º 3
0
 def retrydownload(self, jsonfile, urlset):
     """Re-queue the URLs of ``jsonfile`` that were not processed.

     When the file's stored context already has ``retry >= 2`` the URLs
     are only logged as failed.  Otherwise every URL is repeated as many
     times as its count in ``urlset``, ``StatisticsManager.updateall`` is
     called with the negated number of re-queued URLs,
     ``URLStorage.updaterecycle`` is called with ``retry + 1``, and the
     URLs are stored with the request type matching the file name suffix.

     :param jsonfile: path of the result file the URLs came from.
     :param urlset: mapping of URL to the number of outstanding requests.
     """
     Logger.getlogging().warning('upload failed urls {num}'.format(num=len(urlset)))
     context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         # Retry budget exhausted: report the failures and give up.
         Logger.getlogging().error('do not upload for failed again')
         for failed in urlset:
             Logger.getlogging().error('download {url} failed'.format(url=failed))
         return

     requeue = []
     for target, count in urlset.items():
         Logger.getlogging().warning('retry download {url}'.format(url=target))
         requeue.extend(target for _ in range(count))
     StatisticsManager.updateall(-len(requeue))
     URLStorage.updaterecycle(context.retry + 1)
     # The request type is chosen from the suffix found in the file name.
     if constant.POST_FILE_SUFFIX in jsonfile:
         request_type = constant.REQUEST_TYPE_POST
     elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
         request_type = constant.REQUEST_TYPE_WEBKIT
     else:
         request_type = constant.REQUEST_TYPE_COMMON
     URLStorage.storeurls(requeue, request_type)