Пример #1
0
 def s2query(self):
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
Пример #2
0
 def preprocess(self, filepath):
     result = False
     context = URLStorage.getfilecontext(FileUtility.getfilename(filepath))
     if context:
         self.conf.setchannel(context.channel)
         if context.channel == SPIDER_CHANNEL_S2:
             self.conf.setquery(context.query)
         else:
             self.conf.setquery('')
         URLStorage.updaterecycle()
         result = True
     return result
Пример #3
0
 def copyfiles(self):
     # s1/s2输入路径
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2历史路径
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         line = line[3:]
                 if line:
                     lines += 1
                     SpiderReport.puts1url(line)
         if lines > 0:
             FileUtility.copy(s1file, s1tempfile)
             SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
Пример #4
0
 def retrydownload(self, jsonfile, urlset):
     Logger.getlogging().warning('upload failed urls {num}'.format(num=len(urlset)))
     context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         Logger.getlogging().error('do not upload for failed again')
         for key in urlset.keys():
             Logger.getlogging().error('download {url} failed'.format(url=key))
     else:
         urls = []
         for key in urlset.keys():
             Logger.getlogging().warning('retry download {url}'.format(url=key))
             for i in range(0, urlset[key]):
                 urls.append(key)
         StatisticsManager.updateall(-len(urls))
         URLStorage.updaterecycle(context.retry + 1)
         if constant.POST_FILE_SUFFIX in jsonfile:
             URLStorage.storeurls(urls, constant.REQUEST_TYPE_POST)
         elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
             URLStorage.storeurls(urls, constant.REQUEST_TYPE_WEBKIT)
         else:
             URLStorage.storeurls(urls, constant.REQUEST_TYPE_COMMON)