Exemplo n.º 1
0
 def upload(self):
     """Upload the post files found under the scheduler URL path.

     The directory is read from the configuration entry
     ``const.SPIDER_SCHEDULER_DOMAIN`` / ``const.SCHEDULER_URL_PATH``.
     Only entries ending with ``constant.POST_FILE_SUFFIX`` are handed to
     ``self.downloader.upload``, whose result is returned unchanged.
     """
     url_path = SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                          const.SCHEDULER_URL_PATH)
     pending = []
     for candidate in FileUtility.getfilelist(url_path, []):
         if candidate.endswith(constant.POST_FILE_SUFFIX):
             pending.append(candidate)
     return self.downloader.upload(pending)
 def download(self):
     """Run the download step and return the files belonging to this task.

     Returns an empty list when ``self.__download__()`` reports failure.
     Otherwise logs ``self.download_path``, waits three seconds, and returns
     every entry of that directory whose path contains ``self.taskname``.
     """
     if not self.__download__():
         return []
     Logger.getlogging().debug(self.download_path)
     # Pause kept from the original implementation before listing the path.
     time.sleep(3)
     return [
         entry
         for entry in FileUtility.getfilelist(self.download_path, [])
         if self.taskname in entry
     ]
Exemplo n.º 3
0
    def getvalidfiles(self, path):
        """Return the files under ``path`` that are not older than the backup.

        The "age" of a file is the last run of digits found in its path.
        A file is kept when that number is greater than or equal to the
        value returned by ``self.findmax()``. Paths that contain no digits
        at all are skipped instead of raising ``IndexError``.

        Args:
            path: directory passed to ``FileUtility.getfilelist``.

        Returns:
            list: at most ``self.maxfilenum`` matching paths, in listing order.
        """
        threshold = self.findmax()
        validfiles = []

        for f in FileUtility.getfilelist(path, []):
            # Raw string: '\d' in a plain literal is an invalid escape.
            numbers = re.findall(r'\d+', f)
            if not numbers:
                # No numeric component to compare against; cannot be ranked.
                continue
            if int(numbers[-1]) >= threshold:
                validfiles.append(f)
        # Only hand back the first self.maxfilenum PUC files per call.
        return validfiles[:self.maxfilenum]
Exemplo n.º 4
0
 def recoverfile(self, filename):
     """Restore ``filename`` from the URL backup directory.

     Looks the file up in ``self.urlbackuppath``; when present it is copied
     to the path produced by ``self.renewfilename`` (per the original
     comment, the destination directory is ./data/temp/urls).

     Returns:
         The new file path if the copy exists afterwards, otherwise False.
     """
     backups = FileUtility.getfilelist(self.urlbackuppath, [])
     source = os.path.join(self.urlbackuppath, filename)
     if source not in backups:
         return False
     target = self.renewfilename(source)
     FileUtility.copy(source, target)
     # Short pause before checking that the copy is there.
     time.sleep(0.5)
     return target if FileUtility.exists(target) else False
Exemplo n.º 5
0
 def download(self):
     """Move finished PUC files into ``self.done_file`` and convert them.

     At most ``self.maxfilenum`` entries of ``self.download_path`` are
     considered. Each entry whose name ends with ``'done'`` is moved to
     ``self.done_file``, converted with ``self.bin2json`` and passed to
     ``self.s3puc_dumpurls``; afterwards both the json file and the moved
     file are removed. If that last stage fails, the error is logged and
     an empty file is created at the json path with ``os.mknod``.

     Note carried over from the original docstring: on the platform the
     download has two steps, while a direct request on Windows only has
     step 2 (this method):
       step1: download the data from the platform to ./data/platform
       step2: copy the data from ./data/platform to ./data/temp/done, then
              store the parsed json data in ./data/temp/json

     Returns:
         list: the json paths returned by ``self.bin2json``, one per
         converted file. A path is appended before post-processing, so it
         is included even if that step fails or the file is later removed.
     """
     files = []
     Logger.getlogging().debug('Get Valid PUC File From ' +
                               self.download_path)
     srclist = FileUtility.getfilelist(self.download_path,
                                       [])[0:self.maxfilenum]
     for donefile in srclist:
         # Best effort per file: one failure must not stop the others.
         # ``Exception`` (not a bare except) so Ctrl-C / SystemExit propagate.
         try:
             if not donefile.endswith('done'):
                 continue
             Logger.getlogging().info('MOVE {file} TO {path}'.format(
                 file=donefile, path=self.done_file))
             FileUtility.move(donefile, self.done_file)
             binfile = os.path.join(self.done_file,
                                    FileUtility.getfilename(donefile))
             jsonfile = self.bin2json(binfile)
             files.append(jsonfile)
             try:
                 self.s3puc_dumpurls(jsonfile)
                 time.sleep(0.5)
                 Logger.getlogging().debug(
                     'Remove {f}'.format(f=jsonfile))
                 FileUtility.remove(jsonfile)
                 # binfile is the moved copy of donefile inside done_file.
                 Logger.getlogging().debug(
                     'Remove {f}'.format(f=binfile))
                 FileUtility.remove(binfile)
             except Exception:
                 Logger.printexception()
                 Logger.getlogging().error(
                     'no json file generate from done file:{done}'.
                     format(done=binfile))
                 # Leave an empty placeholder at the json path.
                 os.mknod(jsonfile)
         except Exception:
             Logger.printexception()
     return files
Exemplo n.º 6
0
 def findmax(self):
     """Return the highest file number in the backup dir and prune the rest.

     The number of a file is the last run of digits in its path. Every file
     under ``self.pucbackpath`` whose number is lower than the maximum is
     removed. Paths without any digits are ignored (neither counted nor
     removed) instead of raising ``IndexError``.

     Returns:
         int: the maximum number found, or 0 when no numbered file exists.
     """
     numbered = []
     for f in FileUtility.getfilelist(self.pucbackpath, []):
         # Raw string: '\d' in a plain literal is an invalid escape.
         digits = re.findall(r'\d+', f)
         if digits:
             numbered.append((int(digits[-1]), f))
     if not numbered:
         return 0
     tm = max(t for t, _ in numbered)
     for t, f in numbered:
         if t < tm:
             Logger.getlogging().info('REMOVE {file}'.format(file=f))
             FileUtility.remove(f)
     return tm
 def download(self):
     """
     Collect newly finished 'done' files from self.download_path, copy them
     to self.cache_path, convert them with self.bin2json and return the
     resulting json paths. Returns an empty list when self.completed() is true.

     Note from the original docstring: on the platform the download has two
     steps, while a direct request on Windows only has step2: download()
     step1: download the data from the platform to local ./data/platform
     step2: copy the data from ./data/platform to ./data/temp/done, then store
            the parsed json data in ./data/temp/json
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path, [])
     for donefile in srclist:
         filename = FileUtility.getfilename(donefile)
         # Only handle 'done' files that have not been seen before.
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             self.download_time = time.time()
             # Find the uploaded file this result belongs to (prefix match).
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.copy(donefile, self.cache_path)
                     # NOTE(review): plain concatenation, so this assumes
                     # self.cache_path ends with a path separator - confirm.
                     binfile = self.cache_path + FileUtility.getfilename(
                         donefile)
                     # Remove the source only if the copy has the same size.
                     if FileUtility.getfilesize(
                             donefile) == FileUtility.getfilesize(binfile):
                         Logger.getlogging().info(
                             'Remove {file}'.format(file=donefile))
                         FileUtility.remove(donefile)
                         if FileUtility.exists(donefile):
                             Logger.getlogging().error(
                                 'Remove {file} failed'.format(
                                     file=donefile))
                     else:
                         Logger.getlogging().error(
                             'File not equal {file}'.format(file=donefile))
                     jsonfile = self.bin2json(binfile)
                     files.append(jsonfile)
                     # Remember the start time before the entry may be popped.
                     uploadtime = self.uploadfile_retranslist[
                         upfile].start_time
                     # Drop the upload entry once its result is complete:
                     # always for FORMAT1 names; for FORMAT2 names only when
                     # the two parsed values are equal.
                     if RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT1.format(
                                 file=upfile), filename):
                         self.upload_file_list.pop(upfile)
                         self.uploadfile_retranslist.pop(upfile)
                     elif RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(
                                 file=upfile), filename):
                         # presumably (part index, part count) - TODO confirm
                         value = \
                         RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                         if value[0] == value[1]:
                             self.upload_file_list.pop(upfile)
                             self.uploadfile_retranslist.pop(upfile)
                     # Create an empty placeholder when no json was produced.
                     if not FileUtility.exists(jsonfile):
                         Logger.getlogging().error(
                             'no json file generate from done file:{done}'.
                             format(done=binfile))
                         os.mknod(jsonfile)
                     # update upload time
                     # Entries whose start time is not earlier than the
                     # handled upload get a fresh start time.
                     keys = self.sortkeys()
                     for fl in keys:
                         if self.uploadfile_retranslist[
                                 fl].start_time >= uploadtime:
                             self.uploadfile_retranslist[
                                 fl].start_time = time.time()
                             time.sleep(0.1)
                     # Stop after the first matching upload entry; this also
                     # ends the iteration right after the dict was modified.
                     break
     return files
 def localls(self, path):
     """Return the result of ``FileUtility.getfilelist`` for ``path``."""
     listing = FileUtility.getfilelist(path, [])
     return listing
Exemplo n.º 9
0
 def geturlfiles(self):
     """Return the file list of ``self.tempurldir``."""
     url_files = FileUtility.getfilelist(self.tempurldir, [])
     return url_files
 def sshls(self, path):
     """Return the file list of ``self.download_path``.

     The ``path`` argument is accepted but not used: the listing always
     comes from ``self.download_path``.
     """
     listing = FileUtility.getfilelist(self.download_path, [])
     return listing
Exemplo n.º 11
0
 def upload(self):
     """Upload every file in the temporary URL storage location.

     Calls ``FileUtility.flush()`` first, then lists the location registered
     under ``const.SPIDER_URLS_TEMP_PATH`` and passes that list to
     ``self.downloader.upload``, returning its result.
     """
     FileUtility.flush()
     temp_url_dir = Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH)
     pending = FileUtility.getfilelist(temp_url_dir, [])
     return self.downloader.upload(pending)