def upload(self):
    """Upload every finished (POST-suffix) file found in the scheduler URL path.

    Returns whatever the underlying downloader's upload() returns.
    """
    source_dir = SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                           const.SCHEDULER_URL_PATH)
    all_files = FileUtility.getfilelist(source_dir, [])
    # only files carrying the POST suffix are considered complete
    finished = [name for name in all_files
                if name.endswith(constant.POST_FILE_SUFFIX)]
    return self.downloader.upload(finished)
def download(self):
    """Run the platform download step, then collect the files that belong
    to this task (filename contains self.taskname).

    Returns a (possibly empty) list of matching file paths.
    """
    if not self.__download__():
        return []
    Logger.getlogging().debug(self.download_path)
    # give the platform a moment to flush files to disk before listing
    time.sleep(3)
    candidates = FileUtility.getfilelist(self.download_path, [])
    return [candidate for candidate in candidates
            if self.taskname in candidate]
def getvalidfiles(self, path):
    """Return up to self.maxfilenum files under *path* whose trailing numeric
    token is >= the newest backup timestamp reported by findmax().

    Fixes: the regex is now a raw string (``'\\d'`` in a plain string is an
    invalid escape sequence and a SyntaxWarning on modern Python), the pattern
    is compiled once outside the loop, and files whose names contain no digits
    are skipped instead of raising IndexError.
    """
    threshold = self.findmax()
    digit_run = re.compile(r'(\d+)')
    validfiles = []
    for f in FileUtility.getfilelist(path, []):
        stamps = digit_run.findall(f)
        # a name with no numeric token cannot be compared — skip it
        if stamps and int(stamps[-1]) >= threshold:
            validfiles.append(f)
    # only take the first maxfilenum PUC files per pass
    return validfiles[:self.maxfilenum]
def recoverfile(self, filename):
    """Restore *filename* from the URL backup directory into the working
    urls directory (./data/temp/urls).

    Fixes: fills in the previously empty docstring and translates the
    Chinese comment; logic is unchanged.

    Returns the new file path on success, False if the backup is missing
    or the copy did not appear on disk.
    """
    # locate the file in the backup path, then restore it to ./data/temp/urls
    filelist = FileUtility.getfilelist(self.urlbackuppath, [])
    tempfilepath = os.path.join(self.urlbackuppath, filename)
    if tempfilepath in filelist:
        newfilepath = self.renewfilename(tempfilepath)
        FileUtility.copy(tempfilepath, newfilepath)
        # brief pause so the copy is visible before the existence check
        time.sleep(0.5)
        if FileUtility.exists(newfilepath):
            return newfilepath
    return False
def download(self):
    """Move finished PUC files to the done directory, convert each to JSON,
    dump its URLs, and return the list of generated JSON file paths.

    On the platform the download has two steps; on Windows only step 2 runs:
      step 1: fetch data from the platform into ./data/platform
      step 2: copy data into ./data/temp/done, then store the parsed JSON
              under ./data/temp/json

    Fixes: the two bare ``except:`` clauses now catch ``Exception`` only
    (a bare except also swallows SystemExit/KeyboardInterrupt), the large
    body of commented-out dead code is removed, and an early ``continue``
    flattens the nesting.
    """
    files = []
    Logger.getlogging().debug('Get Valid PUC File From ' + self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])[0:self.maxfilenum]
    for donefile in srclist:
        try:
            if not donefile.endswith('done'):
                continue
            Logger.getlogging().info('MOVE {file} TO {path}'.format(
                file=donefile, path=self.done_file))
            FileUtility.move(donefile, self.done_file)
            binfile = os.path.join(self.done_file,
                                   FileUtility.getfilename(donefile))
            jsonfile = self.bin2json(binfile)
            files.append(jsonfile)
            try:
                self.s3puc_dumpurls(jsonfile)
                time.sleep(0.5)
                Logger.getlogging().debug('Remove {f}'.format(f=jsonfile))
                FileUtility.remove(jsonfile)
                donefile2 = os.path.join(self.done_file,
                                         FileUtility.getfilename(donefile))
                Logger.getlogging().debug('Remove {f}'.format(f=donefile2))
                FileUtility.remove(donefile2)
            except Exception:
                # best-effort: log the failure and leave an (empty) marker
                # file so the done file is not silently lost
                Logger.printexception()
                Logger.getlogging().error(
                    'no json file generate from done file:{done}'.format(
                        done=binfile))
                # NOTE: os.mknod is POSIX-only and raises if jsonfile exists
                os.mknod(jsonfile)
        except Exception:
            Logger.printexception()
    return files
def findmax(self):
    """Return the largest numeric (timestamp) suffix among the PUC backup
    files, deleting every backup file with an older suffix.

    Fixes: the regex is a raw string (plain ``'\\d'`` is an invalid escape
    sequence on modern Python) and each filename is scanned once instead of
    twice (the original ran re.findall over the whole list a second time).

    Returns 0 when the backup directory holds no files.
    """
    digit_run = re.compile(r'(\d+)')
    filelist = FileUtility.getfilelist(self.pucbackpath, [])
    # map each file to its trailing numeric token
    stamps = {f: int(digit_run.findall(f)[-1]) for f in filelist}
    if not stamps:
        return 0
    tm = max(stamps.values())
    for f, t in stamps.items():
        if t < tm:
            Logger.getlogging().info('REMOVE {file}'.format(file=f))
            FileUtility.remove(f)
    return tm
def download(self):
    """Download step 2 (on the platform there are two steps; on Windows only
    this one runs):
      step 1: fetch data from the platform into ./data/platform
      step 2: copy data into ./data/temp/done, then store the parsed JSON
              under ./data/temp/json

    Returns the list of JSON files generated during this pass.
    """
    files = []
    # nothing to do once every uploaded file has been answered
    if self.completed():
        return files
    Logger.getlogging().debug(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])
    for donefile in srclist:
        filename = FileUtility.getfilename(donefile)
        # only handle finished ('done') files we have not already seen
        if donefile.endswith(
                'done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            self.download_time = time.time()
            # find the upload this result belongs to (done name starts
            # with the uploaded file's name)
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    # copy into the cache and verify the copy by size
                    # before removing the source
                    FileUtility.copy(donefile, self.cache_path)
                    binfile = self.cache_path + FileUtility.getfilename(
                        donefile)
                    if FileUtility.getfilesize(
                            donefile) == FileUtility.getfilesize(binfile):
                        Logger.getlogging().info(
                            'Remove {file}'.format(file=donefile))
                        FileUtility.remove(donefile)
                        if FileUtility.exists(donefile):
                            Logger.getlogging().error(
                                'Remove {file} failed'.format(
                                    file=donefile))
                    else:
                        Logger.getlogging().error(
                            'File not equal {file}'.format(file=donefile))
                    jsonfile = self.bin2json(binfile)
                    files.append(jsonfile)
                    # remember when this upload started so retrans
                    # bookkeeping below can compare against it
                    uploadtime = self.uploadfile_retranslist[
                        upfile].start_time
                    # FORMAT1: single-part result — upload fully answered
                    if RegexUtility.match(
                            TencentDownloader.DOWNLOAD_FORMAT1.format(
                                file=upfile), filename):
                        self.upload_file_list.pop(upfile)
                        self.uploadfile_retranslist.pop(upfile)
                    # FORMAT2: multi-part result — done only when the part
                    # index equals the part count
                    elif RegexUtility.match(
                            TencentDownloader.DOWNLOAD_FORMAT2.format(
                                file=upfile), filename):
                        value = \
                            RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                        if value[0] == value[1]:
                            self.upload_file_list.pop(upfile)
                            self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(jsonfile):
                        Logger.getlogging().error(
                            'no json file generate from done file:{done}'.
                            format(done=binfile))
                        # presumably an empty placeholder so the done file
                        # is not reprocessed — POSIX-only call; TODO confirm
                        os.mknod(jsonfile)
                    # update upload time
                    # refresh start_time of every upload queued at/after
                    # this one so they are not retransmitted prematurely
                    keys = self.sortkeys()
                    for fl in keys:
                        if self.uploadfile_retranslist[
                                fl].start_time >= uploadtime:
                            self.uploadfile_retranslist[
                                fl].start_time = time.time()
                            time.sleep(0.1)
                    # NOTE(review): break placement reconstructed from
                    # collapsed source — it is read here as ending the
                    # upfile scan once the matching upload is processed;
                    # confirm against the original file's indentation
                    break
    return files
def localls(self, path):
    """List all files under *path* on the local filesystem."""
    listing = FileUtility.getfilelist(path, [])
    return listing
def geturlfiles(self):
    """List every URL file currently sitting in the temporary URL directory."""
    url_dir = self.tempurldir
    return FileUtility.getfilelist(url_dir, [])
def sshls(self, path):
    """List files in the remote download directory.

    NOTE(review): the *path* parameter is ignored — the listing always uses
    self.download_path (contrast localls(), which honors its argument).
    Confirm with callers whether this is intentional before changing it.
    """
    return FileUtility.getfilelist(self.download_path, [])
def upload(self):
    """Flush pending file operations, then upload every file found in the
    temporary URL storage location via the downloader."""
    FileUtility.flush()
    staging_dir = Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH)
    pending = FileUtility.getfilelist(staging_dir, [])
    return self.downloader.upload(pending)