def __upload__(self, filepath):
    """Back up *filepath* and dispatch it to one of the configured upload platforms.

    The file is copied into ``self.urlbackuppath`` and registered in
    ``self.upload_file_list`` before dispatch. Webkit url files
    (``constant.WEBKIT_FILE_SUFFIX``) go round-robin to ``self.wimpls``;
    anything else goes round-robin to ``self.impls``.

    :param filepath: path of the url file to upload.
    :return: the uploader's result, or False when no platform is configured.
    """
    flag = True
    FileUtility.mkdirs(self.urlbackuppath)
    FileUtility.copy(filepath, self.urlbackuppath)
    # register the file so download() can later match its '.done' results
    self.upload_file_list[FileUtility.getfilename(filepath)] = []
    if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
        if self.wimpls:
            # NOTE: attribute name 'wimplsindoex' is a historical typo; kept
            # because other code may reference this instance attribute.
            if self.wimplsindoex >= len(self.wimpls):
                self.wimplsindoex = 0
            # BUGFIX: record the uploader's result; it was silently discarded
            # here while the 'impls' branch below assigns it to flag.
            flag = self.wimpls[self.wimplsindoex].upload(filepath)
            self.wimplsindoex += 1
    elif self.impls:
        if self.implsindex >= len(self.impls):
            self.implsindex = 0
        flag = self.impls[self.implsindex].upload(filepath)
        self.implsindex += 1
    else:
        flag = False
        Logger.getlogging().warning('No taskid or download platform!')
    return flag
def scanning():
    """Poll the url scanning directory forever and dispatch each new url file.

    Reads the per-host scanning/done/backup paths from configuration, then
    loops: every url file found (regular file, name not containing 'tmp')
    is backed up and handed to ``download()``. When a pass finds nothing,
    sleeps for the configured interval before polling again.

    This function never returns.
    """
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    # clear leftovers from a previous run
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP),
                              TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                    flag = True
            # BUGFIX: was a bare 'except:', which also swallows SystemExit and
            # KeyboardInterrupt and makes this daemon loop unkillable.
            except Exception:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))
def copyfiles(self):
    """Copy the configured s1/s2 input files into the spider working area.

    For the s1 file: count its non-empty lines (stripping a UTF-8 BOM from
    the first line if present), register each url via
    ``SpiderReport.puts1url``, and when at least one url was found copy the
    whole file into the s1 recycle location and update the upload report.
    The s2 file, if present, is copied to the query temp path unchanged.
    """
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history path
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    # NOTE(review): this compares a str slice against the
                    # bytes constant codecs.BOM_UTF8 — valid on Python 2,
                    # always False on Python 3; confirm target interpreter.
                    if line[:3] == codecs.BOM_UTF8:
                        # BUGFIX: was format(file=file) — 'file' is undefined
                        # here (the builtin on py2, a NameError on py3).
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def upload(self, path):
    """Upload *path* via the parent PostDownloader, then drop a '.done' marker copy.

    The copy lands in ``self.download_path`` named
    ``<filename>.txt.<timestamp>.done`` so the download side can pick it up.

    :param path: source file to upload.
    :return: always True.
    """
    tencentplatform.postdownloader.PostDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    FileUtility.mkdirs(self.download_path)
    # BUGFIX: the format string had lost its {filename} placeholder even
    # though filename= was passed, so every upload produced the same path.
    FileUtility.copy(
        path,
        '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                filename=filename,
                                                ts=int(time.time())))
    return True
def recoverfile(self, filename):
    """Restore a backed-up url file into a fresh working path.

    Looks *filename* up in the url backup directory; when found, copies it
    to a renewed file name and waits briefly for the copy to land.

    :param filename: bare name of the backed-up url file.
    :return: the restored file path on success, otherwise False.
    """
    backedup = os.path.join(self.urlbackuppath, filename)
    # the backup dir is scanned so we only recover files that really exist there
    if backedup not in FileUtility.getfilelist(self.urlbackuppath, []):
        return False
    restored = self.renewfilename(backedup)
    FileUtility.copy(backedup, restored)
    time.sleep(0.5)
    return restored if FileUtility.exists(restored) else False
def upload(self, path):
    """Upload *path* via the parent TencentDownloader, then drop a '.done' marker copy.

    Logs the source -> destination mapping and copies *path* into
    ``self.download_path`` as ``<filename>.txt.<timestamp>.done``.

    :param path: source file to upload.
    :return: always True.
    """
    tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    ts = int(time.time())
    FileUtility.mkdirs(self.download_path)
    # BUGFIX: the format string had lost its {filename} placeholder, and the
    # timestamp was re-read for the log line and again for the copy, so the
    # logged name could differ from the real one across a second boundary.
    target = '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                     filename=filename,
                                                     ts=ts)
    Logger.getlogging().debug(path + '--->' + target)
    FileUtility.copy(path, target)
    return True
def copyfiles(self):
    """Split the configured s1 input file into size-capped url files; copy s2 through.

    Non-empty s1 lines (with a UTF-8 BOM stripped from the first line if
    present) are buffered and flushed to a fresh url file every
    ``constant.SPIDER_S1_MAX_LINE_PER_FILE`` lines, with a final flush for
    the remainder. The s2 file, if present, is copied to the query temp
    path unchanged.
    """
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history path
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            rows = []
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    # NOTE(review): str-vs-bytes compare — only effective on
                    # Python 2; confirm target interpreter.
                    if line[:3] == codecs.BOM_UTF8:
                        # BUGFIX: was format(file=file) — 'file' is undefined
                        # here (the builtin on py2, a NameError on py3).
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    rows.append(line)
                    # flush a full batch into its own url file
                    if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                        s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                        FileUtility.writelines(s1tempfile, rows)
                        rows = []
            # flush the trailing partial batch
            if rows:
                s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                FileUtility.writelines(s1tempfile, rows)
                rows = []
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def bin2json(self, file):
    """Copy a '.done' result file into the json directory as a '.json' file.

    :param file: path of the downloaded '.done' file.
    :return: full path of the copied '.json' file.
    """
    target = self.info.jsonpath + FileUtility.getfilename(file).replace('.done', '.json')
    FileUtility.copy(file, target)
    return target
def download(self):
    """
    Downloading on the platform happens in two steps; on Windows, requesting
    the data directly performs only step2: download().
    step1: download data from the platform to local ./data/platform
    step2: copy data from ./data/platform to ./data/temp/done, then store the
           parsed json data into ./data/temp/json
    """
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])
    for donefile in srclist:
        filename = FileUtility.getfilename(donefile)
        # only handle '.done' result files we have not processed before
        if donefile.endswith(
                'done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            self.download_time = time.time()
            # match the done file back to the upload that produced it
            # NOTE(review): popping from upload_file_list while iterating
            # .keys() is safe on Python 2 (list copy) but would raise on
            # Python 3 — confirm target interpreter.
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.copy(donefile, self.cache_path)
                    binfile = self.cache_path + FileUtility.getfilename(
                        donefile)
                    # delete the source only when the cached copy is the same size
                    if FileUtility.getfilesize(
                            donefile) == FileUtility.getfilesize(binfile):
                        Logger.getlogging().info(
                            'Remove {file}'.format(file=donefile))
                        FileUtility.remove(donefile)
                        if FileUtility.exists(donefile):
                            Logger.getlogging().error(
                                'Remove {file} failed'.format(
                                    file=donefile))
                    else:
                        Logger.getlogging().error(
                            'File not equal {file}'.format(file=donefile))
                    jsonfile = self.bin2json(binfile)
                    files.append(jsonfile)
                    uploadtime = self.uploadfile_retranslist[
                        upfile].start_time
                    # FORMAT1: single-part result — the upload is fully done
                    if RegexUtility.match(
                            TencentDownloader.DOWNLOAD_FORMAT1.format(
                                file=upfile), filename):
                        self.upload_file_list.pop(upfile)
                        self.uploadfile_retranslist.pop(upfile)
                    # FORMAT2: multi-part result — done only when the part
                    # counter equals the total (value[0] == value[1])
                    elif RegexUtility.match(
                            TencentDownloader.DOWNLOAD_FORMAT2.format(
                                file=upfile), filename):
                        value = \
                            RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                        if value[0] == value[1]:
                            self.upload_file_list.pop(upfile)
                            self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(jsonfile):
                        Logger.getlogging().error(
                            'no json file generate from done file:{done}'.format(done=binfile))
                        # create an empty placeholder so downstream sees a file
                        os.mknod(jsonfile)
                    # update upload time
                    keys = self.sortkeys()
                    for fl in keys:
                        # bump the first pending upload at/after this one's
                        # start time so it is not considered timed out
                        if self.uploadfile_retranslist[
                                fl].start_time >= uploadtime:
                            self.uploadfile_retranslist[
                                fl].start_time = time.time()
                            time.sleep(0.1)
                            break
    return files
def bin2json(self, file):
    """Convert a '.done' file via the parent class, then overwrite the result
    with a copy of the source file.

    :param file: path of the downloaded '.done' file.
    :return: path of the resulting '.json' file.
    """
    target = tencentplatform.tencentdownloader.TencentDownloader.bin2json(self, file)
    FileUtility.copy(file, target)
    return target
def localupload(self, path):
    """Copy *path* into the local url directory for pickup by the downloader.

    :param path: source url file.
    :return: always True.
    """
    destination = os.path.join(self.info.urlpath, FileUtility.getfilename(path))
    FileUtility.copy(path, destination)
    return True
def sshdownload(self, donefile):
    """Copy a remote-produced done file into the local done directory.

    :param donefile: path of the '.done' file to fetch.
    """
    log = Logger.getlogging()
    log.info('sshdownload:' + donefile)
    FileUtility.copy(donefile, self.info.localdonepath)