def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        # os.mknod creates the empty tmp file (Linux-only)
        os.mknod(writeTmpfile)
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except:
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)

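# readFile relies on a tmp-then-rename handoff: output accumulates in a '.tmp'
# file and is renamed to a '.done' file only after every line has been handled,
# so a polling scanner never picks up half-written results. A minimal,
# self-contained sketch of that idiom (all names here are illustrative, not
# part of this project's API):
import json
import os
import time

def process_jsonlines(inpath, outdir):
    tmpfile = os.path.join(outdir, 'results.tmp')
    donefile = os.path.join(outdir, 'results.txt.{ts}.done'.format(ts=int(time.time())))
    with open(inpath) as src, open(tmpfile, 'w') as dst:
        for line in src:
            record = json.loads(line)        # one JSON object per line, e.g. {"url": "..."}
            dst.write(json.dumps(record) + '\n')
    os.rename(tmpfile, donefile)             # the rename is the commit point
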
def download(self):
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.info.donepath)
    srclist = self.sshls(self.info.donepath)
    for donefile in srclist:
        donefile = donefile.strip()
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            # snapshot the keys: upload_file_list is mutated (pop) inside the loop
            for upfile in list(self.upload_file_list.keys()):
                if filename.startswith(upfile):
                    FileUtility.mkdirs(self.info.localdonepath)
                    self.sshdownload(donefile)
                    dfile = self.info.localdonepath + FileUtility.getfilename(donefile)
                    if self.info.jsonpath:
                        dfile = self.bin2json(dfile)
                    files.append(dfile)
                    self.download_time = int(time.time())
                    self.upload_file_list.pop(upfile)
                    self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(dfile):
                        Logger.getlogging().error(
                            'no json file generated from done file:{done}'.format(done=dfile))
                    break
    return files

def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                       const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(
        SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                  const.DOWNLOADER_URL_BACKUP),
        TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                    if not flag:
                        flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))

def __upload__(self, filepath):
    flag = True
    FileUtility.mkdirs(self.urlbackuppath)
    FileUtility.copy(filepath, self.urlbackuppath)
    self.upload_file_list[FileUtility.getfilename(filepath)] = []
    # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
    #     if self.limpls:
    #         if self.limplsindex >= len(self.limpls):
    #             self.limplsindex = 0
    #         flag = self.limpls[self.limplsindex].upload(filepath)
    #         self.limplsindex += 1
    if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
        if self.wimpls:
            if self.wimplsindoex >= len(self.wimpls):
                self.wimplsindoex = 0
            self.wimpls[self.wimplsindoex].upload(filepath)
            self.wimplsindoex += 1
    elif self.impls:
        if self.implsindex >= len(self.impls):
            self.implsindex = 0
        flag = self.impls[self.implsindex].upload(filepath)
        self.implsindex += 1
    else:
        flag = False
        Logger.getlogging().warning('No taskid or download platform!')
    return flag

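# __upload__ distributes files across downloader implementations with a
# wrap-around index. A minimal sketch of that round-robin pattern, checking
# the wrap before use and advancing after, as the method does (illustrative
# names, not the project's API):
class RoundRobin(object):
    def __init__(self, workers):
        self.workers = workers
        self.index = 0

    def dispatch(self, item):
        if not self.workers:
            return False
        if self.index >= len(self.workers):   # wrap before use
            self.index = 0
        worker = self.workers[self.index]
        self.index += 1                       # advance after use
        return worker(item)

# e.g. RoundRobin([w1.upload, w2.upload]).dispatch(path) alternates w1, w2, w1, ...
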
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                       const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create an empty temp file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)

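# download() routes a URL file to one of three fetchers purely by file suffix,
# with an https sniff forcing the webkit path under the Windows debug flag.
# A sketch of that routing; the suffix values below are assumptions, since
# only the constant names appear in the source:
WEBKIT_SUFFIX = '.webkit'   # assumed value of constant.WEBKIT_FILE_SUFFIX
POST_SUFFIX = '.post'       # assumed value of constant.POST_FILE_SUFFIX

def pick_fetcher(path, httpsflag=False):
    if path.endswith(WEBKIT_SUFFIX) or httpsflag:
        return 'webkit'     # JS-capable browser fetch
    if path.endswith(POST_SUFFIX):
        return 'post'       # HTTP POST fetch
    return 'get'            # default: plain HTTP GET
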
def upload(self, path):
    tencentplatform.postdownloader.PostDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    FileUtility.mkdirs(self.download_path)
    FileUtility.copy(path, '{dir}/{filename}.txt.{ts}.done'.format(
        dir=self.download_path, filename=filename, ts=int(time.time())))
    return True

def waibuetl(self):
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)
    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning('{waibufile} not generated!'.format(waibufile=waibufile))
        return
    outtime = 0
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info('sleeping {sec}s......'.format(sec=self.waitingperiod))
            #time.sleep(self.waitingperiod)
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except:
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
                continueflag = False
                break
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup + FileUtility.getfilename(dfile)):
            #    FileUtility.remove(waibubackup + FileUtility.getfilename(dfile))
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
        if outtime >= self.waibutimeout:
            Logger.getlogging().warning(
                'Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
            continueflag = False
            break

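# waibuetl is a poll-with-timeout loop: keep asking the downloader for finished
# files, accumulate the time spent waiting, and give up once the budget is
# exhausted. The same control flow reduced to its core (illustrative names,
# not the project's API):
import time

def poll_until(fetch, period, timeout):
    waited = 0
    while waited < timeout:
        results = fetch()
        if results:
            return results
        time.sleep(period)
        waited += period
    return []    # timed out with nothing to process
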
def upload(self, path):
    tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    # take the timestamp once so the logged name matches the copied name
    ts = int(time.time())
    donefile = '{dir}/{filename}.txt.{ts}.done'.format(
        dir=self.download_path, filename=filename, ts=ts)
    FileUtility.mkdirs(self.download_path)
    Logger.getlogging().debug(path + '--->' + donefile)
    FileUtility.copy(path, donefile)
    return True

def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
            path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()

def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                  const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path,
                                                   suffix=suffix,
                                                   date=date.replace('-', '_'),
                                                   ts=int(time.time()))
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path,
                                                         suffix=suffix,
                                                         date=date.replace('-', '_'),
                                                         ts=int(time.time()))
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                           const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)

def download(self):
    doneurl = TencentDownloader.DONE_FILE_URL.format(taskid=self.taskinfo.taskid)
    html = TencentDownloader.httpget(doneurl)
    if html:
        xparse = XPathUtility(html)
        for donefile in xparse.getlist(r'//tr/td[2]/a'):
            if donefile.endswith('done') and donefile not in self.downloadedfiles:
                for upfile in self.upload_file_list:
                    if donefile.startswith(upfile):
                        FileUtility.mkdirs(self.download_path)
                        self.execute(TencentDownloader.DOWNLOAD_COMMAND.format(
                            taskid=self.taskinfo.taskid, filename=donefile))
                        FileUtility.move('./' + donefile, self.download_path)
                        break
                self.downloadedfiles.append(donefile)
    return tencentdownloader.TencentDownloader.download(self)

def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    backupPath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                           const.DOWNLOADER_URL_BACKUP)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    flag = False
    for filename in os.listdir(scanningPath):
        fp = os.path.join(scanningPath, filename)
        backupfile = os.path.join(backupPath, filename)
        if os.path.isfile(fp) and 'tmp' not in filename:
            Logger.getlogging().info('Get url file:{file}'.format(file=filename))
            FileUtility.move(fp, backupfile)
            readFile(backupfile, filename)
            if not flag:
                flag = True
    if not flag:
        time.sleep(10)

def mkcachedir():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                      const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    FileUtility.rmdir(cache)
    FileUtility.mkdirs(cache)
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_OUTPUT_TEMP_PATH))
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_OUTPUT_PATH_LIMIT))
    outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                           const.SPIDER_OUTPUT_PATH)
    if FileUtility.exists(outputpath):
        validdate = TimeUtility.getuniformdatebefore(limit)
        for s in os.listdir(outputpath):
            if s < validdate:
                fullpath = os.path.join(outputpath, s)
                FileUtility.rmdir(fullpath)

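# The retention check above ('s < validdate') compares directory names as
# strings, which is only chronological because the names are zero-padded dates.
# A sketch under that assumption (the exact format returned by
# TimeUtility.getuniformdatebefore is not shown in the source):
import datetime

def cutoff(days):
    # zero-padded ISO dates sort lexicographically in date order
    return (datetime.date.today() - datetime.timedelta(days=days)).isoformat()

dirs = ['2017-01-03', '2017-02-10', '2017-03-01']
expired = [d for d in dirs if d < cutoff(30)]   # these would be removed
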
def __init__(self):
    self.downloader = SchedulDownloader()
    self.urlpath = SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                             const.SCHEDULER_URL_PATH)
    FileUtility.mkdirs(self.urlpath)