def storeurls(self, urls, request=constant.REQUEST_TYPE_COMMON):
    urlfile = URLFileManager.getinstance().geturlfilepath(request)
    if FileUtility.geturlfilelines(urlfile) + len(urls) > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
        URLFileManager.getinstance().generateurlfilepath()
        urlfile = URLFileManager.getinstance().geturlfilepath(request)
    FileUtility.writelines(urlfile, urls)

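# Hedged sketch (not part of the original module): storeurls() above rolls
# over to a fresh url file before a batch would push the current file past
# URL_FILE_LINES_MAX_NUMBER. The standalone helper below isolates that
# rotation test; `line_count`, `batch`, and `max_lines` are illustrative names.
def _needs_rotation_sketch(line_count, batch, max_lines):
    # Rotate when the pending batch would overflow the per-file line cap.
    return line_count + len(batch) > max_lines
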
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
        sitelist = []
        for site in self.factory.getall():
            if site.exists2():
                sitelist.append(site)
        SpiderReport.loadquery(querylist)
        SpiderReport.loadsites(sitelist)

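# Hedged sketch: s2query() strips a UTF-8 byte order mark from the first line
# only, since a BOM written by some editors appears once at file start and
# would otherwise corrupt the first query. A minimal standalone version of the
# same check (lines are byte strings, as in this Python 2 codebase):
import codecs

def _strip_bom_sketch(first_line):
    # codecs.BOM_UTF8 is the three-byte sequence '\xef\xbb\xbf'.
    if first_line[:3] == codecs.BOM_UTF8:
        return first_line[3:]
    return first_line
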
def __upload__(self, filepath):
    flag = True
    FileUtility.mkdirs(self.urlbackuppath)
    FileUtility.copy(filepath, self.urlbackuppath)
    self.upload_file_list[FileUtility.getfilename(filepath)] = []
    # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
    #     if self.limpls:
    #         if self.limplsindex >= len(self.limpls):
    #             self.limplsindex = 0
    #         flag = self.limpls[self.limplsindex].upload(filepath)
    #         self.limplsindex += 1
    if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
        if self.wimpls:
            if self.wimplsindoex >= len(self.wimpls):
                self.wimplsindoex = 0
            self.wimpls[self.wimplsindoex].upload(filepath)
            self.wimplsindoex += 1
    elif self.impls:
        if self.implsindex >= len(self.impls):
            self.implsindex = 0
        flag = self.impls[self.implsindex].upload(filepath)
        self.implsindex += 1
    else:
        flag = False
        Logger.getlogging().warning('No taskid or download platform!')
    return flag

def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty temp file.
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)

def upload(self, path):
    retans = RetransInfo()
    retans.filename = FileUtility.getfilename(path)
    if int(self.start_time) == int(time.time()):
        time.sleep(0.1)
    self.start_time = time.time()
    retans.start_time = self.start_time
    self.uploadfile_retranslist[retans.filename] = retans
    self.upload_file_list[FileUtility.getfilename(path)] = []
    cmd = TencentDownloader.UPLOAD_COMMAND.format(
        file=path,
        url=self.upload_url,
        user_id=self.taskinfo.userid,
        task_name=self.taskinfo.taskname,
        task_id=self.taskinfo.taskid)
    if self.execute(cmd):
        return True
    secs = 10
    for count in range(0, self.retrytimes):
        time.sleep(secs)
        secs *= 2
        if self.execute(cmd):
            return True
    else:
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
        param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
            file=FileUtility.getfilename(path), taskid=self.taskinfo.taskid)
        SpiderNotify.notify(param)
        return False

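# Hedged sketch: upload() above retries the same command with an exponentially
# growing wait (10s, 20s, 40s, ...) before notifying failure. The generic
# helper below isolates that pattern; `action`, `retries`, and `first_wait`
# are illustrative names, not part of this codebase.
import time

def _backoff_retry_sketch(action, retries, first_wait=10):
    if action():
        return True
    secs = first_wait
    for _ in range(retries):
        time.sleep(secs)
        secs *= 2  # double the wait after each failed attempt
        if action():
            return True
    return False
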
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN, const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
    os.mknod(writeTmpfile)
    for line in lines:
        jsonLine = json.loads(line)
        try:
            jsonStr = downPost(jsonLine)
            with open(writeTmpfile, 'a+') as filetemp:
                filetemp.write(jsonStr + '\n')
            Logger.getlogging().debug(
                '{url}:Post request succeeded'.format(url=jsonLine['url']))
        except:
            Logger.getlogging().warning(
                '{url}:Post request failed'.format(url=jsonLine['url']))
            Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
    Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)

def download(self):
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.info.donepath)
    srclist = self.sshls(self.info.donepath)
    for donefile in srclist:
        donefile = donefile.strip()
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.mkdirs(self.info.localdonepath)
                    self.sshdownload(donefile)
                    dfile = self.info.localdonepath + FileUtility.getfilename(donefile)
                    if self.info.jsonpath:
                        dfile = self.bin2json(dfile)
                    files.append(dfile)
                    self.download_time = int(time.time())
                    self.upload_file_list.pop(upfile)
                    self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(dfile):
                        Logger.getlogging().error(
                            'no json file generate from done file:{done}'.format(done=dfile))
                    break
    return files

def upload(self, path):
    retans = RetransInfo()
    retans.filename = FileUtility.getfilename(path)
    retans.start_time = int(time.time())
    self.uploadfile_retranslist[retans.filename] = retans
    self.upload_file_list[FileUtility.getfilename(path)] = []
    if self.localupload(path):
        return True
    return False

def getqueryfromdb(self):
    # Resolve the output path for the s2 query file.
    s2file = SpiderConfigure.getinstance().gets2file()
    temppath = Storage.getstoragelocation(
        const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    QueryStorage.getinstance().getlocalquerys(
        temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath

def upload(self, path):
    tencentplatform.postdownloader.PostDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    FileUtility.mkdirs(self.download_path)
    FileUtility.copy(
        path, '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                      filename=filename,
                                                      ts=int(time.time())))
    return True

def gettiebaqueryfromdb(self):
    # Resolve the output path for the s2 tieba query file.
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_S3_INPUT_FILE)
    temppath = Storage.getstoragelocation(
        const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
    QueryStorage.getinstance().getlocalquerys_tieba(
        temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath

def recoverfile(self, filename):
    """Look up the file in the backup path and restore it to ./data/temp/urls."""
    filelist = FileUtility.getfilelist(self.urlbackuppath, [])
    tempfilepath = os.path.join(self.urlbackuppath, filename)
    if tempfilepath in filelist:
        newfilepath = self.renewfilename(tempfilepath)
        FileUtility.copy(tempfilepath, newfilepath)
        time.sleep(0.5)
        if FileUtility.exists(newfilepath):
            return newfilepath
    return False

def storeurl(self, url, urlcontext, request=constant.REQUEST_TYPE_COMMON):
    if url.strip():
        urlfile = URLFileManager.getinstance().geturlfilepath(request)
        if FileUtility.geturlfilelines(urlfile) + 1 > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
            URLFileManager.getinstance().generateurlfilepath()
            urlfile = URLFileManager.getinstance().geturlfilepath(request)
        FileUtility.writeline(urlfile, url)
        key = Common.md5(url.strip())
        if key not in self.urlcontextdict:
            self.urlcontextdict[key] = []
        self.urlcontextdict[key].append(urlcontext)

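# Hedged sketch: storeurl() keys each url's context list by the md5 of the
# stripped url, so later stages can find every context attached to the same
# url with a fixed-size key. A standalone illustration with hashlib; Common.md5
# is assumed (not verified here) to be a thin wrapper over the same digest.
import hashlib

def _md5_key_sketch(url):
    # url is assumed to be text; encode before hashing.
    return hashlib.md5(url.strip().encode('utf-8')).hexdigest()

# contexts = {}
# contexts.setdefault(_md5_key_sketch(url), []).append(urlcontext)
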
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    Logger.getlogging().info('scp -P {port} {username}@{host}:{file} {path}'.format(
        port=port, username=username, host=host, file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if ssh.connect():
        fileName = targetFilePath.split('/')[-1]
        # Download to a .tmp file first, then move it into place so readers
        # never see a half-written file.
        ssh.download(targetFilePath, localPath + fileName + '.tmp')
        ssh.close()
        FileUtility.move(localPath + fileName + '.tmp', localPath + fileName)
        return True
    else:
        return False

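# Hedged sketch: the write-to-temp-then-rename pattern used by sshdownload()
# above, shown with plain local I/O. `path` and `data` are illustrative names;
# os.rename is atomic when source and destination share a filesystem.
import os

def _write_then_rename_sketch(path, data):
    tmp = path + '.tmp'
    with open(tmp, 'wb') as fp:
        fp.write(data)
    os.rename(tmp, path)  # consumers polling for `path` never see a partial file
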
def __download__(self, downloaderlist):
    valid_json_files = []
    for impl in downloaderlist:
        json_files = impl.download()
        for dfile in json_files:
            for ufile in self.upload_file_list.keys():
                if RegexUtility.match(
                        Downloader.DOWNLOAD_FORMAT1.format(file=ufile), dfile):
                    self.upload_file_list.pop(ufile)
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                elif RegexUtility.match(
                        Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile):
                    value = RegexUtility.parse(
                        Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile)[0]
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                    if value[0] == value[1]:
                        self.upload_file_list.pop(ufile)
        retransmissionfiles = impl.outtimefiles()
        for fl in retransmissionfiles.keys():
            # Download failure: record the file and count the retransmission.
            if fl not in self.all_retransmissionfiles:
                self.all_retransmissionfiles[fl] = retransmissionfiles[fl]
            self.all_retransmissionfiles[fl].retrans_num += 1
            self.all_retransmissionfiles[fl].taskinfo = impl
            self.retransmissionfiles[fl] = self.all_retransmissionfiles[fl]
            if self.retransmissionfiles[fl].retrans_num <= self.retransmissionlimitnum:
                # The download failed, but treat the file as downloaded and drop
                # it from upload_file_list; it stays queued for retransmission.
                self.upload_file_list.pop(fl)
                Logger.getlogging().debug(
                    'download fail file {fl}:{num}th fail'.format(
                        fl=fl, num=self.all_retransmissionfiles[fl].retrans_num))
            else:
                # The download failed, but treat the file as downloaded and drop
                # it from upload_file_list; the retry limit is reached, so also
                # remove it from the retransmission list.
                self.upload_file_list.pop(fl)
                self.retransmissionfiles.pop(fl)
                Logger.getlogging().debug(
                    'download fail file {fl}:more than {num}th fail'.format(
                        fl=fl,
                        num=self.all_retransmissionfiles[fl].retrans_num - 1))
    return valid_json_files

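# Hedged reading of the two match branches above: DOWNLOAD_FORMAT1 appears to
# match a done file covering the whole uploaded file, so the upload entry is
# removed at once; DOWNLOAD_FORMAT2 appears to capture a pair such as
# (part_index, part_total) for split results, so the entry is removed only
# when the last part arrives. A toy completion test under that assumption:
def _split_complete_sketch(value):
    # value is assumed to be a (part_index, part_total) pair of strings,
    # e.g. ('3', '3') for the final part of three.
    return value[0] == value[1]
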
def s2queryurl(query, website, url, onlywrite=False):
    sitename = str(website)
    if '.' in sitename:
        sitename = sitename[sitename.rindex('.') + 1:]
    if not onlywrite:
        SpiderReport.removequerysite(query, sitename)
        SpiderReport.getinstance().s2urlsitemap[Common.md5(url.strip())] = sitename
        SpiderReport.updates2site(query, sitename, SpiderReport.URL_UPLOAD, 1)
    FileUtility.writeline(SpiderReport.getinstance().s2urlfilepath,
                          SpiderReport.S2URL_FORMAT.format(query=query,
                                                           website=sitename,
                                                           url=url))

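# Hedged sketch: s2queryurl() reduces a website object's string form to its
# last dot-separated component before reporting. Standalone version of that
# rule; the example value is hypothetical, not taken from this codebase.
def _sitename_sketch(website):
    sitename = str(website)
    if '.' in sitename:
        sitename = sitename[sitename.rindex('.') + 1:]
    return sitename

# _sitename_sketch('site.tencent.TencentS2Query') -> 'TencentS2Query'
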
def __init__(self):
    self.reportlist = {}
    self.s2sitereportlist = {}
    self.s2urlfilepath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_S2_QUERY_URLS_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(self.s2urlfilepath)
    self.totalreport = Report()
    self.totalreport.channel = 'SUM'
    self.s1urls = []
    self.querysitesmap = {}
    self.s2sitenum = 0
    self.s2urlsitemap = {}

def upload(self, path):
    tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    # Compute the timestamp once so the logged name and the copied name match.
    ts = int(time.time())
    destfile = '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                       filename=filename,
                                                       ts=ts)
    FileUtility.mkdirs(self.download_path)
    Logger.getlogging().debug(path + '--->' + destfile)
    FileUtility.copy(path, destfile)
    return True

def s2upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        for line in lines:
            try:
                query = line.strip()
                self.conf.setchannel(SPIDER_CHANNEL_S2)
                self.conf.setquery(query)
                URLFileManager.getinstance().generateurlfilepath()
                allsite = self.factory.getall()
                for site in allsite:
                    site.s2query(query)
            except:
                Logger.printexception()

def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                        const.DOWNLOADER_URL_BACKUP),
                              TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))

def upload(self, path):
    retans = RetransInfo()
    retans.filename = FileUtility.getfilename(path)
    retans.start_time = int(time.time())
    self.uploadfile_retranslist[retans.filename] = retans
    self.upload_file_list[FileUtility.getfilename(path)] = []
    if self.sshupload(path):
        return True
    else:
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
        param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
            file=FileUtility.getfilename(path), taskid=self.info.ip)
        SpiderNotify.notify(param)
        return False

def findmax(self):
    filelist = FileUtility.getfilelist(self.pucbackpath, [])
    tf = {}
    for f in filelist:
        t = int(re.findall(r'(\d+)', f)[-1])
        tf[t] = f
    if not tf:
        return 0
    tm = max(tf.keys())
    # Remove every backup older than the newest timestamp.
    for f in filelist:
        t = int(re.findall(r'(\d+)', f)[-1])
        if t < tm:
            Logger.getlogging().info('REMOVE {file}'.format(file=f))
            FileUtility.remove(f)
    return tm

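# Hedged sketch: findmax() treats the last run of digits in each backup file
# name as its timestamp, keeps the newest file, and deletes the rest. The
# extraction rule on its own; the file name below is illustrative.
import re

def _file_ts_sketch(name):
    return int(re.findall(r'(\d+)', name)[-1])

# _file_ts_sketch('puc_backup_20d_1514736000.txt') -> 1514736000
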
def backupfile(self, jsonfile):
    urlmap = {}
    splitkey = '.'
    if '_split' in jsonfile:
        splitkey = '_split'
    bkfile = self.urlbackuppath + '/' + FileUtility.getfilename(jsonfile).split(splitkey)[0]
    if FileUtility.exists(bkfile):
        with open(bkfile, 'r') as bkfh:
            for line in bkfh.readlines():
                line = line.strip()
                if line in urlmap:
                    urlmap[line] += 1
                else:
                    urlmap[line] = 1
    return urlmap

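# Hedged sketch: backupfile() counts how many times each url line occurs in
# the backup file. collections.Counter expresses the same accounting; shown
# only as a cross-check of the loop above, not as the project's method.
from collections import Counter

def _count_lines_sketch(lines):
    return Counter(line.strip() for line in lines)
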
def upload(self, upfiles):
    Logger.getlogging().debug('uploading ......')
    for file in upfiles:
        if self.emptyfile(file):
            Logger.getlogging().info('remove empty file: ' + file)
            FileUtility.remove(file)
            continue
        if not self.__upload__(file):
            Logger.log(FileUtility.getfilename(file), constant.ERRORCODE_FAIL_LOAD_UP)
            return False
        Logger.getlogging().info('remove uploaded file: ' + file)
        FileUtility.remove(file)
        time.sleep(1)
    return True

def retrydownload(self, jsonfile, urlset):
    Logger.getlogging().warning(
        'upload failed urls {num}'.format(num=len(urlset)))
    context = URLFileManager.getinstance().geturlfilecontext(
        FileUtility.getfilename(jsonfile))
    if context.retry >= 2:
        Logger.getlogging().error('do not upload for failed again')
        for key in urlset.keys():
            Logger.getlogging().error('download {url} failed'.format(url=key))
    else:
        urls = []
        for key in urlset.keys():
            Logger.getlogging().warning('retry download {url}'.format(url=key))
            for i in range(0, urlset[key]):
                urls.append(key)
        newurlfile = URLFileManager.getinstance().generateurlfilepath(context.retry + 1)
        Logger.getlogging().warning('Retry download URL {file}'.format(file=newurlfile))
        if constant.POST_FILE_SUFFIX in jsonfile:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_POST)
        elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_WEBKIT)
        else:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_COMMON)

def renewfilename(self, file):
    """Generate a new url file name stamped with the current time."""
    filename = FileUtility.getfilename(file)
    context = URLFileManager.getinstance().geturlfilecontext(filename)
    if not context:
        return False
    if self.filetime == int(time.time()):
        time.sleep(1)
    self.filetime = int(time.time())
    newfilename = filename.replace(re.findall(r'\d+', filename)[-1],
                                   str(self.filetime))
    urlsfile = self.tempurlpath + newfilename
    context.filename = urlsfile
    URLFileManager.getinstance().updateurlfilecontext(
        FileUtility.getfilename(urlsfile), context)
    return urlsfile

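# Hedged sketch: renewfilename() swaps the last run of digits in the file name
# for the current timestamp, sleeping 1s when needed so two renames within the
# same second cannot collide. The rename rule alone; names and values below
# are illustrative.
import re

def _renew_name_sketch(filename, ts):
    return filename.replace(re.findall(r'\d+', filename)[-1], str(ts))

# _renew_name_sketch('urls_s2_1514736000.txt', 1514736123) -> 'urls_s2_1514736123.txt'
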
def download(self):
    """
    Downloading from the platform takes two steps, while Windows requests the
    data directly and only performs step 2: download().
    step 1: download data from the platform to the local ./data/platform
    step 2: copy data from ./data/platform to ./data/temp/done, then store the
            parsed json data in ./data/temp/json
    """
    files = []
    Logger.getlogging().debug('Get Valid PUC File From ' + self.download_path)
    #srclist = self.getvalidfiles(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])[0:self.maxfilenum]
    for donefile in srclist:
        try:
            if donefile.endswith('done'):
                Logger.getlogging().info('MOVE {file} TO {path}'.format(
                    file=donefile, path=self.done_file))
                FileUtility.move(donefile, self.done_file)
                binfile = os.path.join(self.done_file, FileUtility.getfilename(donefile))
                #FileUtility.copy(donefile, self.cache_path)
                #binfile = self.cache_path + FileUtility.getfilename(donefile)
                #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                #    # Back up today's puc files.
                #    Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile, path=self.pucbacktoday))
                #    FileUtility.move(donefile, self.pucbacktoday)
                #    if FileUtility.exists(donefile):
                #        Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                #else:
                #    Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                jsonfile = self.bin2json(binfile)
                files.append(jsonfile)
                try:
                    self.s3puc_dumpurls(jsonfile)
                    time.sleep(0.5)
                    Logger.getlogging().debug('Remove {f}'.format(f=jsonfile))
                    FileUtility.remove(jsonfile)
                    donefile2 = os.path.join(self.done_file, FileUtility.getfilename(donefile))
                    Logger.getlogging().debug('Remove {f}'.format(f=donefile2))
                    FileUtility.remove(donefile2)
                except:
                    Logger.printexception()
                    Logger.getlogging().error(
                        'no json file generate from done file:{done}'.format(done=binfile))
                    os.mknod(jsonfile)
        except:
            Logger.printexception()
    return files

def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                  const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    # Compute the timestamp once so both output paths share it.
    ts = int(time.time())
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix,
                                                   date=date.replace('-', '_'), ts=ts)
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix,
                                                         date=date.replace('-', '_'), ts=ts)
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                           const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)

def bin2json(self, file):
    filename = FileUtility.getfilename(file).replace('.done', '.json')
    cmd = PUCDownloader.PARSE_COMMAND.format(command=self.parse_tool,
                                             input=file,
                                             output=self.json_path,
                                             filename=filename)
    self.execute(cmd)
    return self.json_path + filename

def getall(self):
    resdict = None
    if FileUtility.exists(self.dbfile):
        database = bsddb.btopen(self.dbfile, 'r')
        resdict = {}
        for key in database.keys():
            resdict[key] = database[key]
        database.close()
    return resdict

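# Hedged sketch: getall() snapshots a Berkeley DB btree file into a plain dict
# using bsddb, the Python 2 stdlib binding. Writing the same kind of store
# would look like this; `dbfile` and `items` are illustrative names.
import bsddb

def _btree_write_sketch(dbfile, items):
    db = bsddb.btopen(dbfile, 'c')  # 'c' creates the file if it is missing
    for k, v in items:
        db[k] = v  # keys and values must be byte strings
    db.close()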