def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def waibuetl(self):
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)
    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning('{waibufile} not generate!'.format(waibufile=waibufile))
        return
    outtime = 0
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info('sleeping {sec}s......'.format(sec=self.waitingperiod))
            #time.sleep(self.waitingperiod)
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except:
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning('Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
                continueflag = False
                break
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup + FileUtility.getfilename(dfile)):
            #    FileUtility.remove(waibubackup + FileUtility.getfilename(dfile))
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(file=FileUtility.getfilename(dfile),
                                                                           start=starttime,
                                                                           end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
        if outtime >= self.waibutimeout:
            Logger.getlogging().warning('Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
            continueflag = False
            break
def __download__(self, downloaderlist):
    valid_json_files = []
    for impl in downloaderlist:
        json_files = impl.download()
        for dfile in json_files:
            for ufile in self.upload_file_list.keys():
                if RegexUtility.match(Downloader.DOWNLOAD_FORMAT1.format(file=ufile), dfile):
                    self.upload_file_list.pop(ufile)
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                elif RegexUtility.match(Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile):
                    value = RegexUtility.parse(Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile)[0]
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                    if value[0] == value[1]:
                        self.upload_file_list.pop(ufile)
        retransmissionfiles = impl.outtimefiles()
        for fl in retransmissionfiles.keys():
            # download failed or timed out
            if fl not in self.all_retransmissionfiles:
                self.all_retransmissionfiles[fl] = retransmissionfiles[fl]
            self.all_retransmissionfiles[fl].retrans_num += 1
            self.all_retransmissionfiles[fl].taskinfo = impl
            self.retransmissionfiles[fl] = self.all_retransmissionfiles[fl]
            if self.retransmissionfiles[fl].retrans_num <= self.retransmissionlimitnum:
                # the download failed, but treat it as downloaded and remove it from upload_file_list
                self.upload_file_list.pop(fl)
                Logger.getlogging().debug('download fail file {fl}:{num}th fail'.format(
                    fl=fl, num=self.all_retransmissionfiles[fl].retrans_num))
            else:
                # the download failed, but treat it as downloaded and remove it from upload_file_list;
                # also stop retransmitting and drop it from the retransmission list
                self.upload_file_list.pop(fl)
                self.retransmissionfiles.pop(fl)
                Logger.getlogging().debug('download fail file {fl}:more than {num}th fail'.format(
                    fl=fl, num=self.all_retransmissionfiles[fl].retrans_num - 1))
    return valid_json_files
def download(self):
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.info.donepath)
    srclist = self.sshls(self.info.donepath)
    for donefile in srclist:
        donefile = donefile.strip()
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.mkdirs(self.info.localdonepath)
                    self.sshdownload(donefile)
                    dfile = self.info.localdonepath + FileUtility.getfilename(donefile)
                    if self.info.jsonpath:
                        dfile = self.bin2json(dfile)
                    files.append(dfile)
                    self.download_time = int(time.time())
                    self.upload_file_list.pop(upfile)
                    self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(dfile):
                        Logger.getlogging().error('no json file generate from done file:{done}'.format(done=dfile))
                    break
    return files
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
            sitelist = []
            for site in self.factory.getall():
                if site.exists2():
                    sitelist.append(site)
            SpiderReport.loadquery(querylist)
            SpiderReport.loadsites(sitelist)
def getall(self):
    resdict = None
    if FileUtility.exists(self.dbfile):
        database = bsddb.btopen(self.dbfile, 'r')
        resdict = {}
        for key in database.keys():
            resdict[key] = database[key]
        database.close()
    return resdict
def getqueryfromdb(self):
    # target path of the s2 query output file
    s2file = SpiderConfigure.getinstance().gets2file()
    temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    QueryStorage.getinstance().getlocalquerys(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
def removecachefile():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    if FileUtility.exists(cache):
        FileUtility.move(cache, databackupfolder)
        FileUtility.rmdir(cache)
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
    databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
    if FileUtility.exists(databackuppath):
        validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
        for s in os.listdir(databackuppath):
            fullpath = os.path.join(databackuppath, s)
            #Logger.getlogging().info('remove cache folder ' + fullpath)
            #FileUtility.rmdir(fullpath)
            if s < validdate:
                fullpath = os.path.join(databackuppath, s)
                Logger.getlogging().info('remove cache folder ' + fullpath)
                FileUtility.rmdir(fullpath)
def gettiebaqueryfromdb(self):
    # target path of the s2 (tieba) query output file
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_S3_INPUT_FILE)
    temppath = Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
    QueryStorage.getinstance().getlocalquerys_tieba(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
def storagequery(self):
    QueryStorage.updatedb()
    SpiderConfigure.getinstance().setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    if FileUtility.exists(s2file):
        lines = FileUtility.readlines(s2file)
        for strquery in lines:
            QueryStorage.getinstance().storequery(strquery)
            QueryStorage.getinstance().storewaibuquery(strquery)
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_S3_INPUT_FILE)
    if FileUtility.exists(tiebafile):
        lines = FileUtility.readlines(tiebafile)
        for strquery in lines:
            if not self.checks3query(strquery):
                continue
            query = strquery.split('\t')[0].strip()
            url = strquery.split('\t')[1].strip()
            QueryStorage.getinstance().storetiebaquery(query, url)
def getvalue(self, key):
    value = None
    if key in SpiderDao.cachedict:
        value = SpiderDao.cachedict[key]
    elif FileUtility.exists(self.dbfile):
        database = bsddb.btopen(self.dbfile, 'r')
        if database.has_key(key):
            value = database[key]
            SpiderDao.cachedict[key] = value
        database.close()
    return value
def recoverfile(self, filename):
    # look up the backup path, then restore the file to the target directory ./data/temp/urls
    filelist = FileUtility.getfilelist(self.urlbackuppath, [])
    tempfilepath = os.path.join(self.urlbackuppath, filename)
    if tempfilepath in filelist:
        newfilepath = self.renewfilename(tempfilepath)
        FileUtility.copy(tempfilepath, newfilepath)
        time.sleep(0.5)
        if FileUtility.exists(newfilepath):
            return newfilepath
    return False
def s2upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        for line in lines:
            try:
                query = line.strip()
                self.conf.setchannel(SPIDER_CHANNEL_S2)
                self.conf.setquery(query)
                URLFileManager.getinstance().generateurlfilepath()
                allsite = self.factory.getall()
                for site in allsite:
                    site.s2query(query)
            except:
                Logger.printexception()
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            rows = []
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    rows.append(line)
                    if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                        s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                        FileUtility.writelines(s1tempfile, rows)
                        rows = []
            if rows:
                s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                FileUtility.writelines(s1tempfile, rows)
                rows = []
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def backupfile(self, jsonfile):
    urlmap = {}
    splitkey = '.'
    if '_split' in jsonfile:
        splitkey = '_split'
    bkfile = self.urlbackuppath + '/' + FileUtility.getfilename(jsonfile).split(splitkey)[0]
    if FileUtility.exists(bkfile):
        with open(bkfile, 'r') as bkfh:
            for line in bkfh.readlines():
                line = line.strip()
                if line in urlmap:
                    urlmap[line] += 1
                else:
                    urlmap[line] = 1
    return urlmap
def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix,
                                                   date=date.replace('-', '_'), ts=int(time.time()))
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix,
                                                         date=date.replace('-', '_'), ts=int(time.time()))
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)
def mkcachedir():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    FileUtility.rmdir(cache)
    FileUtility.mkdirs(cache)
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_OUTPUT_TEMP_PATH))
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
    outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH)
    if FileUtility.exists(outputpath):
        validdate = TimeUtility.getuniformdatebefore(limit)
        for s in os.listdir(outputpath):
            if s < validdate:
                fullpath = os.path.join(outputpath, s)
                FileUtility.rmdir(fullpath)
def s1upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        self.conf.setchannel(SPIDER_CHANNEL_S1)
        self.conf.setquery('')
        URLFileManager.getinstance().generateurlfilepath()
        for line in lines:
            try:
                url = line.strip()
                params = PageBasicInfo()
                params.url = url
                #NewsStorage.seturlinfos(params)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            except:
                Logger.printexception()
def loadfile(self, tablename, filepath, retrycount=0):
    try:
        self.delete(tablename, {}, False)
        jsonlist = []
        if not FileUtility.exists(filepath):
            return False
        with open(filepath, 'r') as fp:
            for line in fp.readlines():
                if not line.strip():
                    continue
                jsonlist.append(json.loads(line.strip()))
        if not jsonlist:
            return False
        self.insert(tablename, jsonlist)
        return True
    except:
        if retrycount == self.MAX_RETRY_TIMES:
            Logger.printexception()
            return False
        retrycount += 1
        return self.loadfile(tablename, filepath, retrycount)
def download(self):
    """
    Downloading from the platform takes two steps, while on Windows the data is requested
    directly and only step 2 (download()) is performed:
    step 1: download data from the platform to the local ./data/platform
    step 2: copy the data from ./data/platform to ./data/temp/done, then store the parsed
            JSON data under ./data/temp/json
    """
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])
    for donefile in srclist:
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            self.download_time = time.time()
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.copy(donefile, self.cache_path)
                    binfile = self.cache_path + FileUtility.getfilename(donefile)
                    if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                        Logger.getlogging().info('Remove {file}'.format(file=donefile))
                        FileUtility.remove(donefile)
                        if FileUtility.exists(donefile):
                            Logger.getlogging().error('Remove {file} failed'.format(file=donefile))
                    else:
                        Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                    jsonfile = self.bin2json(binfile)
                    files.append(jsonfile)
                    uploadtime = self.uploadfile_retranslist[upfile].start_time
                    if RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT1.format(file=upfile), filename):
                        self.upload_file_list.pop(upfile)
                        self.uploadfile_retranslist.pop(upfile)
                    elif RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename):
                        value = RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                        if value[0] == value[1]:
                            self.upload_file_list.pop(upfile)
                            self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(jsonfile):
                        Logger.getlogging().error('no json file generate from done file:{done}'.format(done=binfile))
                        os.mknod(jsonfile)
                    # update upload time
                    keys = self.sortkeys()
                    for fl in keys:
                        if self.uploadfile_retranslist[fl].start_time >= uploadtime:
                            self.uploadfile_retranslist[fl].start_time = time.time()
                            time.sleep(0.1)
                    break
    return files
def remove(self, keys):
    if FileUtility.exists(self.dbfile):
        database = bsddb.btopen(self.dbfile, 'w')
        for key in keys:
            del database[key]
        database.close()
def show(self):
    diffinfolist = {}
    predict = self.database.getall()
    instances = URLStorage.getinstances()
    Logger.getlogging().info(
        '##############################################################################################')
    Logger.getlogging().info('%8s|%8s|%8s|%8s|%8s|%8s|%8s|%20s|%16s' %
                             ('key', 'flag', 'cmtnum', 'clicknum', 'votenum',
                              'fansnum', 'realnum', 'pubtime', 'timestamp'))
    for ins in instances.keys():
        diffinfolist[ins] = DiffInfomation()
        if ins != constant.SPIDER_CHANNEL_S1:
            diffinfolist[ins].channel = constant.SPIDER_CHANNEL_S2
            diffinfolist[ins].query = ins
        for key in instances[ins].urlinfodict:
            if instances[ins].urlinfodict[key].realnum > 0:
                StatisticsManager.updategotcomments(1)
            elif instances[ins].urlinfodict[key].cmtnum > 0:
                StatisticsManager.updatefailgotcomment(1)
            if predict and key in predict:
                info = URLCommentInfo.fromstring(predict[key])
                if not instances[ins].urlinfodict[key].isequal(info):
                    self.printinfo(ins, info, '-')
                    self.printinfo(ins, instances[ins].urlinfodict[key], '+')
                    if instances[ins].urlinfodict[key].cmtnum > 0:
                        diffinfolist[ins].deltacmt += self.diff(instances[ins].urlinfodict[key].cmtnum, info.cmtnum)
                    else:
                        diffinfolist[ins].deltacmt += self.diff(instances[ins].urlinfodict[key].realnum, info.realnum)
                    diffinfolist[ins].deltaclick += self.diff(instances[ins].urlinfodict[key].clicknum, info.clicknum)
                    diffinfolist[ins].deltavote += self.diff(instances[ins].urlinfodict[key].votenum, info.votenum)
                    diffinfolist[ins].deltafans += self.diff(instances[ins].urlinfodict[key].fansnum, info.fansnum)
            else:
                self.printinfo(ins, instances[ins].urlinfodict[key], '+')
                if instances[ins].urlinfodict[key].cmtnum > 0:
                    diffinfolist[ins].deltacmt += instances[ins].urlinfodict[key].cmtnum
                else:
                    diffinfolist[ins].deltacmt += max(0, instances[ins].urlinfodict[key].realnum)
                diffinfolist[ins].deltaclick += max(0, instances[ins].urlinfodict[key].clicknum)
                diffinfolist[ins].deltavote += max(0, instances[ins].urlinfodict[key].votenum)
                diffinfolist[ins].deltafans += max(0, instances[ins].urlinfodict[key].fansnum)
    Logger.getlogging().info(
        '##############################################################################################')
    if FileUtility.exists(self.difffile):
        FileUtility.remove(self.difffile)
    for key in diffinfolist.keys():
        Logger.getlogging().info(diffinfolist[key].tostring())
        FileUtility.writeline(self.difffile, diffinfolist[key].tostring())