def aggregate_curcomments(self):
    # Aggregate the comments that have not yet been pushed in this run.
    sqlf = 'SELECT {url},{content},{publish} from {table} where {key1} is null'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                      publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
    cmtsresults = SQLDAO.getinstance().execute(sql, find=True)
    for cmtsresult in cmtsresults:
        urlmd5 = Common.md5(cmtsresult[0])
        content = self.strfilter(cmtsresult[1])
        publish = TimeUtility.getinttime(cmtsresult[2])
        if urlmd5 not in self.url_curcmtcontent_map:
            self.url_curcmtcontent_map[urlmd5] = []
        self.url_curcmtcontent_map[urlmd5].append(content + '_' + str(int(publish)))
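# A minimal sketch (illustrative only, not part of the pipeline) of the map shape
# built by aggregate_curcomments(): each url's md5 keys a list of
# '<filtered content>_<int publish timestamp>' strings, which fileformat() below
# joins with spaces into the comments field. The url and values are made up.
#
#   url_curcmtcontent_map = {}
#   urlmd5 = Common.md5('http://example.com/news/1')
#   url_curcmtcontent_map.setdefault(urlmd5, []).append('great article' + '_' + str(1500000000))
#   ' '.join(url_curcmtcontent_map[urlmd5])  # -> 'great article_1500000000'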
def fileformat(self):
    self.aggregate_beforenewsinfo()
    self.aggregate_beforenewsnum()
    self.aggregate_curcomments()
    self.aggregate_curcmtnum()
    self.aggregate_beforecmtsnum()
    self.dereplicate()
    urllist = []
    idlist = []
    newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
    for result in results:
        doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
        url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
        try:
            urlmd5 = Common.md5(url)
            channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
            title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
            body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
            commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
            comments = ' '.join(commentlist)
            pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
            crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
            type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
            query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
            # Incremental push of the comment count:
            # First push (full): if the comments for this url have never been
            # fetched (key1 not marked 1), push the full count -- use cmtnum from
            # the news table when it is > 0, otherwise the count from the comments
            # table (already aggregated into url_curcmtnum_map).
            # Later pushes (incremental): if some comments were already fetched
            # (key1 partly marked 1), push only the increment, i.e. the count from
            # the comments table (already aggregated into url_curcmtnum_map).
            cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
            if cmtkey1flag <= 0:
                cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                if cmtnum < 0:
                    cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            else:
                cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            # Other incremental counters.
            clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
            clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
            votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
            votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
            fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
            fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
            string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                           url=url,
                                                           title=self.strfilter(title),
                                                           body=self.strfilter(body),
                                                           comments=comments,
                                                           cmtnum=cmtnum,
                                                           clicknum=clicknum,
                                                           votenum=votenum,
                                                           fansnum=fansnum,
                                                           pubtime=TimeUtility.getinttime(pubtime),
                                                           crawlertime=crawlertime,
                                                           type=type,
                                                           query=self.strfilter(query))
            Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(
                channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
            if not title:
                FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
            else:
                FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))
            if id not in idlist:
                idlist.append(id)
            if title and commentlist:
                if url not in urllist:
                    urllist.append(url)
        except:
            Logger.getlogging().error(str(result))
            Logger.printexception()
    # Already extracted: flip the key1 flag to 1 for these rows.
    self.updatenewsflag(idlist)
    self.updatecommentsflag(urllist)
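# Hedged sketch of the comment-count push rule above, pulled out as a standalone
# function with plain arguments (the argument names are illustrative; the real
# code reads self.url_beforecmtnum_map, the news doc, and self.url_curcmtnum_map):
# the first push sends the full count, preferring the news table's cmtnum when it
# is recorded; every later push sends only the newly aggregated increment.
def choose_cmtnum(before_flag, news_cmtnum, cur_increment):
    if before_flag <= 0:
        # never pushed before (key1 unmarked): push the full count
        return news_cmtnum if news_cmtnum >= 0 else cur_increment
    # already pushed at least once: push only the increment
    return cur_increment

# For example (made-up numbers): choose_cmtnum(-1, 25, 3) -> 25 on the first push,
# while choose_cmtnum(25, -1, 3) -> 3 on a later, incremental push.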
def dumpurls(self):
    # Dump the url list for the queries assigned to this machine and store it
    # in the corresponding file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug('Now starting to select urls to insert/update for the local url file upload!')
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Do not delete the commented-out code below.
        # 1. Convert the data within the period:
        #    1.1 publishdate exists: within the last week
        #    1.2 publishdate is 0: fall back to the create date, within the last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #                      key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #                      createdate=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                      starttime=SpiderConfigure.getinstance().starttime(),
        #                      time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #                      time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #                      time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                      timeformat='\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #                      secs=self.period * 24 * 60 * 60)
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG}
        Logger.getlogging().debug('Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate <= self.period * 24 * 60 * 60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24 * 60 * 60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract the rows whose createdate equals this run's start time.
        URLFileManager.getinstance().generateurlfilepath()
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                 SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()}
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1
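# Hedged sketch of the retention-window test used in dumpurls() (and inverted in
# updatedb() below), as a self-contained function. zero_ts stands in for
# TimeUtility.getintformtime(0), and all timestamps are assumed to be epoch
# seconds (the real code converts publishdate via TimeUtility.getinttime first).
def within_period(publish_ts, create_ts, now_ts, zero_ts, period_days):
    window = period_days * 24 * 60 * 60
    if publish_ts == zero_ts:
        # no publish date recorded: fall back to the create (crawl) time
        return now_ts - create_ts <= window
    return now_ts - publish_ts <= window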
def updatedb(self):
    # Do not delete the commented-out code below.
    #wheref = '{key1}={val1} and \
    #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
    #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
    #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
    #                      time0='\"'+TimeUtility.getuniformtime(0)+'\"',
    #                      time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
    #                      time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
    #                      timeformat='\"'+TimeUtility.SQLTIMEFORMAT+'\"',
    #                      secs=self.period * 24 * 60 * 60)
    where = {SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG}
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
    colddata = []
    for result in results:
        data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        try:
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate > self.period * 24 * 60 * 60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) > self.period * 24 * 60 * 60):
                id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                colddata.append(result)
                SQLDAO.getinstance().delete(SQLDAO.SPIDER_TABLE_NEWS,
                                            {SQLDAO.SPIDER_TABLE_NEWS_ID: id})
        except:
            Logger.printexception()
            Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                       constant.ERRORCODE_WARNNING_OTHERS)
    if colddata:
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    colddata, mutli=True)
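# Hedged sketch of the hot-to-cold migration pattern implemented by updatedb():
# expired rows are collected first, deleted from the hot table one by one, then
# written to the cold table in a single batch insert so the cold write stays
# cheap. dao, hot_table, cold_table, keys, is_expired, and get_id are all
# illustrative stand-ins for the SQLDAO names used in the real method; the
# 'mutli' keyword spelling follows the original call above.
def migrate_cold(rows, dao, hot_table, cold_table, keys, is_expired, get_id):
    colddata = [row for row in rows if is_expired(row)]
    for row in colddata:
        # delete each expired row from the hot table, keyed by its id column
        dao.delete(hot_table, {'id': get_id(row)})
    if colddata:
        # one batch insert into the cold table for all expired rows
        dao.insert(cold_table, keys, colddata, mutli=True)
    return len(colddata)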