Example #1
 def aggregate_curcomments(self):
     # Aggregate the comments that have not yet been pushed in this run
     sqlf = 'SELECT {url},{content},{publish} from {table} where {key1} is null'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                       url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                       content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                       publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                       key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
     cmtsresults = SQLDAO.getinstance().execute(sql, find=True)
     for cmtsresult in cmtsresults:
         urlmd5 = Common.md5(cmtsresult[0])
         content = self.strfilter(cmtsresult[1])
         publish = TimeUtility.getinttime(cmtsresult[2])
         if urlmd5 not in self.url_curcmtcontent_map:
             self.url_curcmtcontent_map[urlmd5] = []
         self.url_curcmtcontent_map[urlmd5].append(content + '_' + str(int(publish)))
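For reference, a minimal, self-contained sketch of the aggregation pattern above: comments are grouped under the md5 of their article url, and each entry is stored as "<content>_<publish timestamp>". The md5_hex helper and the plain tuples are illustrative stand-ins for Common.md5 and the rows returned by SQLDAO; they are not the project's API.

import hashlib
from collections import defaultdict

def md5_hex(url):
    # Illustrative stand-in for Common.md5: hex digest of the UTF-8 encoded url
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def aggregate_comments(rows):
    """rows: iterable of (url, content, publish_timestamp) tuples."""
    url_curcmtcontent_map = defaultdict(list)
    for url, content, publish in rows:
        # Group comments under the md5 of their article url and store each
        # entry as "<content>_<publish time as int>", as in aggregate_curcomments
        url_curcmtcontent_map[md5_hex(url)].append('{0}_{1}'.format(content, int(publish)))
    return url_curcmtcontent_map

if __name__ == '__main__':
    rows = [('http://example.com/a', 'nice', 1500000000),
            ('http://example.com/a', 'great', 1500000100)]
    print(aggregate_comments(rows))

Keying on the md5 rather than the raw url keeps the map key a fixed-length token, which is convenient for comparison and storage.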
Example #2
 def fileformat(self):
     self.aggregate_beforenewsinfo()
     self.aggregate_beforenewsnum()        
     self.aggregate_curcomments()
     self.aggregate_curcmtnum()
     self.aggregate_beforecmtsnum()
     self.dereplicate()
     urllist = []
     idlist = []
     newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
     for result in results:
         doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
         url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()            
         try:
             urlmd5 = Common.md5(url)
             channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
             title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
             body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
             commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
             comments = ' '.join(commentlist)
             pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
             crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
             type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
             query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
             # Incremental push of the comment count
             #     First push (full): if the comments for this url have not been fetched before (key1 not marked 1), push the full count.
             #                        In that case push cmtnum from the news row when it is present, otherwise push the cmtnum aggregated from the comments table (already collected in url_curcmtnum_map).
             #     Later pushes (incremental): if some comments for this url were already fetched (key1 partly marked 1), push the increment, i.e. the cmtnum aggregated from the comments table (url_curcmtnum_map).
             cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
             if cmtkey1flag <= 0:
                 cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                 if cmtnum < 0:
                     cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             else:
                 cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             # Other incrementally pushed counters
             clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
             clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
             votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
             votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
             fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
             fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
             string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                            url=url,
                                                            title=self.strfilter(title),
                                                            body=self.strfilter(body),
                                                            comments=comments,
                                                            cmtnum=cmtnum,
                                                            clicknum=clicknum,
                                                            votenum=votenum,
                                                            fansnum=fansnum,
                                                            pubtime=TimeUtility.getinttime(pubtime),
                                                            crawlertime=crawlertime,
                                                            type=type,
                                                            query=self.strfilter(query))   
             Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
             if not title:
                 FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8)) 
             else:
                 FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))  
       
             if id not in idlist:
                 idlist.append(id)
             if title and commentlist:
                 if url not in urllist:
                     urllist.append(url)
         except:
             Logger.getlogging().error(str(result))
             Logger.printexception()
     # Already extracted in this run: set the key1 flag to 1
     self.updatenewsflag(idlist)
     self.updatecommentsflag(urllist)
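The push rule spelled out in the comments above can be isolated as a small pure function, which makes the branching easier to test. This is a sketch under assumed semantics: before_flag stands for url_beforecmtnum_map.get(urlmd5, -1), news_cmtnum for the count stored on the news row (-1 when absent), and aggregated_cmtnum for the value collected in url_curcmtnum_map.

def choose_cmtnum(before_flag, news_cmtnum, aggregated_cmtnum):
    # First push (nothing fetched before, flag <= 0): push the full count,
    # preferring the count stored on the news row when it is present (>= 0).
    if before_flag <= 0:
        if news_cmtnum >= 0:
            return news_cmtnum
        return aggregated_cmtnum
    # Later pushes: only the increment aggregated from the comments table.
    return aggregated_cmtnum

assert choose_cmtnum(-1, 120, 5) == 120   # first push, news row has a count
assert choose_cmtnum(-1, -1, 5) == 5      # first push, fall back to the aggregation
assert choose_cmtnum(3, 120, 5) == 5      # incremental push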
Example #3
    def dumpurls(self):
        # Dump the url list for the queries assigned to this machine and write it to the corresponding file
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Selecting urls to insert/update for the local url file upload')
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # Do not delete the commented-out query below
            # 1. Convert data within the recent period
            #    1.1 publishdate is set: keep rows published within the period
            #    1.2 publishdate is 0: fall back to the create date, keep rows created within the period
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. Extract rows whose createdate equals this run's start time
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1
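dumpurls keeps only rows that fall inside the recent window: rows published within the last period days, or, when no publish date is recorded, rows created within that window (updatedb below applies the inverse of the same test). A standalone sketch of that predicate, assuming plain Unix timestamps and using EMPTY_PUBLISH as a stand-in for TimeUtility.getintformtime(0):

import time

EMPTY_PUBLISH = 0  # stand-in for TimeUtility.getintformtime(0), i.e. "no publish date recorded"

def within_period(publish_ts, create_ts, period_days, now=None):
    """Return True if the row still falls inside the recent window of period_days."""
    now = time.time() if now is None else now
    window = period_days * 24 * 60 * 60
    if publish_ts == EMPTY_PUBLISH:
        # No publish date: fall back to the create date
        return now - create_ts <= window
    return now - publish_ts <= window

# Rows for which this is False are the candidates updatedb moves to the cold table.
assert within_period(EMPTY_PUBLISH, time.time() - 3600, 7) is True
assert within_period(time.time() - 30 * 86400, 0, 7) is False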
Example #4
 def updatedb(self):
     # Do not delete the commented-out query below
     #wheref = '{key1}={val1} and \
     #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
     #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
     #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
     #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
     #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
     #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
     #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
     #secs =self.period * 24*60*60
     #)
     where = {SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG}
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
     colddata = []
     for result in results:
         data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         try:
             publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
             createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
             if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate > self.period * 24*60*60) or \
                (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) > self.period * 24*60*60):
                 id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                 colddata.append(result)
                 SQLDAO.getinstance().delete(
                     SQLDAO.SPIDER_TABLE_NEWS,
                     {SQLDAO.SPIDER_TABLE_NEWS_ID: id})
         except:
             Logger.printexception()
             Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                        constant.ERRORCODE_WARNNING_OTHERS)
     if colddata:
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     colddata,
                                     mutli=True)
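updatedb implements a hot/cold split: rows that fall outside the window are copied into SPIDER_TABLE_NEWS_COLD and deleted from the main table. Below is a self-contained sketch of that archive-then-delete pattern using sqlite3 with hypothetical table and column names (news, news_cold, id, url, create_date) and a simplified filter on create_date only; the real code goes through SQLDAO, applies the publish-date fallback shown above, and inserts the collected rows as one batch.

import sqlite3, time

def archive_cold_rows(conn, period_days):
    cutoff = time.time() - period_days * 24 * 60 * 60
    cur = conn.cursor()
    # Collect the expired rows first, then insert into the cold table and delete
    # from the hot table, mirroring the batch insert into SPIDER_TABLE_NEWS_COLD.
    cold = cur.execute('SELECT id, url, create_date FROM news WHERE create_date < ?',
                       (cutoff,)).fetchall()
    if cold:
        cur.executemany('INSERT INTO news_cold (id, url, create_date) VALUES (?, ?, ?)', cold)
        cur.executemany('DELETE FROM news WHERE id = ?', [(row[0],) for row in cold])
    conn.commit()
    return len(cold)

conn = sqlite3.connect(':memory:')
conn.executescript('''
    CREATE TABLE news (id INTEGER PRIMARY KEY, url TEXT, create_date REAL);
    CREATE TABLE news_cold (id INTEGER PRIMARY KEY, url TEXT, create_date REAL);
    INSERT INTO news VALUES (1, 'http://example.com/old', 0),
                            (2, 'http://example.com/new', strftime('%s','now'));
''')
print(archive_cold_rows(conn, 7))  # -> 1, only the stale row is archived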