def show():
    """Write the news rows created in the current run to the debug log.

    Rows are read from ``SQLDAO.SPIDER_TABLE_NEWS`` where the create-date
    column equals ``SpiderConfigure.getinstance().starttime()``.  A banner
    line and a tab-separated header line are logged first, then one line
    per row rendered with ``NewsStorage.NEWS_FORMAT``.
    """
    Logger.getlogging().debug(
        'Now, Results Extract From Database Showing: ')
    header = u'channel\tquery\tcmtnum\tclicknum\tfansnum\tvotenum\tpublishdate\tcreatedate\turl'
    Logger.getlogging().debug(header)

    dao = SQLDAO.getinstance()
    criteria = {
        SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
        SpiderConfigure.getinstance().starttime()
    }
    rows = dao.find(SQLDAO.SPIDER_TABLE_NEWS, criteria)

    # Keyword name handed to NEWS_FORMAT.format -> column key of the row.
    columns = (
        ('channel', SQLDAO.SPIDER_TABLE_NEWS_CHANNEL),
        ('query', SQLDAO.SPIDER_TABLE_NEWS_QUERY),
        ('cmtnum', SQLDAO.SPIDER_TABLE_NEWS_CMTNUM),
        ('clicknum', SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM),
        ('fansnum', SQLDAO.SPIDER_TABLE_NEWS_FANSNUM),
        ('votenum', SQLDAO.SPIDER_TABLE_NEWS_VOTENUM),
        ('publishdate', SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE),
        ('createdate', SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE),
        ('url', SQLDAO.SPIDER_TABLE_NEWS_URL),
    )
    for row in rows:
        record = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)
        values = dict((name, record[column]) for name, column in columns)
        line = NewsStorage.NEWS_FORMAT.format(**values)
        Logger.getlogging().debug(line)
Пример #2
0
 def storetiebaquery(self,
                     query,
                     queryurl,
                     machineflaglist=MACHINEFLAGLIST_TIEBA):
     """Store a tieba query, or refresh it when it is already stored.

     ``query`` and ``queryurl`` are stripped first.  When
     ``QueryStorage.find`` returns a row for the query, that row's update
     date is set to the current start time and its valid flag to 1.
     Otherwise the query is inserted for the machine that currently has the
     smallest count in ``self.querystorage_tieba``.  In both cases the count
     of the machine involved is increased by one.
     """
     query = query.strip()
     queryurl = queryurl.strip()
     existing = QueryStorage.find(query,
                                  machineflaglist,
                                  table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
     if not existing:
         # Unknown query: pick the machine holding the fewest queries and
         # store the query there.
         machine, _ = min(self.querystorage_tieba.iteritems(),
                          key=lambda item: item[1])
         record = {
             SQLDAO.SPIDER_TABLE_QUERYS_ID:
             QueryStorage.getid(query, machine),
             SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
             query,
             SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
             machine,
             SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL:
             queryurl,
             SQLDAO.SPIDER_TABLE_QUERYS_VALID:
             1
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
             SQLDAO.getvaluesfromkeys(record,
                                      SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
     else:
         # Known query: only refresh its update date and valid flag.
         stored = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                     existing)
         machine = stored[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
         queryid = QueryStorage.getid(query, machine)
         refreshed = {
             SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_VALID:
             1
         }
         SQLDAO.getinstance().update(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             {SQLDAO.SPIDER_TABLE_QUERYS_ID: queryid}, refreshed)
     # Keep the running per-machine storage count up to date.
     count = self.querystorage_tieba.get(machine, 0)
     self.querystorage_tieba[machine] = count + 1
Пример #3
0
 def updatedb(self):
     """Move this machine's expired news rows into the cold table.

     All rows of ``SQLDAO.SPIDER_TABLE_NEWS`` whose machine flag equals
     ``ETLController.LOCALMACHINEFLAG`` are inspected.  A row is expired
     when ``SQLDAO.gettime()`` minus its reference time exceeds
     ``self.period * 24*60*60``.  The reference time is the create date when
     the publish date equals ``TimeUtility.getintformtime(0)``, otherwise
     ``TimeUtility.getinttime(publishdate)``.

     Each expired row is deleted from the news table; afterwards all
     expired rows are inserted into ``SQLDAO.SPIDER_TABLE_NEWS_COLD`` in one
     call.  A row that raises while being processed is logged and skipped.
     """
     # Please do not delete the commented-out query below.
     #wheref = '{key1}={val1} and \
     #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
     #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
     #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
     #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
     #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
     #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
     #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
     #secs =self.period * 24*60*60
     #)
     where = {
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
         ETLController.LOCALMACHINEFLAG
     }
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
     # Loop-invariant retention window (the commented-out SQL above compares
     # the same quantity in seconds).
     maxage = self.period * 24 * 60 * 60
     colddata = []
     for result in results:
         data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         try:
             publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
             createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
             # Age is measured from the publish date, falling back to the
             # create date when the publish date is the "zero" time.
             if publishdate == TimeUtility.getintformtime(0):
                 reftime = createdate
             else:
                 reftime = TimeUtility.getinttime(publishdate)
             if SQLDAO.gettime() - reftime > maxage:
                 newsid = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                 colddata.append(result)
                 SQLDAO.getinstance().delete(
                     SQLDAO.SPIDER_TABLE_NEWS,
                     {SQLDAO.SPIDER_TABLE_NEWS_ID: newsid})
         except Exception:
             # Best effort per row; KeyboardInterrupt/SystemExit are no
             # longer swallowed here.
             Logger.printexception()
             Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                        constant.ERRORCODE_WARNNING_OTHERS)
     if colddata:
         # NOTE(review): the keyword is spelled 'mutli' here -- confirm it
         # matches the parameter name of SQLDAO.insert.
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     colddata,
                                     mutli=True)
Пример #4
0
 def fileformat(self):
     """Export every not-yet-exported news row and flag it as exported.

     After running the aggregation and de-duplication steps, all rows of
     ``SQLDAO.SPIDER_TABLE_NEWS`` whose ``SPIDER_TABLE_NEWS_KEY1`` column is
     null are rendered with ``FileFormat.DEFAULT_NEWS_FORMAT``.  A row with
     an empty title is written to ``self.errorinfopath``, any other row to
     ``self.outputpath``.  Rows that raise while being rendered are logged
     and skipped.

     Finally the ids of all written rows are passed to
     ``self.updatenewsflag`` and the URLs of rows that had both a title and
     comments are passed to ``self.updatecommentsflag``.
     """
     self.aggregate_beforenewsinfo()
     self.aggregate_beforenewsnum()
     self.aggregate_curcomments()
     self.aggregate_curcmtnum()
     self.aggregate_beforecmtsnum()
     self.dereplicate()
     # The lists keep first-seen order for the flag updaters; the sets make
     # the duplicate checks O(1) instead of scanning the lists.
     urllist = []
     idlist = []
     seenurls = set()
     seenids = set()
     newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
     for result in results:
         doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         newsid = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
         url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
         try:
             urlmd5 = Common.md5(url)
             channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
             title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
             body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
             commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
             comments = ' '.join(commentlist)
             pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
             crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
             newstype = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
             query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
             # Incremental push of the comment count:
             #   First push (full): when url_beforecmtnum_map has no positive
             #       entry for this URL, push the news row's cmtnum unless
             #       it is negative, in which case push the count aggregated
             #       in url_curcmtnum_map.
             #   Later pushes (incremental): push the count aggregated in
             #       url_curcmtnum_map.
             cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
             if cmtkey1flag <= 0:
                 cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                 if cmtnum < 0:
                     cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             else:
                 cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             # The other counters go through self.increment.
             clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
             clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
             votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
             votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
             fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
             fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
             string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                            url=url,
                                                            title=self.strfilter(title),
                                                            body=self.strfilter(body),
                                                            comments=comments,
                                                            cmtnum=cmtnum,
                                                            clicknum=clicknum,
                                                            votenum=votenum,
                                                            fansnum=fansnum,
                                                            pubtime=TimeUtility.getinttime(pubtime),
                                                            crawlertime=crawlertime,
                                                            type=newstype,
                                                            query=self.strfilter(query))
             Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
             # Rows without a title go to the error file, not the output.
             if not title:
                 FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
             else:
                 FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))

             if newsid not in seenids:
                 seenids.add(newsid)
                 idlist.append(newsid)
             if title and commentlist and url not in seenurls:
                 seenurls.add(url)
                 urllist.append(url)
         except Exception:
             # Best effort per row; KeyboardInterrupt/SystemExit are no
             # longer swallowed here.
             Logger.getlogging().error(str(result))
             Logger.printexception()
     # Everything collected above has been exported: update the flags.
     self.updatenewsflag(idlist)
     self.updatecommentsflag(urllist)
Пример #5
0
    def dumpurls(self):
        """Register the news URLs of this machine's queries for crawling.

        The local queries are obtained from
        ``QueryStorage.getinstance().getlocalquerys``.  For every query:

        1. Each distinct URL of the query's rows on this machine that is
           still inside the retention window (``self.period * 24*60*60``
           relative to ``SQLDAO.gettime()``) is passed to
           ``NewsStorage.seturlinfos``.  The window is measured from the
           publish date, or from the create date when the publish date
           equals ``TimeUtility.getintformtime(0)``.
        2. Every row created in the current run is handed to
           ``URLManager.getinstance().storeurl`` with a ``URLContext`` of
           type ``S1_MAIN_BODY``.
        """
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        # Loop-invariant retention window (the commented-out SQL below
        # compares the same quantity in seconds).
        maxage = self.period * 24 * 60 * 60
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # Please do not delete the commented-out query below.
            # 1. Rows inside the retention period
            #  1.1 publish date present: it lies within the period
            #  1.2 publish date is 0: the create date lies within the period
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            # Only membership is needed here, so a set replaces the list.
            seenurls = set()
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                # Age is measured from the publish date, falling back to the
                # create date when the publish date is the "zero" time.
                if publishdate == TimeUtility.getintformtime(0):
                    reftime = createdate
                else:
                    reftime = TimeUtility.getinttime(publishdate)
                if SQLDAO.gettime() - reftime <= maxage and url not in seenurls:
                    seenurls.add(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)

            # 2. Rows whose create date is the start time of this run.
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
                SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
 def getdoc(url):
     """Look up the news row stored for ``url``.

     The row is searched in ``SQLDAO.SPIDER_TABLE_NEWS`` by the id that
     ``NewsStorage.getid(url)`` yields (``multi=False``), and the result of
     ``SQLDAO.getdictdata`` for that row is returned.
     """
     dao = SQLDAO.getinstance()
     criteria = {SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url)}
     row = dao.find(SQLDAO.SPIDER_TABLE_NEWS, criteria, multi=False)
     return SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)