def seturlinfos(params):
    """Persist the crawl results carried by *params* for params.url.

    If a row for the URL already exists it is updated in place (the
    publish date is only overwritten while it still holds the epoch
    placeholder); otherwise a full new row is inserted.
    """
    news_id = NewsStorage.getid(params.url)
    # Fields written in both the update and the insert path.
    data = {
        SQLDAO.SPIDER_TABLE_NEWS_TITLE: Common.strfilter(params.title),
        SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: params.cmtnum,
        SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: params.clicknum,
        SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: params.fansnum,
        SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: params.votenum,
        SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
    }
    # Video pages carry no body text.
    if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
        data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
    if NewsStorage.exist(params.url):
        doc = NewsStorage.getdoc(params.url)
        placeholder = TimeUtility.getintformtime(0)
        # Only fill in the publish date if the stored one is still the
        # epoch placeholder (i.e. was never determined).
        if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                   placeholder) == placeholder:
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                TimeUtility.getuniformtime(params.pubtime)
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                    data)
    else:
        # New row: add the identity / bookkeeping columns as well.
        data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
        data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
            TimeUtility.getuniformtime(params.pubtime)
        data[SQLDAO.SPIDER_TABLE_NEWS_ID] = news_id
        data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
        data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
        data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
        data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
        data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = \
            NewsStorage.LOCALMACHINEFLAG
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data))
    def getclick(self, params):
        """Queue the play-count request for a LeTV video page and, when
        the stored publish date is still unknown, try to extract it from
        the page content.

        params.originalurl is the video page URL; params.content is its
        fetched HTML (assumption based on usage here — confirm upstream).
        """
        pattern = r'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
        if re.search(pattern, params.originalurl):
            if self.r.search(pattern, params.originalurl):
                vid = self.r.parse(pattern, params.originalurl)[0]
                playcount_url = self.PALYCOUNT_URL.format(vid=vid)
                self.storeurl(playcount_url, params.originalurl,
                              LeComments.STEP_PALY)

        if NewsStorage.getpublishdate(
                params.originalurl) == TimeUtility.getintformtime(0):
            if self.r.search(r'https?://sports\.le\.com/video/\d+\.html',
                             params.originalurl):
                # Sports channel only: read the publish time from the page.
                pubTime = XPathUtility(
                    params.content).getstring('//*[@class="live-vedio-infor"]')
                # BUG FIX: the original passed the not-yet-defined name
                # ``publishdate`` to getuniformtime (NameError); the value
                # to normalize is ``pubTime`` extracted above.
                publishdate = TimeUtility.getuniformtime(pubTime)
                NewsStorage.setpublishdate(params.originalurl, publishdate)
            else:
                # Variety-show channel only: publish date is an 8-digit
                # date embedded in the title.
                title = XPathUtility(params.content).getstring(
                    '//h1[@class="j-video-name video-name"]')
                if title:
                    if re.search(r'\d{8}', title):
                        publishdate = re.findall(r'\d{8}', title)[0]
                        NewsStorage.setpublishdate(params.originalurl,
                                                   publishdate)
 def __init__(self):
     # Per-URL crawl-result container; fields are filled by the spider
     # pipeline and later persisted (e.g. via NewsStorage.seturlinfos).
     # database row id (empty until assigned)
     self.id = ''
     # query string this URL was crawled for
     self.query = SpiderConfigure.getinstance().getquery()
     # channel identifier
     self.channel = SpiderConfigure.getinstance().getchannel()
     # website / content type
     self.type = ''
     # URL
     self.url = ''
     # page title
     self.title = ''
     # main body text / original post
     self.body = ''
     # (comment contents / replies are stored elsewhere)
     # comment count; -1 is the "not crawled" sentinel
     self.cmtnum = -1
     # read / play count increment; -1 sentinel
     self.clicknum = -1
     # vote ("like") count; -1 sentinel
     self.votenum = -1
     # fan / subscriber count; -1 sentinel
     self.fansnum = -1
     # publish time, defaults to the epoch placeholder
     self.pubtime = TimeUtility.getintformtime(0)
     # crawl batch start time (createtime)
     self.createtime = SpiderConfigure.getinstance().starttime()
Пример #4
0
 def updatedb(self):
     """Archive expired news rows into the cold-storage table.

     Rows owned by this machine (MACHINEFLAG == LOCALMACHINEFLAG) are
     expired when older than ``self.period`` days, measured from the
     publish date — or from the create date while the publish date is
     still the epoch placeholder.  Expired rows are deleted from the
     main table and bulk-inserted into SPIDER_TABLE_NEWS_COLD.
     """
     # NOTE: keep the commented-out SQL below for reference
     # (original comment said: do not delete).
     #wheref = '{key1}={val1} and \
     #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
     #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
     #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
     #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
     #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
     #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
     #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
     #secs =self.period * 24*60*60
     #)
     where = {
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
         ETLController.LOCALMACHINEFLAG
     }
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
     colddata = []
     # Hoisted loop invariants: expiry window in seconds, the epoch
     # placeholder for "publish date unknown", and the reference time.
     period_secs = self.period * 24 * 60 * 60
     epoch = TimeUtility.getintformtime(0)
     now = SQLDAO.gettime()
     for result in results:
         data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         try:
             publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
             createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
             if (publishdate == epoch and now - createdate > period_secs) or \
                (publishdate != epoch and now - TimeUtility.getinttime(publishdate) > period_secs):
                 # renamed from ``id`` to avoid shadowing the builtin
                 news_id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                 colddata.append(result)
                 SQLDAO.getinstance().delete(
                     SQLDAO.SPIDER_TABLE_NEWS,
                     {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id})
         except Exception:
             # BUG FIX: was a bare ``except:`` which also swallowed
             # SystemExit/KeyboardInterrupt; log and continue per row.
             Logger.printexception()
             Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                        constant.ERRORCODE_WARNNING_OTHERS)
     if colddata:
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     colddata,
                                     mutli=True)
 def getlastpublish(url, before=True):
     """Return the latest comment publish date stored for *url*.

     When *before* is True only comments created before the current
     run's start time are considered.  Falls back to the epoch
     placeholder from TimeUtility.getintformtime(0) when no row matches.
     """
     # NOTE(review): *url* is interpolated directly into the SQL text;
     # callers must not pass untrusted strings unless SQLDAO escapes them.
     if before:
         where = '{urlkey}=\"{url}\" and {datekey} < {date}'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
             url=url,
             datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
             date=SpiderConfigure.getinstance().starttime())
     else:
         where = '{urlkey}=\"{url}\"'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL, url=url)
     sql = 'SELECT MAX({key}) FROM {table} WHERE {where}'.format(
         key=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
         table=SQLDAO.SPIDER_TABLE_COMMENTS,
         where=where)
     rows = SQLDAO.getinstance().execute(sql, find=True)
     latest = rows[0][0]
     return latest if latest else TimeUtility.getintformtime(0)
Пример #6
0
 def fileformat(self):
     """Write one formatted output line per not-yet-exported news row.

     Rows whose key1 flag is NULL are read from the news table, joined
     with the aggregated comment data, formatted with
     FileFormat.DEFAULT_NEWS_FORMAT and appended to ``self.outputpath``
     (rows with an empty title go to ``self.errorinfopath`` instead).
     Afterwards the exported news ids and comment urls are flagged so the
     next run pushes increments only.
     """
     self.aggregate_beforenewsinfo()
     self.aggregate_beforenewsnum()
     self.aggregate_curcomments()
     self.aggregate_curcmtnum()
     self.aggregate_beforecmtsnum()
     self.dereplicate()
     urllist = []
     idlist = []
     # key1 IS NULL => row has not been exported yet
     newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
     for result in results:
         doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
         url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
         try:
             urlmd5 = Common.md5(url)
             channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
             title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
             body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
             commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
             comments = ' '.join(commentlist)
             pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
             crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
             type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
             query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
             # Incremental comment-count push:
             #   First (full) push: if this URL's comments were never
             #   exported (no key1 flag set), push the full count — use
             #   the news row's cmtnum when > 0, otherwise the count
             #   aggregated from the comments table (url_curcmtnum_map).
             #   Subsequent (incremental) pushes: if some comments were
             #   already exported (key1 partially flagged), push only the
             #   aggregated increment from url_curcmtnum_map.
             cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
             if cmtkey1flag <= 0:
                 cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                 if cmtnum < 0:
                     cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             else:
                 cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             # Other incremental counters (click / vote / fan counts).
             clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
             clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
             votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
             votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
             fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
             fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
             string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                            url=url,
                                                            title=self.strfilter(title),
                                                            body=self.strfilter(body),
                                                            comments=comments,
                                                            cmtnum=cmtnum,
                                                            clicknum=clicknum,
                                                            votenum=votenum,
                                                            fansnum=fansnum,
                                                            pubtime=TimeUtility.getinttime(pubtime),
                                                            crawlertime=crawlertime,
                                                            type=type,
                                                            query=self.strfilter(query))
             Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
             # Rows without a title are routed to the error file.
             if not title:
                 FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
             else:
                 FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))

             if id not in idlist:
                 idlist.append(id)
             if title and commentlist:
                 if url not in urllist:
                     urllist.append(url)
         except:
             Logger.getlogging().error(str(result))
             Logger.printexception()
     # Already exported: set the key1 flag to 1 on these rows.
     self.updatenewsflag(idlist)
     self.updatecommentsflag(urllist)
Пример #7
0
    def dumpurls(self):
        """Dump the URL lists for the queries owned by this machine.

        Per query, two passes over the news table:
          1. refresh stored info (NewsStorage.seturlinfos) for URLs whose
             publish/create date falls inside the configured period;
          2. queue URLs created in the current run into the URL manager
             as webkit requests.
        """
        # Dump this machine's per-query url list and store it into the
        # corresponding file.
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # NOTE: keep the commented-out SQL below for reference
            # (original comment said: do not delete).
            # 1. Convert data inside the period:
            #    1.1 publishdate present: within the last period
            #    2.1 publishdate is the placeholder: use create time, within the last period
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                # In-period when the publish date (or the create date if
                # the publish date is the epoch placeholder) is younger
                # than self.period days.
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. Extract rows whose createdate equals this run's start time.
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
                SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1