def setclick(self, params):
        """Store the counters carried by a JSON array response.

        ``params.content`` is parsed as JSON and indexed positionally; the
        comment, click, vote and fans counters are then written for
        ``params.originalurl`` through ``NewsStorage.seturlinfo``.

        Any ``Exception`` raised while parsing or storing is reported via
        ``Logger.printexception()`` and is not re-raised.
        """
        try:
            content = json.loads(params.content)
            # Layout per the original note:
            # content = [play count, comments, X, X, danmaku, favorites,
            #            vote count, X]
            # Falsy counters (None, "", 0, ...) are normalised to 0.
            clicknum = content[0] or 0
            cmtnum = content[1] or 0
            votenum = content[-2] or 0
            fansnum = content[-3] or 0
            NewsStorage.seturlinfo(
                params.originalurl,
                data={
                    SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: cmtnum,
                    SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: clicknum,
                    SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: votenum,
                    SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: fansnum,
                })
        except Exception:
            # Best-effort: a malformed response must not abort the crawl, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            Logger.printexception()
Exemplo n.º 2
0
    def step1(self, params):
        """Extract article metadata from the page and queue the first comment page.

        The article JSON embedded in ``params.content`` is parsed, the fields
        that carry a value are stored for ``params.originalurl`` through
        ``NewsStorage.seturlinfo``, and the URL of the first comment page is
        handed to ``self.storeurl`` for ``Kr36Comments.STEP_2``.

        Any ``Exception`` is reported via ``Logger.printexception()`` and is
        not re-raised.
        """
        try:
            Logger.getlogging().info("Kr36Comments.STEP_1")
            cid = self.r.parse(r'^http://36kr.com/p/(\d+)\.html', params.originalurl)[0]

            # The article object sits between these two markers in the page's
            # inline script.
            page_content = params.content.split(
                '<script>var props={"detailArticle|post":')[1].split(
                ',"abTest|abtest":')[0]

            # Parse the JSON text directly; no eval() round-trip on scraped
            # text is needed.
            json_content = json.loads(page_content)
            counters = json_content["counters"]

            info_content = json_content["content"]
            if info_content:
                # Strip HTML tags from the article body.
                info_content = re.sub(r'</?\w+[^>]*>', '', info_content)

            candidates = (
                ("title", json_content["title"]),
                ("clicknum", counters["view_count"]),
                ("votenum", counters["like"]),
                ("fansnum", counters["favorite"]),
                ("publishdate",
                 TimeUtility.getformattime(json_content["published_at"])),
                ("body", info_content),
                ("cmtnum", counters["comment"]),
            )
            # Only fields that actually carry a value are stored. An empty or
            # zero field is simply omitted instead of leaving a local unbound
            # and aborting the whole step.
            data = {key: value for key, value in candidates if value}
            NewsStorage.seturlinfo(params.originalurl, "", "", data)

            # Build the first comment page URL from the article id.
            commentinfo_url = Kr36Comments.COMMENT_URL.format(cid, self.page_size, 1)
            self.storeurl(commentinfo_url, params.originalurl, Kr36Comments.STEP_2, {'cid': cid})
        except Exception:
            Logger.printexception()
 def getinfo(self, params):
     """Store article metadata taken from a JSON response.

     ``params.content`` is parsed as JSON; the title, read, praise and
     favorite counters and the publish time (passed through
     ``TimeUtility.getuniformtime``) are read from its ``'article'`` object
     and stored for ``params.originalurl`` through ``NewsStorage.seturlinfo``.

     Any ``Exception`` is reported via ``Logger.printexception()`` and is not
     re-raised.
     """
     try:
         article = json.loads(params.content)['article']
         data = {
             "title": article['title'],
             "clicknum": article['readnum'],
             "votenum": article['praisenum'],
             "fansnum": article['favoritenum'],
             "publishdate": TimeUtility.getuniformtime(article['publishtime']),
         }
         NewsStorage.seturlinfo(params.originalurl, '', '', data)
     except Exception:
         # Best-effort: log and continue, but let KeyboardInterrupt/SystemExit
         # propagate.
         Logger.printexception()
Exemplo n.º 4
0
 def wb_updatedb(self):
     """Write every entry held in ``URLManager.waibustorage`` to news storage.

     The channel is set to ``constant.SPIDER_CHANNEL_S2`` once and the query
     is set per key. Each entry's title, body and publish date are stored
     under its URL; entries whose type is ``constant.SPIDER_S2_WEBSITE_VIDEO``
     additionally store their click count.
     """
     self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
     for query in URLManager.waibustorage:
         self.conf.setquery(query)
         for entry in URLManager.waibustorage[query]:
             is_video = entry.type == constant.SPIDER_S2_WEBSITE_VIDEO
             fields = {
                 SQLDAO.SPIDER_TABLE_NEWS_TITLE: entry.title,
                 SQLDAO.SPIDER_TABLE_NEWS_BODY: entry.body,
                 SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE: entry.pubtime,
             }
             if is_video:
                 # Only video entries carry a click count.
                 fields[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = entry.clicknum
             NewsStorage.seturlinfo(entry.url, data=fields)