def process(self, params):
        """"""
        #过滤掉不在范围内的网站或url
        patterns = [
            '^http[s]{0,1}://(news|fashion|women|mil|health|cul|travel|history|learning|book|star.news|sports|(music\.)?yule|baobao|chihe|it|business|mgame)\.sohu\.com/.*',
            '^http[s]{0,1}://pic\.\w+\.sohu\.com/.*',
            '^http[s]{0,1}://gongyi\.sohu\.com/.*',
            '^http[s]{0,1}://pic\.book\.sohu\.com/.*',
            '^http[s]{0,1}://tv\.sohu\.com/.*',
            '^http[s]{0,1}://my\.tv\.sohu\.com/.*',
            '^http[s]{0,1}://www\.sohu\.com.*',
            '^http[s]{0,1}://p\.weather\.com\.cn.*'
        ]

        flag = False
        for pattern in patterns:
            if self.r.search(pattern, params.originalurl):
                flag = True
                break
        if not flag:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
            return
        if self.r.search('https?://www\.sohu\.com/a/\d+_\d+',
                         params.originalurl):
            self.process_new_a(params)
        elif self.r.search(
                '^http[s]{0,1}://(www|news)\.sohu\.com.*|^http[s]{0,1}://p\.weather\.com\.cn.*',
                params.originalurl):
            ChangyanComments(self).process(params)
        else:
            self.process_video(params)
Example #2
 def step1(self, params):
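     # Step1: extract the Huxiu article id, read the current comment count from the page, and queue incremental comment-page urls.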
     pattern = 'https://www.huxiu.com/article/(\d+).html'
     if not self.r.search(pattern, params.originalurl):
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
         return
     else:
         object_id = self.r.parse(pattern, params.originalurl)[0]
     curcmtnum = XPathUtility(
         params.content).getnumber('//*[@class="article-pl pull-left"]')
     if not curcmtnum:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     NewsStorage.setcmtnum(params.originalurl, curcmtnum)
     dbcmtnum = CMTStorage.getcount(params.originalurl, True)
     if dbcmtnum >= curcmtnum:
         return
     # Loop to build the comment-page urls
     pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
     if pages >= self.maxpages:
         pages = self.maxpages
     for page in range(1, pages + 1):
         #self.POST_DATA['object_id'] = object_id
         #self.POST_DATA['page'] = page
         #self.storeposturl(self.POST_URL, params.originalurl, HuxiupostComments.EACH, self.POST_DATA)
         commonurl = self.COMMONURL.format(object_id=object_id, page=page)
         self.storeurl(commonurl, params.originalurl,
                       HuxiupostComments.EACH)
 def step3(self, params):
     """通过评论的url获取评论"""
     #相对之前的版本,本次更新变动:
     #comments存储的接口为CMTStorage.storecmt(),参数为originalurl, 评论内容, 评论发布时间, 用户
     #存储的内容增加了 评论发布时间, 用户
     try:
         jsondata = json.loads(params.content)
         if jsondata['comments']:
             for comment in jsondata['comments']:
                 content = comment['content']
                 curtime = TimeUtility.getuniformtime(
                     comment['create_time'])
                 nick = comment['passport']['nickname']
                 if not CMTStorage.exist(params.originalurl, content,
                                         curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content,
                                         curtime, nick)
                 reply = comment.get('comments')
                 while reply:
                     next_reply = None
                     for subcomment in reply:
                         content = subcomment['content']
                         curtime = TimeUtility.getuniformtime(
                             subcomment['create_time'])
                         nick = subcomment['passport'].get(
                             'nickname', 'anonymous')
                         if not CMTStorage.exist(params.originalurl,
                                                 content, curtime, nick):
                             CMTStorage.storecmt(params.originalurl,
                                                 content, curtime, nick)
                         next_reply = subcomment.get('comments')
                     # follow the nested reply chain instead of rebinding the outer loop variable
                     reply = next_reply
     except:
         Logger.printexception()
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
Example #4
 def baidutiebasearch_step2(self, params):
     # Step2: from the returned content, use xpath //*[@class="nums"] to get the maximum total count
     # Fetch the search results of the first page
     self.baidutiebasearch_step3(params)        
     # Get the tail page number
     xparser = XPathUtility(html=params.content)
     pager_search = xparser.xpath('//*[@class="pager pager-search"]')
     queryurl = ''
     if pager_search:
         tailpageurl = xparser.xpath('//*[@class="pager pager-search"]/a[last()]/@href')
         try:
             if tailpageurl:
                 lists = tailpageurl[0].split('pn=')
                 queryurl = 'http://tieba.baidu.com'+lists[0]
                 tailpage = int(lists[1])
                 if tailpage > BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE:
                     tailpage = BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE
                 if tailpage > self.maxpages:
                     tailpage = self.maxpages                
             else:
                 tailpage = 1                
         except:
             tailpage = 1
     else:
         # No search results; return directly
         Logger.log(params.url, constant.ERRORCODE_EXCEPTTION_JSON)
         return
     if not queryurl:
         return
     # Based on tailpage above, build the search result urls for every page except the first
     querylist = []
     for page in range(2, tailpage + 1, 1):
         url = queryurl + 'pn={page}'.format(page=page)
         querylist.append(url)
     self.__storeqeuryurllist__(querylist, BaiduTiebaS2Query2.BAIDU_TIEBA_SEARCH_EACH_PAGE)
Example #5
    def step2_ac(self, params):
        """只适用在腾讯动漫视频部分,获取评论的url列表"""
        url_id = params.customized['url_id']
        xhtml = etree.HTML(params.content)
        # Fetching the comment count frequently fails with a parameter error
        comments_count = xhtml.xpath(
            '//*[@id="pagination-node"]/span/em/text()')
        if comments_count:
            comments_count = int(comments_count[0])
        else:
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
        page_size = len(xhtml.xpath('//*[@class="comment-content-detail"]'))
        if not page_size:
            # guard against division by zero when no comment nodes are rendered on the page
            page_size = comments_count
        # Incremental check
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        NewsStorage.setcmtnum(params.originalurl, comments_count)
        if cmtnum >= comments_count:
            return

        page_num = int(math.ceil((float(comments_count) / page_size)))
        if int(page_num) >= self.maxpages:
            page_num = self.maxpages
        for page in range(1, page_num + 1):
            url = self.AC_COMMENTS_URL.format(url_id, page)
            self.storeurl(url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE)
    def step2(self, params):
	# Get client_id
	liteloadApi  = params.customized['liteloadApi']
	client_id  = params.customized['client_id']
	topic_url  = params.customized['topic_url']
	commentsApi = params.customized['commentsApi']
	# Get the comment count
	content = json.loads(params.content)
	curcmtnum = float(content.get('cmt_sum',0))
	NewsStorage.setcmtnum(params.originalurl, curcmtnum) 
	dbcmtnum = CMTStorage.getcount(params.originalurl, True)
	if dbcmtnum >= curcmtnum:
	    return
	# Get topicId
	topic_id = content.get('topic_id','')
	if not topic_id:
	    Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
	    return
	# Loop to build the comment urls
	pages = int(math.ceil(float(curcmtnum - dbcmtnum) / ChangyanComments.PAGE_SIZE))
	if pages >= self.maxpages:
	    pages = self.maxpages
	for page in range(1, pages + 1, 1):
	    # Build the comment url
	    #COMMENTS_URL = 'http://changyan.sohu.com/api/{commentsApi}/topic/comments?client_id={client_id}&page_no={page_no}&page_size={page_size}&topic_id={topic_id}'	    
	    url = ChangyanComments.COMMENTS_URL.format(commentsApi=commentsApi,
	                                               client_id=client_id, 
	                                               page_no = page,
	                                               page_size = ChangyanComments.PAGE_SIZE,
	                                               topic_id=topic_id,
	                                               )
	    self.storeurl(url, params.originalurl, ChangyanComments.STEP_3)
Example #7
    def step2(self, params):
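        # Step2: walk the search result entries, keep urls whose publish time and title match the query, and store the url list.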
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        text_divs = soup.select('.s_r_txt')
        urllist = []

        if text_divs:
            for item in text_divs:
                title = item.select_one('h3 > a').get_text()
                url = item.select_one('h3 > a').get('href')
                curtime = item.select('p')[-1].get_text().strip()
                try:
                    if TimeUtility.compareNow(
                            TimeUtility.getuniformtime(curtime),
                            self.querylastdays):
                        if Common.checktitle(info, title):
                            urllist.append(url)
                        else:
                            Logger.log(
                                url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    else:
                        Logger.log(url,
                                   constant.ERRORCODE_WARNNING_NOMATCHTIME)
                except:
                    urllist.append(url)
        # Store the collected url list after processing all results
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #8
 def gets2url(self, params):
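     # Build video page urls from the video_list JSON, filtering by publish time and by title match against the query.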
     # Parse the response text
     contents = json.loads(params.content)
     query = Common.urldec(params.customized['query'])
     urllist = []
     for item in contents['video_list']:
         try:
             vid = item['vid']
             if item.get('categoryName', '') == u"体育":
                 url = 'http://sports.le.com/video/{vid}.html'.format(
                     vid=vid)
             else:
                 url = 'http://www.le.com/ptv/vplay/{vid}.html'.format(
                     vid=vid)
             curtime = item['ctime']
             #print TimeUtility.getuniformtime(curtime)
             title = item['name']
             if self.compareNow(curtime):
                 if self.checktitle(query, title):
                     #Logger.getlogging().info(title)
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
     # Store the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def flush():
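        # Dump download-failed s1 urls and no-result s2 queries, write the spider report file, and notify when the success rate drops below the threshold.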
        # dump s1 download failed url
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
        SpiderConfigure.getinstance().setquery('')
        for url in SpiderReport.getinstance().s1urls:
            Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
        # dump queries for which no url was obtained from the website
        querynositemap = {}
        for query in SpiderReport.getinstance().querysitesmap.keys():
            querynositemap[query] = 0
            for site in SpiderReport.getinstance().querysitesmap[query]:
                SpiderReport.s2queryurl(query, site, None, True)
                querynositemap[query] += 1
#
        for query in SpiderReport.getinstance().querysitesmap.keys():
            if query in querynositemap:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
            else:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum, True)
#
        # report
        filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                             const.SPIDER_INFO_REPORT_FILE).format(
            date=TimeUtility.getcurrentdate())
        FileUtility.remove(filename)
        FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
            ch='CHANNEL',
            query='QUERY',
            type='TYPE',
            v1='UPLOAD',
            v2='DOWNLOAD',
            v3='NO_TEMPLATE',
            v4='NO_SITE',
            v5='WITH_CMT',
            v6='FAILED'
        ))
        for key in SpiderReport.getinstance().reportlist.keys():
            for type in SpiderReport.getinstance().reportlist[key].keys():
                r = SpiderReport.getinstance().reportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        for key in SpiderReport.getinstance().s2sitereportlist.keys():
            for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
                r = SpiderReport.getinstance().s2sitereportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
        FileUtility.flush()
        threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_FAILED_THRESHOLD))
        rate = SpiderReport.getinstance().totalreport.getsuccess()
        if rate < threshold:
            Logger.getlogging().warning('success rate is lower than threshold')
            param = NotifyParam()
            param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
            param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                      th=Common.float2percent(
                                                                                          threshold))
            SpiderNotify.notify(param)
    def step2(self, params):
        """获取评论的其他url"""
        try:
            comments = json.loads(params.content)
            topic_id = comments['topic_id']
            curcmtnum = float(comments.get('cmt_sum', -1))
            #clicknum = float(comments.get('participation_sum',-1))
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
            #NewsStorage.setclicknum(params.originalurl, clicknum)

            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return
            page_num = int(
                math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1):
                if self.r.search('http[s]{0,1}://.*tv\.sohu.com/.*',
                                 params.originalurl):
                    url = self.COMMENTS_URL.format(self.tv_client_id, topic_id,
                                                   page, self.tv_page_size)
                else:
                    url = self.COMMENTS_URL.format(self.client_id, topic_id,
                                                   page, self.page_size)
                self.storeurl(url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except:
            Logger.printexception()
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
 def step1(self, params):
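     # Step1: read the result page count from the search page, handle page 1 directly via step2, and queue the remaining page urls.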
     # Single videos on the search result page
     info = params.customized['query']
     keyvalue = Common.trydecode(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     page_numlist = soup.select('#sort > .page > a')
     if soup.select_one('.no-result'):
         Logger.log(params.originalurl,
                    constant.ERRORCODE_WARNNING_NORESULTS)
         return
     if page_numlist:
         page_num = int(page_numlist[-2].get_text())
     else:
         page_num = 1
     if page_num >= self.maxpages:
         page_num = self.maxpages
     querylist = []
     for page in range(1, page_num + 1):
         if page == 1:
             self.step2(params)
             continue
         url = S2Query.S2_URL.format(key=keyvalue, page=page)
         querylist.append(url)
     self.__storeqeuryurllist__(querylist, S2Query.STEP_2, {
         'query': info,
         'page_num': page_num
     })
    def step1(self, params):
        # Get the first-page url parameters
        info = params.customized['query']
        xparser = XPathUtility(params.content)
        if not xparser.xpath('//*[@class="mytopic topiclisttr"]'):
            Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
            return
        pageList = xparser.getcomments('//span[@class="right"]/a')
        if len(pageList) == 1:
            pageTotal = 1
        else:
            pageTotal = pageList[len(pageList) - 2]

        if int(pageTotal) >= self.maxpages:
            pageTotal = self.maxpages

        # List of all query urls to loop over
        querylist = []

        # Build the query list from the total page count
        for page in range(1, int(pageTotal) + 1, 1):
            if page == 1:
                self.step2(params)
                continue
            url = hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=info, pn=page)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist,
                                   hupuS2Query.HUPU_S2QUERY_EACH_PAGE,
                                   {'query': info})
Example #13
 def step2(self, params):
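     # Step2: read the thread id and total comment count from the JSON response and queue incremental comment-page urls.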
     jsondata = json.loads(params.content)
     if 'thread' not in jsondata:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     threadid = jsondata['thread']['thread_id']
     curcmtnum = int(jsondata['cursor']['total'])
     # Check the comment count; if there are no comments, return
     if curcmtnum == 0:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     # If the comment count has not increased, return; otherwise update the stored comment count
     NewsStorage.setcmtnum(params.originalurl, curcmtnum)
     dbcmtnum = CMTStorage.getcount(params.originalurl, True)
     if dbcmtnum >= curcmtnum:
         return
     pages = int(
         math.ceil(float(curcmtnum - dbcmtnum) / self.DEFAULT_PAGE_SIZE))
     if pages >= self.maxpages:
         pages = self.maxpages
     for page in range(1, pages + 1, 1):
         url = NarutomVideoComments.COMMENTS_URL.format(
             threadid=threadid,
             limit=NarutomVideoComments.DEFAULT_PAGE_SIZE,
             page=page)
         self.storeurl(url, params.originalurl, NarutomVideoComments.STEP_3)
Example #14
    def step2_ebook(self, params):
        try:
            #"""只适用在QQ阅读部分,获取评论的url列表"""
            bid = params.customized['bid']
            jsoncontent = json.loads(params.content)
            if 'data' not in jsoncontent:
                Logger.log(params.originalurl,
                           constant.ERRORCODE_SITE_NOGET_COMMNETS)
                return
            comments_count = jsoncontent['data']['total']
            page_count = jsoncontent['data']['pageCount']
            # Incremental check
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            if cmtnum >= comments_count:
                return

            # Cap the page count
            if int(page_count) >= self.maxpages:
                page_count = self.maxpages

            for page in range(1, page_count + 1, 1):
                commentinfo_url = self.EBOOK_COMMENTS_URL.format(site='intro',
                                                                 bid=bid,
                                                                 page=page)
                self.storeurl(commentinfo_url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except Exception, e:
            Logger.printexception()
Example #15
 def baidutiebasearch_step3(self, params):
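     # Step3: pull title, href and publish time from each search result post and keep the recently published ones.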
     soup = BeautifulSoup(params.content, 'html5lib')
     post_list = soup.select('.s_post_list > .s_post')
     urllist = []
     for item in post_list:
         try:
             title = item.select_one('.p_title > a').get_text().strip()
             href = item.select_one('.p_title > a').get('href') 
             pubtimeobj = item.find(attrs={'class':'p_green p_date'})
             if not pubtimeobj:
                 Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
                 continue
             pubtime = pubtimeobj.get_text()
             pubtime = getuniformtime(pubtime)
             Logger.getlogging().debug(title)
             Logger.getlogging().debug(pubtime)
             if self.isyestoday(pubtime):
                 Logger.getlogging().debug('https://tieba.baidu.com'+href)
                 urllist.append('https://tieba.baidu.com'+href) 
             else:
                 Logger.log(params.url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)        
Example #16
 def process(self, params):
     # S2 Query Process
     if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
         if SPIDER_S2_WEBSITE_TYPE not in params.customized:
             return True
     xparser = XPathUtility(params.content)
     maxitmes = 0
     pageinfo = PageBasicInfo()
     template = None
     for template in TemplateManager.getxpaths(params.url):
         Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
             url=params.url,
             template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
         pageinfo, items = self.parsefromcontent(params, template, xparser)
         if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
             pageinfo.type = params.customized[
                 constant.SPIDER_S2_WEBSITE_TYPE]
     #if not params.page_title and not pageinfo.title and not params.lastretry:
     #return False
     if template is None:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
     # Overwrite values
     pageinfo.url = params.url
     if not pageinfo.title:
         pageinfo.title = params.page_title
     if not pageinfo.body:
         pageinfo.body = params.page_body
     if not pageinfo.pubtime:
         pageinfo.pubtime = params.html_time
     NewsStorage.seturlinfos(pageinfo)
Example #17
    def step2(self, params):
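        # Step2: parse the video list out of the JSON 'html' field, filter by publish time and title match, and store matched video urls.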
        keyword = params.customized['keyword']
        query = Common.urldec(keyword)
        jsondata = json.loads(params.content)
        # Get the page count
        html = jsondata['html']
        soup = bs(html, 'html5lib')
        videoUrlList = []

        videoList = soup.select('li.video')
        for video in videoList:
            try:
                videoUrl = 'https:' + video.select_one('a').get('href')
                videoUrl = videoUrl.split('?')[0] + '/'
                title = video.select_one('a').get('title')
                pubtime = video.find(attrs={
                    'class': 'so-icon time'
                }).get_text().strip()
                if self.compareNow(TimeUtility.getuniformtime(pubtime)):
                    if self.checktitle(query, title):
                        videoUrlList.append(videoUrl)
                        self.__storeurl__(videoUrl, pubtime,
                                          SPIDER_S2_WEBSITE_VIDEO)
                    else:
                        Logger.log(videoUrl,
                                   constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                Logger.printexception()
 def geturlplaycount(self, params):
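     # Parse the play count for the given tvid from the response and store it as the click number.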
     tvid = params.customized['tvid']
     if not self.r.search(tvid, params.content):
         Logger.log(params.url, constant.ERRORCODE_WARNNING_OTHERS)
         return
     playcount = self.r.getid(tvid, params.content)
     if playcount is not None:
         NewsStorage.setclicknum(params.originalurl, playcount)
 def step1(self, params):
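     # Step1: extract the video id (vod) from the page and queue the comment-count url for step 2.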
     if self.r.search('var\swebcfg\s=\s\{\"id\":(\d+),', params.content):
         vod = self.r.parse('var\swebcfg\s=\s\{\"id\":(\d+),',
                            params.content)[0]
         comments_url = PptvComments.COMMENTS_URL_SUM % (vod, 0)
         self.storeurl(comments_url, params.originalurl,
                       PptvComments.STEP_2, {'vod': vod})
     else:
         Logger.log(params.originalurl, constant.ERRORCODE_WARNNING_OTHERS)
 def process(self, params):
     """Dispatch laohu.com urls to the matching comment processor."""
     if self.r.search('http[s]{0,1}://ff\.laohu\.com.*',params.originalurl):
         LaohuPostComments(self).process(params)
     elif self.r.search('http[s]{0,1}://bbs\.laohu\.com.*',params.originalurl):
         LaohuComments(self).process(params)
     else:
         Logger.getlogging().debug('{url}:40000  Not in task'.format(url=params.originalurl))
         Logger.log(params.originalurl, '40000')
         return
Example #21
 def step2(self, params):
     """获取评论的其他url,及评论"""
     # tempcmttotal accumulates the number of comments fetched in this spider run
     try:
         url_id = params.customized['url_id']
         comment_id = params.customized['comment_id']
         before_update = params.customized['before_update']
         tempcmttotal = params.customized.get('tempcmttotal', 0)
         try:
             jsondata = json.loads(params.content)
             last = jsondata['data']['last']
             hasnext = jsondata['data']['hasnext']
             cmttotal = float(jsondata['data']['total'])
             NewsStorage.setcmtnum(params.originalurl, cmttotal)
         except:
             Logger.log(params.originalurl,
                        constant.ERRORCODE_SITE_NOGET_COMMNETS)
             return
         temptimes = []
         for comment in jsondata['data']['commentid']:
             tempcmttotal += 1
             content = comment['content']
             time = TimeUtility.getuniformtime(comment['time'])
             temptimes.append(time)
             user = comment['userinfo'].get('nick', 'anonymous')
             # Save the comment to the database; the exist interface can be used to check whether it is already stored
             CMTStorage.storecmt(params.originalurl, content, time, user)
         # Conditions that decide whether to keep fetching more comments
         nextflag = True
         if temptimes:
             min_update = min(temptimes)
             max_update = max(temptimes)
             # Publish-time cutoff: within the last two days
             #if max_update < self.cmtlastdays:
             #nextflag = False
             # Publish-time limit: only for the qq comment extraction strategy, whose comments are ordered by decreasing publish time
             if min_update < before_update:
                 nextflag = False
         # Quantity limit
         if tempcmttotal >= self.comment_maxnum:
             nextflag = False
         if float(tempcmttotal) / self.page_size > self.maxpages:
             nextflag = False
         if hasnext and nextflag:
             url = self.COMMENTS_URL.format(url_id, last, self.page_size)
             self.storeurl(
                 url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE, {
                     'url_id': url_id,
                     'comment_id': last,
                     'before_update': before_update,
                     'tempcmttotal': tempcmttotal
                 })
     except:
         Logger.printexception()
Example #22
 def step3(self, params):
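     # Step3: store every parent post from the JSON response as a comment.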
     jsondata = json.loads(params.content)
     if 'parentPosts' not in jsondata:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     parentPosts = jsondata['parentPosts']
     for item in parentPosts:
         curtime = item['created_at']
         content = item['message']
         CMTStorage.storecmt(params.originalurl, content, curtime, '')
Example #23
 def getclick(self, params):
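     # Parse the click count from either a plain or an HTML-escaped <click> tag and store it.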
     Logger.getlogging().debug(params.content.replace('\n', ' ').replace('\r', ''))
     pattern1 = '<click>(\d+)</click>'
     pattern2 = '&lt;click&gt;(\d+)&lt;/click&gt;'
     if self.r.search(pattern1, params.content):
         click = self.r.parse(pattern1, params.content)[0]
         NewsStorage.setclicknum(params.originalurl, int(click))
     elif self.r.search(pattern2, params.content):
         click = self.r.parse(pattern2, params.content)[0]
         NewsStorage.setclicknum(params.originalurl, int(click))     
     else:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
    def step1(self, params):
	if re.search('http://.*\.sohu\.com/', params.originalurl):
	    cmttext = XPathUtility(params.content).getstring('//*[@class="c-num-red"][2]|//*[@id="changyan_parti_unit"]|//*[@class="remark-tit"]')
	    if cmttext:
		try:
		    cmtnum = re.findall('\d+', cmttext)[0]
		except:
		    cmtnum = -1
	    else:
		cmtnum = -1
	    #cmtnum = NewsStorage.getcmtnum(params.originalurl)
	    if int(cmtnum) == -1:
		pass
	    elif int(cmtnum) == 0:
		Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
		return
	else:
	    cmttext = XPathUtility(params.content).xpath('//*[@class="prompt-null-w"]')
	    if cmttext:
		Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
		return
	liteloadApi = ChangyanComments.liteloadApi
	commentsApi = ChangyanComments.commentsApi
	# Get client_id
	if re.match('http://\w+\.sohu\.com.*',params.originalurl):
	    client_id = 'cyqemw6s1'
	elif re.match(r'^http://\w+\.(17173|shouyou|yeyou)\.com/.*',params.originalurl):
	    client_id = 'cyqvqDTV5'
	elif re.match(r'^http://sports\.le\.com/.*', params.originalurl):
	    client_id = 'cyrJ22d8v'
	# Special handling for zdface.com
	elif re.match(r'^http://\w+\.zdface\.com.*', params.originalurl):
	    client_id = 'cyrJOywnM'
	#http://xx.yzz.cn/xiuba/201609/1017135.shtml
	elif re.match(r'^http://\w+\.yzz\.cn.*', params.originalurl):
	    client_id = 'cyrtYf3sa'
	elif re.match(r'^http://\w+\.178\.com.*', params.originalurl):
	    client_id = 'cysrntF12'
	elif re.match(r'^http://.*\.cyol\.com/.*', params.originalurl):
	    client_id = 'cys3X3zo9'
	else:
	    client_id = self.r.getid('appid', params.content)
	topic_url = urllib.quote_plus(params.originalurl)
	#LITELOAD_URL = 'http://changyan.sohu.com/api/{liteloadApi}/topic/liteload?client_id={client_id}&topic_url={topic_url}&topic_source_id={topic_source_id}'
	topic_source_id = self.r.getid('sid',params.content)
	if not topic_source_id:
	    topic_source_id = self.r.getid('data-widget-sid', params.content)
	comment_url = ChangyanComments.LITELOAD_URL.format(liteloadApi=liteloadApi, client_id=client_id, topic_url=topic_url, topic_source_id=topic_source_id) 
	self.storeurl(comment_url, params.originalurl, ChangyanComments.STEP_2, {'client_id': client_id,
	                                                                         'liteloadApi':liteloadApi, 
	                                                                         'topic_url':topic_url, 
	                                                                         'commentsApi':commentsApi})	
 def process(self, params):
     # 1. Get the subdomain from the input original url
     field = self.r.parse('^http[s]{0,1}://(\w+)\.baidu\.com.*',
                          params.originalurl)[0]
     if field != 'tieba':
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
         return
     if params.step == BaiduTiebaComments.BAIDU_STEP1:
         self.getcomments_step1(params)
     elif params.step == BaiduTiebaComments.BAIDU_TIEBA_EACH_PAGE:
         self.getpagecomments_step2(params)
     elif params.step == BaiduTiebaComments.BAIDU_TIEBA_HUIFU_PAGE:
         self.get_comment_reply_step3(params)
    def process(self, params):
        if params.step == IqiyiS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned content, use xpath //*[@data-search-page="item"] to get the maximum page number (the second-to-last element of the returned array)
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@data-search-page="item"]/text()')
            # Get the last page number (the last array item is "next page"; the second-to-last is the last page number)
            if len(nodes) != 0:
                page_count = int(nodes[-2])
            else:
                page_count = 1
        
            # Based on page_count above, build all the search result urls (most recent week)
            # http://so.iqiyi.com/so/q_Key_ctg__t_0_page_Page数_p_1_qc_0_rd_2_site_iqiyi_m_4_bitrate_?af=true
            querylist = []
            if page_count >= self.maxpages:
                page_count = self.maxpages
            for page in range(1, page_count + 1, 1):
                url = IqiyiS2Query.IQIYI_QUERY_TEMPLATE.format(key = keyvalue, pageno = page)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist, IqiyiS2Query.S2QUERY_EACH_PAGE, {'query':info})

        elif params.step == IqiyiS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the Step2 url, use xpath //*[@class="result_title"]/a/@href to get the search result urls and write them out
            query = params.customized['query']
            soup = BeautifulSoup(params.content, 'html5lib')
            divs = soup.select('.mod_result_list > .list_item')
            urllist = []
            for div in divs:
                try:
                    url = div.select_one('.result_title > a').get('href')
                    curtime = div.select_one('.result_info_desc')
                    # curtime = div.select_one('.result_info_desc').get_text()
                    if not curtime:
                        continue
                    else:
                        curtime = curtime.get_text()
                    title = div.select_one('.result_title > a').get_text().strip()
                    if self.compareNow(curtime):
                        if self.checktitle(query, title):
                            #Logger.getlogging().info(title)
                            urllist.append(url)
                        else:
                            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    else:
                        Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)   
                except:
                    Logger.printexception()
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def getcomments_step1(self, params):
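        # Step1: extract tid and fid, save the post body, compute the page count, then queue reply and comment-page urls.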
        try:
            tid = re.findall('/p/(\d+)', params.originalurl)
            if tid:
                tid = tid[0]
            else:
                return
            fid = self.r.getid('forum_id', params.content)

            soup = BeautifulSoup(params.content, "html5lib")
            body = soup.find(attrs={'id': re.compile('post_content')})

            if body:
                NewsStorage.setbody(params.originalurl, body.get_text())
            else:
                Logger.log(params.originalurl,
                           constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
            count = soup.select('.l_posts_num > .l_reply_num > span')
            if count:
                comment_count = count[0].get_text()
                page_num = count[1].get_text()
            else:
                comment_count = 0
                page_num = 1

            if int(page_num) > self.maxpages:
                page_num = self.maxpages
            # Build the urls for fetching uniqid
            for page in range(1, int(page_num) + 1):
                flag = True
                if page == 1:
                    params.customized['page'] = 1
                    flag = self.getpagecomments_step2(params)
                if fid:
                    if not flag:
                        break
                    reply_url = BaiduTiebaComments.REPLY_URL.format(tid=tid,
                                                                    fid=fid,
                                                                    pn=page)
                    self.storeurl(reply_url, params.originalurl,
                                  BaiduTiebaComments.BAIDU_TIEBA_HUIFU_PAGE)
                if page == 1:
                    continue
                comment_url = BaiduTiebaComments.COMMENT_URL.format(tid=tid,
                                                                    page=page)
                self.storeurl(comment_url, params.originalurl,
                              BaiduTiebaComments.BAIDU_TIEBA_EACH_PAGE,
                              {'page': page})
        except:
            Logger.printexception()
Example #28
 def getnum(xparser, xpath, params, name):
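     # Extract an integer value (comment count, click count or a generic number) via the given xpath; returns -1 when nothing usable is found.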
     intvalue = -1
     if xpath.strip():
         strvalue = SiteBasicInfo.xpath(xparser, xpath)
         if strvalue:
             if name == TemplateManager.XPATH_KEY_COMMENTS_NUM or name == TemplateManager.XPATH_KEY_CLICK_NUM:
                 intvalue, count = SiteBasicInfo.str2cmtnum(strvalue, name)
             else:
                 intvalue, count = SiteBasicInfo.str2num(strvalue)
         SiteBasicInfo.printxpathinfo(params, name, xpath, strvalue,
                                      intvalue)
         if intvalue == -1:
             Logger.log(params.url,
                        constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
     return intvalue
Example #29
 def upload(self, upfiles):
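     # Upload each file in turn; empty files are removed and skipped, and a single failed upload aborts with False.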
     Logger.getlogging().debug('uploading ......')
     for file in upfiles:
         if self.emptyfile(file):
             Logger.getlogging().info('remove empty file: ' + file)
             FileUtility.remove(file)
             continue
         if not self.__upload__(file):
             Logger.log(FileUtility.getfilename(file),
                        constant.ERRORCODE_FAIL_LOAD_UP)
             return False
          Logger.getlogging().info('remove uploaded file: ' + file)
         FileUtility.remove(file)
     time.sleep(1)
     return True
    def baidutiebasearch_step2(self, params):
        # Step2: from the returned content, use xpath //*[@class="nums"] to get the maximum total count
        #info = params.customized['query']
        #keyvalue = Common.urlenc(info)
        content = ''
        p = '<!--[\s\S]{0,}(<ul id="thread_list".*[\s\S]{0,})--></code><script>'
        if re.search(p, params.content):
            content = re.findall(p, params.content)[0]
        if not content:
            Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
            return
        # Fetch the search results of the first page
        self.baidutiebasearch_step3(params)

        soup = BeautifulSoup(content, 'html5lib')
        queryurl = ''
        if soup.select('#thread_list'):
            try:
                if soup.select('#frs_list_pager'):
                    last = soup.select_one('#frs_list_pager > .last').get(
                        'href')
                    lists = last.split('pn=')
                    num = lists[1]
                    queryurl = 'https:' + lists[0]
                    tailpage = int(
                        num) / BaiduTiebaS2Query.DEFAULT_MAX_PAGESIZE + 1
                else:
                    tailpage = 1
            except:
                tailpage = 1
        else:
            # No search results; return directly
            Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
            return
        if tailpage > BaiduTiebaS2Query.DEFAULT_MAX_PAGE:
            tailpage = BaiduTiebaS2Query.DEFAULT_MAX_PAGE
        if tailpage >= self.maxpages:
            tailpage = self.maxpages
        # Based on tailpage above, build the search result urls for every page except the first
        querylist = []
        if not queryurl:
            return
        for page in range(2, tailpage + 1, 1):
            url = queryurl + 'pn={page}'.format(
                page=(page - 1) * BaiduTiebaS2Query.DEFAULT_MAX_PAGESIZE)
            querylist.append(url)
        self.__storeqeuryurllist__(
            querylist, BaiduTiebaS2Query.BAIDU_TIEBA_SEARCH_EACH_PAGE)