예제 #1
0
    def getpagecomments(self, params):
        """Pick the search-result links worth crawling and hand them to storage.

        A result is kept when ``Common.checktitle`` accepts its title for the
        query keyword and its publish date is at most ``self.querylastdays``
        days before today.  Each kept link goes through ``self.preprocess``;
        links for which that returns ``None`` are dropped.

        Args:
            params: object exposing ``customized['query']`` (the keyword) and
                ``content`` (the HTML of the result page).
        """
        keyword = params.customized['query']

        page = XPathUtility(html=params.content)
        links = page.xpath('//*[@class="sosResult"]/strong/a/@href')
        headings = page.getlist('//*[@class="sosResult"]/strong/a')
        timenodes = page.xpath('//*[@class="sosResult"]/span/cite[3]')

        current_day = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        accepted = []
        for pos, heading in enumerate(headings):
            # Skip results whose title does not match the queried keyword.
            if not Common.checktitle(keyword, heading):
                continue

            # Keep only the date part (text before the first space) of the
            # normalised publish time.
            uniform = TimeUtility.getuniformtime(timenodes[pos].text)
            day_text = uniform.split(' ')[0]
            published = datetime.datetime.strptime(
                day_text, TimeUtility.DATE_FORMAT_DEFAULT).date()

            # Skip results published outside the configured period.
            age_days = (current_day - published).days
            if age_days > int(self.querylastdays):
                continue

            candidate = self.preprocess(links[pos])
            if candidate is not None:
                accepted.append(candidate)

        if accepted:
            self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_TIEBA)
    def step3tt(self, params):
        """Extract comments from a JSON response and store the new ones.

        ``params.content`` is decoded as JSON and the comments are read from
        ``jsondata['resultDO']['entityList']``.  Every entry supplies ``body``
        (passed through ``self.strfilter``), ``replyTime`` and ``userName``.
        A comment is stored only when ``CMTStorage.exist`` reports that it is
        not stored yet.

        Args:
            params: object exposing ``content`` (JSON text), ``url`` (used in
                log messages) and ``originalurl`` (key the comments are
                stored under).

        Returns:
            ``False`` when ``self.isnewesttime`` rejects the smallest
            collected timestamp, ``True`` otherwise.  ``None`` when the
            payload is empty, equals ``"ERROR_PARAMETER"``, or an error was
            logged by the outer handler.
        """
        try:
            jsondata = json.loads(params.content)
            if not jsondata:
                return
            # Seeded with the current time so min() below always has a value.
            publishlist = [
                TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
            ]
            try:
                if jsondata == "ERROR_PARAMETER":
                    return
                entitylist = jsondata['resultDO'].get('entityList', [])
                for comment in entitylist:
                    content = self.strfilter(comment['body'])
                    # Sample value: "Jul 3, 2017 4:46:30 PM".
                    rawtime = comment['replyTime']
                    # %p is only honoured by strptime when the hour is parsed
                    # with %I; with %H the AM/PM marker is ignored and every
                    # PM time ends up 12 hours early.
                    try:
                        parsed = time.strptime(rawtime,
                                               '%b %d, %Y %I:%M:%S %p')
                    except ValueError:
                        # Hour outside 1-12: fall back to the 24-hour layout
                        # that was accepted before.
                        parsed = time.strptime(rawtime,
                                               '%b %d, %Y %H:%M:%S %p')
                    curtime = time.strftime('%Y-%m-%d %H:%M:%S', parsed)
                    nick = comment['userName']
                    publishlist.append(curtime)
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
            except Exception:
                # Best effort: comments stored before the failure are kept
                # and the freshness check below still runs.
                Logger.printexception()
                Logger.getlogging().error(
                    'extract no comment  from {site}'.format(
                        site=params.url))
            if not self.isnewesttime(params.originalurl, min(publishlist)):
                return False
            return True

        except Exception:
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
예제 #3
0
    def process(self, proparam):
        """Run one step of the comment crawl for a bbs.rayli.com.cn gallery.

        The action is selected by ``proparam.step``:

        * ``STEP_1``: read the article id from ``proparam.url`` and queue the
          first comment page for ``STEP_2``.
        * ``STEP_2``: read the reply count from ``proparam.content``; when it
          exceeds the number of comments already stored, record it and queue
          the comment pages (at most ``self.maxpages``) for ``STEP_3``.
        * ``STEP_3``: read content, time and nick of each comment on the
          page, store the ones not stored yet, and record the smallest
          collected time as the publish date.

        Any other step value is logged as an error.  Exceptions raised while
        handling a step are printed with ``traceback`` and not re-raised.

        Args:
            proparam: object exposing ``step``, ``url``, ``originalurl``,
                ``content`` and ``customized``.
        """
        Logger.getlogging().info(proparam.url)
        try:
            # Compare by value like the other branches: "is" only matches
            # when both sides happen to be the very same object.
            if proparam.step == rayliComments.STEP_1:
                # Article id taken from the gallery URL.
                articleId = re.findall(
                    r'^http://bbs\.rayli\.com\.cn/gallery-(\d+)-\d+.html',
                    proparam.url)[0]

                # URL of the first comment page.
                comments_url = rayliComments.COMMENTS_URL % (articleId, 1)
                self.storeurl(comments_url, proparam.originalurl,
                              rayliComments.STEP_2, {
                                  'articleId': articleId,
                              })

            elif proparam.step == rayliComments.STEP_2:
                articleId = proparam.customized['articleId']
                # Number of replies shown on the page.
                comments_count = float(
                    re.findall(u'回复:</span> (\\d+)</div>',
                               proparam.content)[0])
                if int(comments_count) == 0:
                    return

                # Incremental crawl: nothing to do unless the page reports
                # more comments than are already stored.
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
                page_num = int(
                    math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages

                # Queue every comment page that has to be fetched.
                for page in range(1, page_num + 1):
                    url = rayliComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl,
                                  rayliComments.STEP_3)

            elif proparam.step == rayliComments.STEP_3:
                soup = BeautifulSoup(proparam.content, 'html.parser')
                # Comment bodies.
                comments = soup.select('.t_f')
                # Comment times.
                commentTime = self.r.parse(
                    u'<em id="authorposton\\d+">发表于 (.+?)</em>',
                    proparam.content)
                # Nicknames.
                nicks = soup.select('.xw1')

                # On the first page the scan starts at the second entry.
                # NOTE(review): presumably entry 0 is the opening post rather
                # than a reply -- confirm against the page layout.
                page = int(
                    self.r.parse(u'page=1-page-(\\d+)', proparam.url)[0])
                start = 1 if page == 1 else 0

                # Seeded with the current time so min() below always has a
                # value.
                publishlist = [
                    TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
                ]
                for index in range(start, len(comments)):
                    content = comments[index].text.strip()
                    curtime = commentTime[index]
                    nick = nicks[index].text
                    publishlist.append(curtime)
                    if not CMTStorage.exist(proparam.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content,
                                            curtime, nick)

                NewsStorage.setpublishdate(proparam.originalurl,
                                           min(publishlist))
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)

        except Exception:
            traceback.print_exc()