Example #1
    def step2(self, params):
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        text_divs = soup.select('.s_r_txt')
        urllist = []

        if text_divs:
            for item in text_divs:
                title = item.select_one('h3 > a').get_text()
                url = item.select_one('h3 > a').get('href')
                curtime = item.select('p')[-1].get_text().strip()
                try:
                    if TimeUtility.compareNow(
                            TimeUtility.getuniformtime(curtime),
                            self.querylastdays):
                        if Common.checktitle(info, title):
                            urllist.append(url)
                        else:
                            Logger.log(
                                url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    else:
                        Logger.log(url,
                                   constant.ERRORCODE_WARNNING_NOMATCHTIME)
                except:
                    # if the publish time can't be parsed, keep the url anyway
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def getclick(self, params):
        pattern = r'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
        if self.r.search(pattern, params.originalurl):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl,
                          LeComments.STEP_PALY)

        if NewsStorage.getpublishdate(
                params.originalurl) == TimeUtility.getintformtime(0):
            if self.r.search(r'https?://sports\.le\.com/video/\d+\.html',
                             params.originalurl):
                # get the publish time only for the sports channel
                pubTime = XPathUtility(
                    params.content).getstring('//*[@class="live-vedio-infor"]')
                publishdate = TimeUtility.getuniformtime(pubTime)
                NewsStorage.setpublishdate(params.originalurl, publishdate)
            else:
                # get the publish time only for the variety-show channel
                title = XPathUtility(params.content).getstring(
                    '//h1[@class="j-video-name video-name"]')
                if title:
                    if re.search(r'\d{8}', title):
                        publishdate = re.findall(r'\d{8}', title)[0]
                        NewsStorage.setpublishdate(params.originalurl,
                                                   publishdate)
 def step3(self, params):
     """通过评论的url获取评论"""
     #相对之前的版本,本次更新变动:
     #comments存储的接口为CMTStorage.storecmt(),参数为originalurl, 评论内容, 评论发布时间, 用户
     #存储的内容增加了 评论发布时间, 用户
     try:
         jsondata = json.loads(params.content)
         if jsondata['comments']:
             for comment in jsondata['comments']:
                 content = comment['content']
                 curtime = TimeUtility.getuniformtime(
                     comment['create_time'])
                 nick = comment['passport'].get('nickname', 'anonymous')
                 if not CMTStorage.exist(params.originalurl, content,
                                         curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content,
                                         curtime, nick)
                 # walk the nested replies; the original while/for rebinding
                 # only followed the last sibling's children, so use an
                 # explicit worklist instead
                 replies = list(comment.get('comments') or [])
                 while replies:
                     reply = replies.pop(0)
                     content = reply['content']
                     curtime = TimeUtility.getuniformtime(
                         reply['create_time'])
                     nick = reply['passport'].get('nickname', 'anonymous')
                     if not CMTStorage.exist(params.originalurl, content,
                                             curtime, nick):
                         CMTStorage.storecmt(params.originalurl, content,
                                             curtime, nick)
                     replies.extend(reply.get('comments') or [])
     except:
         Logger.printexception()
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
    def getsearchresult(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//li/h3/a/@href')
        titles = xpath.getlist('//li/h3/a')
        pubtimes = xpath.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # the title contains the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                interval = today - pubtime
                # the publish time falls inside the query window
                if interval.days <= self.querylastdays:
                    urllist.append(hrefs[index])
                else:
                    # results are sorted by time, so once one item falls
                    # outside the query window, all later ones do too
                    break

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
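
The break above is an early-exit filter over time-sorted results. A minimal standalone sketch of the same idea (names are illustrative; `results` is assumed to be dicts carrying a datetime.date under 'pubdate'):

from itertools import takewhile

def within_window(results, today, lastdays):
    # results arrive sorted newest-first, so stop at the first item
    # that falls outside the query window
    return list(takewhile(
        lambda r: (today - r['pubdate']).days <= lastdays, results))
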
Example #5
 def analysis(self, line, post=False):
     param = ProcessParam()
     js = json.loads(line)
     param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
     param.url = Common.urldec(js['foundin'])
     param.content = js['html']
     if post:
         param.data = js['data']
     if js['html'][:3] == constant.GZIP_CODE:
         param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
     # decode
     content = Common.urldec(param.content)
     charset = RegexUtility.getid('charset', content)
     content = Common.trydecode(content, charset)
     param.content = content
     if 'property' in js:
         for prop in js['property']:
             if 'result' not in prop:
                 continue
             if prop['property_name'] == u'page_body':
                 param.page_body = Common.trydecode(
                     Common.urldec(prop['result'][0]['text']),
                     constant.CHARSET_GBK)
             elif prop['property_name'] == u'page_title':
                 param.page_title = Common.trydecode(
                     Common.urldec(prop['result'][0]['text']),
                     constant.CHARSET_GBK)
             elif prop['property_name'] == u'html_time':
                 param.html_time = TimeUtility.getuniformtime2(
                     prop['result'][0]['text'])
     return param
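
The gzip branch above decompresses pages whose payload starts with constant.GZIP_CODE. A minimal standalone sketch of the same zlib trick, assuming the raw gzip magic bytes instead of the project constant:

import zlib

def gunzip_if_needed(raw):
    # gzip streams start with the magic bytes 0x1f 0x8b
    if raw[:2] == '\x1f\x8b':
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer
        return zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    return raw
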
Example #6
    def waibuetl(self):
        waibubackup = SpiderConfigure.getwaibubaup()
        if not FileUtility.exists(waibubackup):
            FileUtility.mkdirs(waibubackup)

        waibufile = self.etl.getqueryfromdb()
        if not FileUtility.exists(waibufile):
            Logger.getlogging().warning(
                '{waibufile} not generate!'.format(waibufile=waibufile))
            return

        outtime = 0
        self.wdownloader.upload(waibufile)
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                Logger.getlogging().info(
                    'sleeping {sec}s......'.format(sec=self.waitingperiod))
                time.sleep(self.waitingperiod)
                outtime += self.waitingperiod
                if self.wdownloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.wdownloader.download()
                    if downloadfiles:
                        break
                except:
                    Logger.printexception()
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.wb_analysis(dfile)
                #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
                #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
                FileUtility.move(dfile, waibubackup)
                logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
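
The inner loop above is a sleep/poll pattern with an overall timeout budget. A generic sketch of the same control flow (poll_until and its parameters are illustrative names, not part of the spider framework):

import time

def poll_until(check, interval, timeout):
    # sleep, then re-check, until check() succeeds or the budget runs out
    waited = 0
    while waited < timeout:
        time.sleep(interval)
        waited += interval
        if check():
            return True
    return False
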
Example #7
 def __init__(self):
     self.database = SpiderDao()
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     self.difffile = '{path}/{dt}/{file}'.format(
         path=SpiderConfigure.getinstance().getconfig(
             const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
         dt=TimeUtility.getcurrentdate(),
         file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix,
                                                          ts=ts))
    def seturlinfos(params):
        id = NewsStorage.getid(params.url)
        if NewsStorage.exist(params.url):
            doc = NewsStorage.getdoc(params.url)
            data = {}
            #data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
            data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)
            if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                       TimeUtility.getintformtime(0)) == \
                    TimeUtility.getintformtime(0):
                data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                    TimeUtility.getuniformtime(params.pubtime)
            data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                        data)
        else:
            data = {}
            data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
            data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                TimeUtility.getuniformtime(params.pubtime)
            data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()

            data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
            data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
            data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
            data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
            data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
            data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = \
                NewsStorage.LOCALMACHINEFLAG
            SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                        SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                        SQLDAO.getvaluesfromkeys(data))
Example #9
    def step3(self, params):
        jsondata = json.loads(params.content)
        comments = []
        for comment in jsondata:
            cmti = CommentInfo()
            curcomtime = int(comment['created'])

            # check whether the latest crawled-comment time needs updating;
            # the first comment's time is the latest one
            if URLStorage.storeupdatetime(
                    params.originalurl,
                    TimeUtility.getuniformdate2(curcomtime)):
                cmti.content = comment['contents']
                comments.append(cmti)

                # check whether the comment has replies
                if int(comment['comment_reply_total']) > 0:
                    reply = comment['reply']
                    # collect every reply to the comment
                    for num in range(0, int(comment['comment_reply_total']),
                                     1):
                        recmti = CommentInfo()
                        recmti.content = reply[num]['contents']
                        comments.append(recmti)
        if len(comments) > 0:
            # store the collected comments
            self.commentstorage.store(params.originalurl, comments)
 def getid(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     return Common.md5(
         Common.urlenc(url) + Common.urlenc(content) + pubdate +
         Common.urlenc(user))
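
getid above derives a stable comment id by hashing the url-encoded fields plus the normalized publish time. A rough standard-library equivalent, where hashlib.md5 and urllib.quote stand in for the project helpers Common.md5 and Common.urlenc:

import hashlib
import urllib

def comment_id(url, content, pubdate, user):
    # md5 over the url-encoded fields plus the normalized publish time
    key = urllib.quote(url) + urllib.quote(content) + pubdate + urllib.quote(user)
    return hashlib.md5(key).hexdigest()
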
Example #11
    def step3news(self, params):
        Logger.getlogging().info("ZolbbsComments.STEP_3")
        # Step 3: fetch the url set in Step 2 and extract all comments
        xparser = XPathUtility(params.content)
        commentsinfo = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="commli"]/p')
        commentstime = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="published-time"]')
        commentsnick = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="user-name"]')
        # extract the comments and record the actual comment count
        for index in range(0, len(commentstime), 1):
            # extract the publish time
            tm = commentstime[index].strip()
            try:
                curtime = TimeUtility.getuniformtime(tm, u'%Y-%m-%d %H:%M')
            except Exception, e:
                curtime = TimeUtility.getuniformtime(tm)

            # extract the comment content
            content = commentsinfo[index]
            nick = commentsnick[index]
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
Example #12
 def __init__(self, taskinfo=None, download_path=None):
     self.taskinfo = taskinfo
     self.maxfilenum = 100
     self.cache_path = Storage.getstoragelocation(
         const.SPIDER_DONE_TEMP_PATH)
     path = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
     if download_path:
         self.download_path = download_path
     else:
         self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
             path=path, taskid=self.taskinfo.taskid)
     self.parse_tool = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
     #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
     self.pucbackpath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
     self.pucbacktoday = os.path.join(self.pucbackpath,
                                      TimeUtility.getcurrentdate())
     if not FileUtility.exists(self.pucbackpath):
         FileUtility.mkdirs(self.pucbackpath)
     if not FileUtility.exists(self.pucbacktoday):
         FileUtility.mkdirs(self.pucbacktoday)
     self.done_file = self.pucbacktoday + '/done/'
     self.json_path = self.pucbacktoday + '/json/'
     if not FileUtility.exists(self.done_file):
         FileUtility.mkdirs(self.done_file)
     if not FileUtility.exists(self.json_path):
         FileUtility.mkdirs(self.json_path)
     self.pucsavedays = 0
     self.clear()
    def process(self, proparam):
        Logger.getlogging().info(proparam.url)
        try:
            if proparam.step == jiemianComments.STEP_1:
                # extract the article id from the url
                articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url)[0]
                # set clicknum
                self.setclick(proparam)
                # get the comment count
                comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content)[0])
                if comments_count:
                    NewsStorage.setcmtnum(proparam.originalurl, comments_count)
                # nothing more to do without comments
                if int(comments_count) == 0:
                    return

                # incremental check: skip if we already have at least this many
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                # queue the comment page urls
                for page in range(1, page_num + 1, 1):
                    url = jiemianComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
            elif proparam.step == jiemianComments.STEP_3:
                # proparam.content = proparam.content.replace('\\','')
                # soup = BeautifulSoup(proparam.content, 'html5lib')
                # items = soup.select('.comment-post')
                # for item in items:
                #     content = item.select_one('.comment-main > p').get_text().encode('utf-8')
                #     curtime = TimeUtility.getuniformtime(item.select_one('.date').get_text())
                #     nick = item.select_one('.author-name').get_text().decode('utf-8').encode('utf-8')
                # get the vote (like) count
                votenum = self.r.getid('ding', proparam.content)
                if votenum == '':
                    Logger.getlogging().debug("Unable to get votenum")
                else:
                    NewsStorage.setvotenum(proparam.originalurl, votenum)
                # extract the comments with regexes
                comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
                ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',proparam.content)
                nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)

                # walk the comments
                for index in range(0, len(comments)):
                    tm = ctime[index].replace('\\', '')
                    curtime = TimeUtility.getuniformtime(tm)
                    # eval of a u"..." literal decodes the \uXXXX escapes in the payload
                    content = eval('u"' + comments[index] + '"').encode('utf-8')
                    nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)


        except Exception, e:
            traceback.print_exc()
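
The eval('u"..."') trick above turns \uXXXX escape sequences in the response into text. A safer sketch of the same decoding that avoids eval'ing page-controlled data, assuming Python 2 byte strings as in the rest of the codebase:

def decode_escapes(raw):
    # decode embedded \uXXXX escapes without eval
    return raw.decode('unicode-escape').encode('utf-8')
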
Example #14
 def step3bbs(self, params):
     Logger.getlogging().info("JoyComments.STEP_3")
      # Step 3: fetch the url set in Step 2 and extract all comments
     try:
         commentsinfo = json.loads(params.content)
          # probe the structure; raises KeyError when no comments exist
          commentsinfo['result']['mainreplys']['rows']
     except:
         Logger.getlogging().warning(
             '{url} Errorcode:40000'.format(url=params.originalurl))
         Logger.printexception()
         return
      # extract the comments
      for row in commentsinfo['result']['mainreplys']['rows']:
          # extract the time, content, and nick of each reply
          content = row['reply']['reply']['body']['text']
          curtime = TimeUtility.getuniformtime(
              str(row['reply']['reply']['post_time']))
          nick = row['reply']['user']['name']
          if not CMTStorage.exist(params.originalurl, content, curtime,
                                  nick):
              CMTStorage.storecmt(params.originalurl, content, curtime, nick)
Example #15
 def __init__(self):
     self.factory = SiteFactory()
     self.conf = SpiderConfigure.getinstance()
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     self.period = int(SpiderConfigure.getinstance().getlastdays())
 def __init__(self):
     self.id = ''
     # query
     self.query = SpiderConfigure.getinstance().getquery()
     # channel
     self.channel = SpiderConfigure.getinstance().getchannel()
     # type
     self.type = ''
     # URL
     self.url = ''
     # title
     self.title = ''
     # body / original post
     self.body = ''
     # comments / replies (their content)
     # comment count
     self.cmtnum = -1
     # read count / play count (incremental)
     self.clicknum = -1
     # vote (like) count
     self.votenum = -1
     # fan count / subscriber count
     self.fansnum = -1
     # publish time
     self.pubtime = TimeUtility.getintformtime(0)
     # createtime
     self.createtime = SpiderConfigure.getinstance().starttime()
 def storecmt(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     if not CMTStorage.exist(url, content, pubdate, user):
         Logger.getlogging().debug(
             'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
             format(url=url, content=content, pubdate=pubdate, user=user))
         id = CMTStorage.getid(url, content, pubdate, user)
          data = {
              SQLDAO.SPIDER_TABLE_COMMENTS_ID: id,
              SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
              SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
              SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
              SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
              SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
                  SpiderConfigure.getinstance().starttime()
          }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_COMMENTS,
             SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
    def flush():
        # dump s1 download failed url
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
        SpiderConfigure.getinstance().setquery('')
        for url in SpiderReport.getinstance().s1urls:
            Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
        # dump queries for which the website returned no urls
        querynositemap = {}
        for query in SpiderReport.getinstance().querysitesmap.keys():
            querynositemap[query] = 0
            for site in SpiderReport.getinstance().querysitesmap[query]:
                SpiderReport.s2queryurl(query, site, None, True)
                querynositemap[query] += 1
        for query in SpiderReport.getinstance().querysitesmap.keys():
            if query in querynositemap:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
            else:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum, True)
        # report
        filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                             const.SPIDER_INFO_REPORT_FILE).format(
            date=TimeUtility.getcurrentdate())
        FileUtility.remove(filename)
        FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
            ch='CHANNEL',
            query='QUERY',
            type='TYPE',
            v1='UPLOAD',
            v2='DOWNLOAD',
            v3='NO_TEMPLATE',
            v4='NO_SITE',
            v5='WITH_CMT',
            v6='FAILED'
        ))
        for key in SpiderReport.getinstance().reportlist.keys():
            for type in SpiderReport.getinstance().reportlist[key].keys():
                r = SpiderReport.getinstance().reportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        for key in SpiderReport.getinstance().s2sitereportlist.keys():
            for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
                r = SpiderReport.getinstance().s2sitereportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
        FileUtility.flush()
        threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_FAILED_THRESHOLD))
        rate = SpiderReport.getinstance().totalreport.getsuccess()
        if rate < threshold:
            Logger.getlogging().warning('success rate is lower than threshold')
            param = NotifyParam()
            param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
            param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                      th=Common.float2percent(
                                                                                          threshold))
            SpiderNotify.notify(param)
 def bbs_step3(self, params):
     try:
         xparser = XPathUtility(params.content)
         page = params.customized['page']
         pagecount = params.customized['pagecount']
         comments = []
         updatetimes = []
         nicks = []
         contents = xparser.getcomments('//*[@class="read"]')
         mid_times = xparser.getlist('//td[@class="authorname"]')
         for times in mid_times:
             updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
             nicks.append(self.r.parse(ur'(.*)于', times)[0])
         if page == 0:
             mid_index = 1
         else:
             mid_index = 0
         comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
         if comments_number != 0:
             for index in range(mid_index, len(contents), 1):
                 curtime = TimeUtility.getuniformtime(updatetimes[index])
                 content = contents[index]
                 nick = nicks[index].split('于')[0].split('☆')[-1]
                 if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content, curtime, nick)
     except Exception, e:
         traceback.print_exc()
Example #20
    def step2(self, params):
        keyword = params.customized['keyword']
        query = Common.urldec(keyword)
        jsondata = json.loads(params.content)
        # parse the embedded html from the json payload
        html = jsondata['html']
        soup = bs(html, 'html5lib')
        videoUrlList = []

        videoList = soup.select('li.video')
        for video in videoList:
            try:
                videoUrl = 'https:' + video.select_one('a').get('href')
                videoUrl = videoUrl.split('?')[0] + '/'
                title = video.select_one('a').get('title')
                pubtime = video.find(attrs={
                    'class': 'so-icon time'
                }).get_text().strip()
                if self.compareNow(TimeUtility.getuniformtime(pubtime)):
                    if self.checktitle(query, title):
                        videoUrlList.append(videoUrl)
                        self.__storeurl__(videoUrl, pubtime,
                                          SPIDER_S2_WEBSITE_VIDEO)
                    else:
                        Logger.log(videoUrl,
                                   constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                Logger.printexception()
    def geturlcomments(self, params):
        xparser = XPathUtility(params.content)
        # fetch all comments
        page = params.customized['page']
        if page == 1:
            commentstimes = xparser.getcomments(
                '//table[position()>1]/tbody/tr/td/span[1]')
            commentscontents = xparser.getcomments(
                '//table[position()>1]/tbody/tr[2]/td[@class="post-main"]')
            commentsnicks = xparser.getcomments('//*[@class="name"]/a')
        else:
            commentstimes = xparser.getcomments('//table/tbody/tr/td/span[1]')
            commentscontents = xparser.getcomments(
                '//table/tbody/tr[2]/td[@class="post-main"]')
            commentsnicks = xparser.getcomments('//*[@class="name"]/a')

        # record the actual comment count
        for index in range(0, len(commentscontents), 1):
            curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
            # extract the comment content
            content = commentscontents[index].strip()
            nick = commentsnicks[index].strip()
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
 def geturlcomments(self, proparam):
     # soup = BeautifulSoup(proparam.content, 'html5lib')
     # lis = soup.select('.comment-say')
     # for li in lis:
     #     content = li.select_one('.des').get_text()
     #     curtime = li.select_one('.time').get_text()
     #     nick = li.select_one('.name replyName').get_text()
     #     if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
     #         CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
     # extract the comments with regexes
     comments = re.findall(r'content":"(.+?)","paragraph_id"',
                           proparam.content)
     commentsTime = self.r.parse(
         r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
     nicks = self.r.parse(r'"nickname":"(.*?)","is_hot"', proparam.content)
     # walk the comments
     for index, comment in enumerate(comments):
         # eval of a u"..." literal decodes the \uXXXX escapes in the payload
         comment = eval('u"' + comment + '"')
         content = comment.encode('utf-8')
         curtime = TimeUtility.getuniformtime(commentsTime[index])
         nick = eval('u"' + nicks[index] + '"').encode('utf-8')
         if not CMTStorage.exist(proparam.originalurl, content, curtime,
                                 nick):
             CMTStorage.storecmt(proparam.originalurl, content, curtime,
                                 nick)
Example #23
 def analysis(self, line, method):
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         content = Common.trydecode(content, charset)
         param.content = content
         return param
     except:
         # not a JSON record: treat the line as a plain url
         line = line.replace('\n', '').strip()
         if not line or line[0] == '#':
             return
         Logger.getlogging().debug(line)
         param = ProcessParam()
         param.url = line
         if method == constant.REQUEST_TYPE_POST:
             js = json.loads(line)
             param.url = js['url']
             param.data = js['data']
         param.content = HttpCacher.getcontent(line, method)
         if param.content is None:
             return
         return param
Example #24
    def get_url_id(self, params):
        """只适用在腾讯视频的部分"""
        "cid是电视剧\合集\电影,vid单集"
        CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+).html'
        CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
        VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+).html'
        VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html'
        VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'

        if self.r.search(CID_PATTERN, params.originalurl):
            cid = self.r.parse(CID_PATTERN, params.originalurl)[0]
            url = CID_URL.format(cid=cid)
            self.storeurl(url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
        elif self.r.search(VID_PATTERN1, params.originalurl):
            vid = self.r.parse(VID_PATTERN1, params.originalurl)[0]
            url = VID_URL.format(vid=vid)
            self.storeurl(url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
        elif self.r.search(VID_PATTERN2, params.originalurl):
            vid = self.r.parse(VID_PATTERN2, params.originalurl)[0]
            url = VID_URL.format(vid=vid)
            self.storeurl(url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
        # publish_date
        publish_date = self.r.getid('publish_date', params.content, split=':')
        if not publish_date:
            publish_date = XPathUtility(params.content).getstring(
                '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]'
            )
            publish_date = TimeUtility.getuniformtime(publish_date)
        if publish_date:
            NewsStorage.setpublishdate(params.originalurl, publish_date)
        self.setclick(params)
Example #25
    def step3_ebook(self, params):
        try:
            jsoncontent = json.loads(params.content)
            if 'data' not in jsoncontent:
                return
            html = jsoncontent['data']['listHtml']
            if not html:
                return
            soup = BeautifulSoup(html, 'lxml')
            divs = soup.select('div.cf')
            if not divs:
                return
            for div in divs:
                # commentList > dl:nth-child(1) > div.cf > dd > p:nth-child(2)
                content = div.select('dd > p')[1].get_text()

                curtime = TimeUtility.getuniformtime(
                    div.select('dd > p')[0].get_text().split('|')[-1])
                nick = div.select('dd > p')[0].get_text().split('|')[0]

                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)

        except Exception, e:
            Logger.printexception()
Example #26
    def step2(self, params):
        """"""
        print params.content
        try:
            jsondata = json.loads(params.content)
            comments_total = int(jsondata['comments_total'])
            comments_data = jsondata['comments']
        except:
            Logger.getlogging().warning(
                '{url}:30000 No comments'.format(url=params.originalurl))
            return
        #cmtnum = URLStorage.getcmtnum(params.originalurl)
        #if cmtnum >= comments_total:
        #return
        #URLStorage.setcmtnum(params.originalurl, comments_total)

        comments = []
        for comment in comments_data:
            cmti = CommentInfo()
            cmti.content = comment['txtcontent']
            tm = comment['addtime']
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)
        if len(comments) > 0:
            # store the collected comments
            self.commentstorage.store(params.originalurl, comments)

        self.post_data['p'] = str(int(self.post_data['p']) + self.page_size)
        self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S')
        self.storeposturl(self.post_url, params.originalurl, self.STEP_2,
                          self.post_data)
Example #27
    def loop(self):
        # loop over the urls, covering both S1 and S2
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                # check time out
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                if self.downloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.downloader.download()
                    self.upload()
                    if len(downloadfiles) > 0:
                        break
                    else:
                        Logger.getlogging().info('sleeping {0}s......'.format(
                            self.waitingperiod))
                        time.sleep(self.waitingperiod)
                except:
                    Logger.printexception()

            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.processfile(dfile)
                logstring = 'PROCESSFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                self.upload()
Example #28
 def dmzjnews_step3(self, params):
     params.content = params.content[params.content.index('['):params.content.rindex(']') + 1]
     commentsinfo = json.loads(params.content)
     for index in range(0, len(commentsinfo), 1):
         # extract the content and time
         content = commentsinfo[index]['content']
         curtime = TimeUtility.getuniformtime(commentsinfo[index]['create_time'])
         CMTStorage.storecmt(params.originalurl, content, curtime, '')
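
The slice in dmzjnews_step3 keeps everything between the first '[' and the last ']', stripping a JSONP-style callback wrapper before parsing. A standalone sketch of the same technique:

import json

def strip_jsonp(payload):
    # keep only the bracketed JSON array, dropping any callback wrapper
    start = payload.index('[')
    end = payload.rindex(']') + 1
    return json.loads(payload[start:end])
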
Example #29
    def process_book(self, params):
        try:
            if params.step == Comments.STEP_1:
                # extract from the url the id used to build the comment url
                bookId = self.r.parse(r'^http://www\.17k\.com/book/(\w+)\.html$',
                                      params.originalurl)[0]
                # build the first comment page url
                comments_url = Comments.COMMENTS_URL % (bookId, 1,
                                                        Comments.PAGE_SIZE)
                # ask the download platform to fetch the first page of comments
                self.storeurl(comments_url, params.originalurl,
                              Comments.STEP_2, {'bookId': bookId})

            # parse the first page and queue the remaining comment page urls
            elif params.step == Comments.STEP_2:
                bookId = params.customized['bookId']
                # parse the comment JSON response
                comments = json.loads(params.content)

                comments_count = int(comments['page']['count'])
                # incremental check
                cmtnum = CMTStorage.getcount(params.originalurl)
                if cmtnum >= comments_count:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)
                # get the latest comment publish time
                lasttime = CMTStorage.getlastpublish(params.originalurl, True)
                # get the comment page count
                page_count = int(comments['page']['pagecount'])
                if page_count == 0:
                    return

                if page_count >= self.maxpages:
                    page_count = self.maxpages

                # build each comment page url and submit it to the download platform
                for page in range(1, page_count + 1, 1):
                    commentUrl = Comments.COMMENTS_URL % (bookId, page,
                                                          Comments.PAGE_SIZE)
                    self.storeurl(commentUrl, params.originalurl,
                                  Comments.STEP_3, {'bookId': bookId})

            # parse the comment data
            elif params.step == Comments.STEP_3:
                commentsinfo = json.loads(params.content)

                for comment in commentsinfo['page']['result']:
                    curtime = TimeUtility.getuniformtime(
                        comment['creationDate'])
                    content = comment['summary']
                    nick = comment['marks']['nikeName']
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

        except Exception, e:
            traceback.print_exc()
Example #30
    def process(self, params):
        try:
            if params.step == AllComments.STEP_1:
                aid = re.findall(r"\d+", params.url.split("/")[-1])[0]
                aid_url = AllComments.AID_URL % (aid)
                self.storeurl(aid_url, params.originalurl, AllComments.STEP_2,
                              {'aid': aid})
            elif params.step == AllComments.STEP_2:
                cms_id = re.findall(r'appidArr \= \[\"cms\|(.+?)",',
                                    str(params.content))[0]
                cms_url = AllComments.KEYID_URL % (
                    cms_id, params.customized['aid'], params.originalurl)
                self.storeurl(cms_url, params.originalurl, AllComments.STEP_3,
                              {
                                  'aid': params.customized['aid'],
                                  'cmsid': cms_id
                              })
            elif params.step == AllComments.STEP_3:
                comments = json.loads(params.content)
                sid = comments['data']['_id']
                comment_url = AllComments.COMMENTS_URL % (
                    sid, '1', params.customized['cmsid'])
                self.storeurl(comment_url, params.originalurl,
                              AllComments.STEP_4, {
                                  'sid': sid,
                                  'page': '1',
                                  'cmsid': params.customized['cmsid']
                              })
            elif params.step == AllComments.STEP_4:
                comments = json.loads(params.content)
                try:
                    comment = []
                    for index in range(0, len(comments['data'])):
                        ctime = TimeUtility.getuniformtime2(
                            comments['data'][index]['ctime'])
                        if URLStorage.storeupdatetime(params.originalurl,
                                                      str(ctime)):
                            cmti = CommentInfo()
                            cmti.content = comments['data'][index]['content']
                            comment.append(cmti)
                    self.commentstorage.store(params.originalurl, comment)
                    comment_url = AllComments.COMMENTS_URL % (
                        params.customized['sid'],
                        str(int(params.customized['page']) + 1),
                        params.customized['cmsid'])
                    self.storeurl(
                        comment_url, params.originalurl, AllComments.STEP_4, {
                            'sid': params.customized['sid'],
                            'page': str(int(params.customized['page']) + 1),
                            'cmsid': params.customized['cmsid']
                        })
                except:
                    # no more pages or malformed payload: stop paging
                    return

        except Exception, e:
            traceback.print_exc()
            Logger.getlogging().error(e.message)