Example #1
    def step2_ebook(self, params):
        try:
            #"""只适用在QQ阅读部分,获取评论的url列表"""
            bid = params.customized['bid']
            jsoncontent = json.loads(params.content)
            if 'data' not in jsoncontent:
                Logger.log(params.originalurl,
                           constant.ERRORCODE_SITE_NOGET_COMMNETS)
                return
            comments_count = jsoncontent['data']['total']
            page_count = jsoncontent['data']['pageCount']
            # Incremental check: skip if there are no new comments
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            if cmtnum >= comments_count:
                return

            # Cap at self.maxpages pages
            if int(page_count) >= self.maxpages:
                page_count = self.maxpages

            for page in range(1, page_count + 1, 1):
                commentinfo_url = self.EBOOK_COMMENTS_URL.format(site='intro',
                                                                 bid=bid,
                                                                 page=page)
                self.storeurl(commentinfo_url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except Exception:
            Logger.printexception()
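
The pattern above (compare the stored comment count against the site's current total, cap the page count, enqueue one url per page) recurs throughout these examples. A minimal self-contained sketch of it, with get_stored_count, set_total, and enqueue as hypothetical stand-ins for the CMTStorage/NewsStorage/storeurl calls, and an illustrative endpoint:

import json

MAX_PAGES = 10
COMMENTS_URL = 'https://example.com/comments?bid={bid}&page={page}'  # illustrative

def plan_comment_pages(raw_json, bid, get_stored_count, set_total, enqueue):
    """Enqueue comment-page urls only when new comments exist, capped at MAX_PAGES."""
    data = json.loads(raw_json).get('data')
    if not data:
        return
    total = data['total']
    set_total(total)
    if get_stored_count() >= total:
        return  # nothing new since the last crawl
    pages = min(int(data['pageCount']), MAX_PAGES)
    for page in range(1, pages + 1):
        enqueue(COMMENTS_URL.format(bid=bid, page=page))
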
Example #2
    def step3_ebook(self, params):
        try:
            jsoncontent = json.loads(params.content)
            if 'data' not in jsoncontent:
                return
            html = jsoncontent['data']['listHtml']
            if not html:
                return
            soup = BeautifulSoup(html, 'lxml')
            divs = soup.select('div.cf')
            if not divs:
                return
            for div in divs:
                # commentList > dl:nth-child(1) > div.cf > dd > p:nth-child(2)
                content = div.select('dd > p')[1].get_text()

                curtime = TimeUtility.getuniformtime(
                    div.select('dd > p')[0].get_text().split('|')[-1])
                nick = div.select('dd > p')[0].get_text().split('|')[0]

                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)

        except Exception:
            Logger.printexception()
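
The extraction in step3_ebook assumes each div.cf row carries two dd > p nodes: "nick|time" metadata first, then the comment body. A standalone sketch of that parsing against an inline fragment (the markup is illustrative, modeled on the DOM path noted in the code above):

from bs4 import BeautifulSoup

HTML = '''
<div class="cf"><dd>
  <p>reader42|2017-06-01 12:00</p>
  <p>Nice book.</p>
</dd></div>
'''

soup = BeautifulSoup(HTML, 'html.parser')  # the example itself uses 'lxml'
for div in soup.select('div.cf'):
    meta = div.select('dd > p')[0].get_text()
    nick = meta.split('|')[0].strip()
    pubtime = meta.split('|')[-1].strip()
    content = div.select('dd > p')[1].get_text()
    print('{0} @ {1}: {2}'.format(nick, pubtime, content))
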
Example #3
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step == KanKanComments.STEP_1:
                # Fetch the view count (not every video has one)
                self.setclicknum(params)
                if self.r.match(self.TYPE1, params.originalurl):
                    # Step 1: extract the movie id from the original url and build the first comment-page url
                    movieid = self.r.parse(self.TYPE1, params.url)[0]
                    Logger.getlogging().debug(movieid)
                    commentinfo_url = KanKanComments.COMMENTS_URL1.format(
                        movieid=movieid, page=1, perpage=self.PERPAGE)
                    self.storeurl(commentinfo_url, params.originalurl,
                                  KanKanComments.STEP_2, {'movieid': movieid})
                elif self.r.match(self.TYPE2, params.originalurl):
                    # Step 1: extract type and sid from the original url and build the first comment-page url
                    self.substep1(params, self.TYPE2)
                elif self.r.match(self.TYPE3, params.originalurl):
                    # Step 1: extract type, sid and vchannel from the original url and build the first comment-page url
                    self.substep1(params, self.TYPE3)

            elif params.step == KanKanComments.STEP_2:
                self.step2(params)
            elif params.step == KanKanComments.STEP_3:
                self.step3(params)
            elif params.step == KanKanComments.STEP_CLICK:
                self.step_click(params)
        except:
            Logger.printexception()
Example #4
 def ifengnews_step2(self, params):
     try:
         oriurl = params.customized['oriurl']
         jsoncontent = json.loads(params.content)
         clicknum = float(jsoncontent.get('join_count', '-1'))
         if clicknum > 0:
             NewsStorage.setclicknum(params.originalurl, clicknum)
         curcmtnum = float(jsoncontent['count'])
         NewsStorage.setcmtnum(params.originalurl, curcmtnum)
         dbcmtnum = CMTStorage.getcount(params.originalurl, True)
         if dbcmtnum >= curcmtnum:
             return
         # Loop over the comment-page urls
         pages = int(math.ceil(
             float(curcmtnum - dbcmtnum) / self.page_size))
         if pages >= self.maxpages:
             pages = self.maxpages
         # Build the urls for every comment page after the first
         for index in range(1, pages + 1, 1):
             if index == 1:
                 self.ifengnews_step3(params)
                 continue
             commentinfo_url = IfengNewsComments.COMMENTS_URL.format(
                 oriurl=oriurl, pg=index, ps=self.page_size)
             self.storeurl(commentinfo_url, params.originalurl,
                           IfengNewsComments.IFENG_NEWS_NEXT_PAGE)
     except:
         Logger.printexception()
Example #5
 def get(self, url):
     saveJson = {}
     try:
         Logger.getlogging().debug('Downloading: {url}'.format(url=url))
         request = urllib2.Request(url, headers=self.headers)
         response = urllib2.urlopen(request, timeout=self.timeout)
         code = response.getcode()
         info = response.info()
         # Check the response code; return None unless it is 200
         if code == 200:
             html = response.read()
             if (("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip")):
                 html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
             Logger.getlogging().debug('Request succeeded: {url}'.format(url=url))
         else:
             Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
             Logger.getlogging().error('Request Failed: {url}'.format(url=url))
             return None
     except:
         Logger.getlogging().error('Request failed: {url}'.format(url=url))
         Logger.printexception()
         return None
     charset = RegexUtility.getid('charset', html)
     html = Common.trydecode(html, charset)
     saveJson['foundin'] = Common.urlenc(url)
     saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
     saveJson['crawler_time'] = int(time.time())
     jsonStr = json.dumps(saveJson)
     return jsonStr     
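
The 16 + zlib.MAX_WBITS offset in get tells zlib to expect a gzip container on the stream. A tiny round-trip demonstrating just that decompression step:

import gzip
import io
import zlib

payload = b'<html>hello</html>'
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
    f.write(payload)

# 16 + MAX_WBITS switches zlib into gzip-container mode
assert zlib.decompress(buf.getvalue(), 16 + zlib.MAX_WBITS) == payload
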
Example #6
    def step2(self, params):
        try:
            Logger.getlogging().info("xinhuaComments.STEP_2")
            # newsId is passed down from STEP_1
            newsId = params.customized['newsId']
            comments_info = json.loads(params.content)
            comments_count = comments_info['totalRows']
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            page_count = comments_info['totalPage']

            # Incremental check: skip if there are no new comments
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return

            # Cap at self.maxpages pages
            if page_count >= self.maxpages:
                page_count = self.maxpages

            for index in range(0, int(page_count)):
                commentinfo_url = xinhuaNewsComments.COMMENTS_URL_NEWS.format(
                    newsId=newsId, pid=(index + 1))
                self.storeurl(commentinfo_url, params.originalurl,
                              xinhuaNewsComments.STEP_3)
        except:
            Logger.printexception()
Example #7
 def step3bbs(self, params):
     Logger.getlogging().info("JoyComments.STEP_3")
     # Step 3: fetch all comments from the url set in Step 2 and extract them
     try:
         commentsinfo = json.loads(params.content)
         # Probe the expected structure; a missing path raises into the except below
         rows = commentsinfo['result']['mainreplys']['rows']
     except:
         Logger.getlogging().warning(
             '{url} Errorcode:40000'.format(url=params.originalurl))
         Logger.printexception()
         return
     # Extract the comments: content, publish time, and nick for each reply row
     for row in rows:
         content = row['reply']['reply']['body']['text']
         curtime = TimeUtility.getuniformtime(str(row['reply']['reply']['post_time']))
         nick = row['reply']['user']['name']
         if not CMTStorage.exist(params.originalurl, content, curtime, nick):
             CMTStorage.storecmt(params.originalurl, content, curtime, nick)
Example #8
 def step1(self, params):
     try: 
         url = params.originalurl
         videoId = params.customized['videoId']
         params.content = params.content[params.content.index('{'):params.content.rindex('}')+1]
         jsonData = json.loads(params.content)['data']
         hasCmts = jsonData['page']['count']
         # Any comments at all?
         if not hasCmts:
             return
         # Incremental check: current comment count vs. the count from the last crawl
         currCmtsCount = jsonData['page']['acount']
         NewsStorage.setcmtnum(url, currCmtsCount)
         prevCmtsCount = int(CMTStorage.getcount(url))
         # Skip if there are no new comments
         if prevCmtsCount >= currCmtsCount:
             return
         # Number of pages needed for the new comments
         pageNum = int(math.ceil(float(hasCmts - prevCmtsCount) / self.pageSize))
         # Cut-off time of the previous crawl
         # Build the paged urls and hand them to the common module
         for page in range(1, pageNum + 1):
             if page == 1:
                 self.step2(params)
             pageUrl = self.pageUrl.format(page = page, videoId = videoId)
             self.storeurl(pageUrl, url, self.STEP_CMTS)
     except:
         Logger.printexception()
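
One pitfall this example originally tripped over and the others avoid: on Python 2, (hasCmts - prevCmtsCount) / self.pageSize is integer division, so math.ceil sees an already-truncated value. Coercing to float first (as fixed above and as done in Example #4) keeps the page count correct:

import math

page_size = 20
new_comments = 7
assert int(math.ceil(float(new_comments) / page_size)) == 1
# without float(), Python 2 evaluates 7 / 20 == 0 and no page is fetched
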
Example #9
 def analysis(self, line, method):
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content,
                                             16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         content = Common.trydecode(content, charset)
         param.content = content
         if 'property' in js:
             for prop in js['property']:
                 if 'result' not in prop:
                     continue
                 if prop['property_name'] == u'page_body':
                     param.page_body = Common.trydecode(
                         Common.urldec(prop['result'][0]['text']),
                         constant.CHARSET_GBK)
                 elif prop['property_name'] == u'page_title':
                     param.page_title = Common.trydecode(
                         Common.urldec(prop['result'][0]['text']),
                         constant.CHARSET_GBK)
                 elif prop['property_name'] == u'html_time':
                     param.html_time = TimeUtility.getuniformtime(
                         prop['result'][0]['text'])
         return param
     except:
         Logger.printexception()
Example #10
    def process(self, proparam):
        Logger.getlogging().info(proparam.url)
        # if NewsStorage.getclicknum(proparam.originalurl) <= 0:
        #     contentid = self.r.parse('^http://www\.kumi\.cn/(\w+)/(\d+)(_\d)?\.html.*', proparam.originalurl)[1]
        #     clickurl = self.VIDEO_CLICKURL.format(contentid=contentid)
        #     self.storeurl(clickurl, proparam.originalurl, self.STEP_TVCLICK)

        try:
            if self.r.search('^http://.*\.kumi\.cn/.*', proparam.originalurl):
                contentid = self.r.parse(
                    '^http://www\.kumi\.cn/(\w+)/(\d+)(_\d)?\.html.*',
                    proparam.originalurl)[1]
                clickurl = self.VIDEO_CLICKURL.format(contentid=contentid)
                self.storeurl(clickurl, proparam.originalurl,
                              self.STEP_TVCLICK)
                if 'donghua' in proparam.originalurl:
                    topicSourceId = re.findall(
                        r'^http://www\.kumi\.cn/donghua/(\d+)(_\d)?\.html',
                        proparam.originalurl)[0][0]
                else:
                    topicSourceId = re.findall(
                        r'^http://xiaoyouxi\.kumi\.cn/(\d+)(_\d)?\.htm',
                        proparam.originalurl)[0][0]
                self.createobject().getcomments(proparam, topicSourceId, 3, 2)
            elif proparam.step == self.STEP_TVCLICK:
                self.setclicknum_tv(proparam)

        except:
            Logger.printexception()
Example #11
 def delete(self, table, where, relation='and', retrycount=0):
     if self.check():
         if where and isinstance(where, dict):
             tempwhere = [
                 ' {key}=\"{value}\" '.format(key=key, value=value)
                 for key, value in where.iteritems()
             ]
              tempwhere = relation.join(tempwhere)  # honor the caller's and/or relation
             sql = SQLDAO.DELETETABLE.format(table=table, where=tempwhere)
         else:
             sql = SQLDAO.DELETETABLE.format(table=table, where=where)
         Logger.getlogging().debug(sql)
         try:
             cur = self.connect.cursor()
             cur.execute(sql)
             cur.close()
             self.connect.commit()
             return True
         except:
             if retrycount == SQLDAO.MAX_RETRY_TIMES:
                 Logger.getlogging().error(sql)
                 Logger.printexception()
                 return False
         retrycount += 1
         return self.delete(table, where, relation, retrycount)
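
Formatting values straight into the WHERE clause, as delete does, is open to SQL injection if any value is attacker-controlled. A hedged sketch of the same operation using DB-API parameter binding instead (%s placeholder style as in MySQLdb-like drivers; table and column names must still come from trusted code):

def delete_rows(connect, table, where, relation='and'):
    """Delete rows matching the column=value pairs in `where` via bound parameters."""
    clause = (' ' + relation + ' ').join('{0} = %s'.format(key) for key in where)
    sql = 'DELETE FROM {0} WHERE {1}'.format(table, clause)
    cur = connect.cursor()
    try:
        cur.execute(sql, tuple(where.values()))  # the driver escapes each value
        connect.commit()
    finally:
        cur.close()
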
Example #12
 def insert(self, table, keys, values, multi=False, retrycount=0):
     if self.check():
         sql = SQLDAO.INSERTTABLE.format(table=table,
                                         keys=', '.join(keys),
                                         values=', '.join(['%s'] *
                                                          len(keys)))
         try:
             cur = self.connect.cursor()
             if not multi:
                 tempvalues = [
                     '\"{value}\"'.format(value=item) for item in values
                 ]
                 Logger.getlogging().debug(sql % tuple(tempvalues))
                 cur.execute(sql, tuple(values))
             else:
                 tempvalues = []
                 for value in values:
                     tempvalue = [
                         '\"{value}\"'.format(value=item) for item in value
                     ]
                     Logger.getlogging().debug(sql % tuple(tempvalue))
                     tempvalues.append(tuple(value))
                 cur.executemany(sql, tempvalues)
             cur.close()
             self.connect.commit()
             return True
         except:
             if retrycount == SQLDAO.MAX_RETRY_TIMES:
                 Logger.getlogging().error(sql)
                 Logger.printexception()
                 return False
         retrycount += 1
      return self.insert(table, keys, values, multi, retrycount)
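
Unlike delete above, insert already binds its values: cur.executemany(sql, tempvalues) reuses the same %s placeholders for every tuple and lets the driver escape each value (the quoted tempvalues exist only for debug logging). A minimal usage sketch, assuming a DB-API cursor and an illustrative table:

def insert_stats(cur, rows):
    """Batch-insert (name, clicks) tuples with one executemany call."""
    sql = 'INSERT INTO stats (name, clicks) VALUES (%s, %s)'
    cur.executemany(sql, rows)  # one parameter binding per row

# insert_stats(cur, [('u1', 5), ('u2', 7)])
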
Example #13
    def setclick(self, params):
        try:
            content = json.loads(params.content)
            # content = [views, comments, X, X, danmaku, favorites, votes, X]
            cmtnum = content[1] or 0
            clicknum = content[0] or 0
            votenum = content[-2] or 0
            fansnum = content[-3] or 0
            NewsStorage.seturlinfo(params.originalurl,
                                   data={
                                       SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: cmtnum,
                                       SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM:
                                       clicknum,
                                       SQLDAO.SPIDER_TABLE_NEWS_VOTENUM:
                                       votenum,
                                       SQLDAO.SPIDER_TABLE_NEWS_FANSNUM:
                                       fansnum
                                   })

        except:
            Logger.printexception()
Example #14
 def step3(self, params):
     """Fetch the comments from the comment url."""
     # Changes from the previous version: comments are stored through
     # CMTStorage.storecmt(originalurl, content, publish time, user);
     # the stored payload now includes the publish time and the user
     try:
         jsondata = json.loads(params.content)
         if jsondata['comments']:
             for comment in jsondata['comments']:
                 content = comment['content']
                 curtime = TimeUtility.getuniformtime(
                     comment['create_time'])
                 nick = comment['passport']['nickname']
                 if not CMTStorage.exist(params.originalurl, content,
                                         curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content,
                                         curtime, nick)
                 reply = comment['comments']
                 while reply:
                     for subcomment in reply:
                         content = subcomment['content']
                         curtime = TimeUtility.getuniformtime(
                             subcomment['create_time'])
                         nick = subcomment['passport'].get(
                             'nickname', 'anonymous')
                         if not CMTStorage.exist(params.originalurl,
                                                 content, curtime, nick):
                             CMTStorage.storecmt(params.originalurl,
                                                 content, curtime, nick)
                         reply = subcomment['comments']
     except:
         Logger.printexception()
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
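
The while/for descent above only follows the reply chain of the last sibling at each level, so deeply nested threads can be missed. A hedged alternative that visits every nested reply with an explicit stack (store stands in for the CMTStorage calls):

def walk_replies(comments, store):
    """Visit every comment and all nested replies, depth-first."""
    stack = list(comments)
    while stack:
        node = stack.pop()
        store(node['content'],
              node['create_time'],
              node.get('passport', {}).get('nickname', 'anonymous'))
        stack.extend(node.get('comments') or [])
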
Example #15
 def baidutiebasearch_step3(self, params):
     soup = BeautifulSoup(params.content, 'html5lib')
     post_list = soup.select('.s_post_list > .s_post')
     urllist = []
     for item in post_list:
         try:
             title = item.select_one('.p_title > a').get_text().strip()
             href = item.select_one('.p_title > a').get('href') 
             pubtimeobj = item.find(attrs={'class':'p_green p_date'})
             if not pubtimeobj:
                 Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
                 continue
             pubtime = pubtimeobj.get_text()
             pubtime = getuniformtime(pubtime)
             Logger.getlogging().debug(title)
             Logger.getlogging().debug(pubtime)
             if self.isyestoday(pubtime):
                 Logger.getlogging().debug('https://tieba.baidu.com'+href)
                 urllist.append('https://tieba.baidu.com'+href) 
             else:
                 Logger.log(params.url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)        
Example #16
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            #print soup
            #searchListOne = soup.select('.searchListOne > ul')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # drop the trailing <li id=search_msg style="display:none"></li>
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                #print '*********',url
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            #traceback.print_exc()
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
Example #17
 def step2(self, params):
     """Build the comment-page urls."""
     try:
         newsId = params.customized['newsId']
         jsondata = json.loads(params.content)
         backflag = False
         if jsondata:
             comments = []
             for comment in jsondata:
                 cmti = CommentInfo()
                 if URLStorage.storeupdatetime(params.originalurl,
                                               str(comment['commentTime'])):
                     cmti.content = comment['commentContent']
                     cmti.commentid = comment['commentId']
                     comments.append(cmti)
                 else:
                     backflag = True
             self.commentstorage.store(params.originalurl, comments)
             if not backflag:
                 self.pageno += 1
                 comment_url = self.COMMENTS_URL.format(
                     self.pageno, self.page_size, newsId)
                 self.storeurl(comment_url, params.originalurl,
                               self.STEP_COMMENT_FIRST_PAGE,
                               {'newsId': newsId})
     except:
         Logger.printexception()
Example #18
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
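
Stripped of the framework helpers, scanning is a plain poll/backup/dispatch loop. A self-contained sketch of the same shape (paths, interval, and handle are illustrative stand-ins for the configured values and download()):

import os
import shutil
import time

def scan_forever(scanning_path, backup_path, handle, interval=5):
    """Poll scanning_path; back up and handle each non-tmp file, else sleep."""
    while True:
        busy = False
        for name in os.listdir(scanning_path):
            src = os.path.join(scanning_path, name)
            if os.path.isfile(src) and 'tmp' not in name:
                shutil.copy(src, os.path.join(backup_path, name))
                handle(src)
                busy = True
        if not busy:
            time.sleep(interval)
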
Example #19
 def step2(self, params):
     soup = BeautifulSoup(params.content, 'html5lib')
     if soup.find(attrs={"id":re.compile('noresult_part._container')}):
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     results = soup.select('.results > .vrwrap')
     if not results:
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     urllist = []
     newurllist = []
     for item in results:
         try:
             if not item.select_one('h3.vrTitle > a'):
                 continue
             title = item.select_one('h3.vrTitle > a').get_text()
             href = item.select_one('h3.vrTitle > a').get('href')
             timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
             times = getuniformtime(timestr)
             Logger.getlogging().debug('title:'+ title)
             Logger.getlogging().debug('time:'+ times)
             if compareNow(times, self.querylastdays):
                 Logger.getlogging().debug('href:'+ href)
                 urllist.append(href)
             newitem = item.select_one('#news_similar')
             if newitem:
                 newhref = 'http://news.sogou.com/news'+newitem.get('href')
                 Logger.getlogging().debug('newhref:'+ newhref)
                 newurllist.append(newhref)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)      
     if len(newurllist) > 0:
         self.__storeqeuryurllist__(newurllist, self.NEWS_EACH_2)
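
Both search scrapers (Examples #16 and #19) keep only results published within the last querylastdays days. getuniformtime and compareNow are project helpers; a rough stdlib approximation of that freshness check:

from datetime import datetime, timedelta

def fresh_enough(pubtime_str, last_days, fmt='%Y-%m-%d %H:%M'):
    """True if pubtime_str falls within the last `last_days` days."""
    pubtime = datetime.strptime(pubtime_str, fmt)
    return pubtime >= datetime.now() - timedelta(days=last_days)

# e.g. fresh_enough('2017-06-01 12:00', 3)
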
Example #20
    def process(self, params):
        try:
            if params.step is None:
                # Pull the parameters for the comment url out of the html source
                operaId = self.r.getid('operaId', params.content, '\s*:\s*')
                contentId = self.r.getid('contentId', params.content,
                                         '\s*:\s*')
                # Build the first comment-page url
                comments_url = Comments.COMMENTS_URL % (operaId, contentId, 1,
                                                        Comments.PAGE_SIZE)
                # Ask the download platform to fetch the first page of comments from this url
                self.storeurl(comments_url, params.originalurl,
                              Comments.STEP_2, {
                                  'operaId': operaId,
                                  'contentId': contentId
                              })

            # Got the first page of comments; loop to build the remaining comment urls
            elif params.step == Comments.STEP_2:
                self.step2(params)

            # Parse the comment data
            elif params.step == Comments.STEP_3:
                self.step3(params)
        except:
            Logger.printexception()
Example #21
    def step2(self, params):
        try:
            Logger.getlogging().info("Kr36Comments.STEP_2")
            # cid is passed down from STEP_1
            cid = params.customized['cid']

            jsoncontent = json.loads(params.content)
            comments_count = jsoncontent['data']['total_items']
            page_count = jsoncontent['data']['total_pages']
            # Incremental check: skip if there are no new comments
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return

            # Fetch at most ten pages of comments
            # page_num = int(math.ceil(float(comments_count - cmtnum) / self.page_size))
            if page_count >= self.maxpages:
                page_count = self.maxpages
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)

            for page in range(1, page_count + 1, 1):
                commentinfo_url = Kr36Comments.COMMENT_URL.format(cid, self.page_size, page)
                self.storeurl(commentinfo_url, params.originalurl, Kr36Comments.STEP_3, lasttime)
        except:
            Logger.printexception()
Example #22
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        os.mknod(writeTmpfile)
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except:
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
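
readFile leans on the write-to-.tmp-then-rename idiom so downstream consumers never pick up a half-written .done file. The idiom on its own (path handling illustrative):

import os

def atomic_write(path, lines):
    """Write lines to path via a temp file and a final rename."""
    tmp = path + '.tmp'
    with open(tmp, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    os.rename(tmp, path)  # atomic on POSIX when both paths share a filesystem
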
Example #23
 def step1(self, params):
     """获取评论的首页url"""
     try:
         # Latest publish time recorded by the previous crawl
         before_update = CMTStorage.getlastpublish(params.originalurl)
         # Collect the remaining ids and build the url
         url_id = None
         if self.r.search('^http[s]{0,1}://v\.qq\.com/.*',
                          params.originalurl):
             #{"comment_id":"1167760750","result":{"code":0,"msg":"Success!","ret":0},"srcid":"c0016r7fo07","srcid_type":1001}
             url_id = self.r.getid('comment_id', params.content)
         else:
             url_id = self.r.getid('cmt_id', params.content)
             if not url_id:
                 url_id = self.r.getid('aid', params.content)
             if not url_id:
                 url_id = self.r.getid('commId', params.content)
         if url_id:
             comment_url = self.COMMENTS_URL.format(url_id, 0,
                                                    self.page_size)
             self.storeurl(
                 comment_url, params.originalurl,
                 self.STEP_COMMENT_NEXT_PAGE, {
                     'url_id': url_id,
                     'comment_id': 0,
                     'before_update': before_update
                 })
     except:
         Logger.printexception()
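
The sample response quoted in the comment shows what r.getid mines out of the page: a "comment_id":"1167760750" pair. A standalone approximation with plain re (getid itself is a project helper; this regex is an assumption about its behavior):

import re

page = '{"comment_id":"1167760750","result":{"code":0,"msg":"Success!","ret":0}}'
match = re.search(r'comment_id["\']?\s*[:=]\s*["\']?(\w+)', page)
comment_id = match.group(1) if match else None
assert comment_id == '1167760750'
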
Example #24
    def step2_ifeng_xiaobg(self, params):
        try:
            jsoncontent = json.loads(params.content)
            clicknum = float(jsoncontent.get('join_count', '-1'))
            if clicknum > 0:
                NewsStorage.setclicknum(params.originalurl, clicknum)
            curcmtnum = jsoncontent['count']
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return
            # Loop over the comment-page urls
            pages = int(math.ceil(
                float(curcmtnum - dbcmtnum) / self.page_size))
            if pages >= self.maxpages:
                pages = self.maxpages
            for index in range(1, pages + 1, 1):
                if index == 1:
                    self.ifengnews_step3(params)
                    continue
                self.post_data['p'] = index
                self.storeposturl(self.post_url, params.originalurl,
                                  self.IFENG_NEWS_NEXT_PAGE,
                                  self.post_data)
        except:
            Logger.printexception()
Example #25
 def gets2url(self, params):
     # Parse the content
     contents = json.loads(params.content)
     query = Common.urldec(params.customized['query'])
     urllist = []
     for item in contents['video_list']:
         try:
             vid = item['vid']
             if item.get('categoryName', '') == u"体育":  # "sports" category
                 url = 'http://sports.le.com/video/{vid}.html'.format(
                     vid=vid)
             else:
                 url = 'http://www.le.com/ptv/vplay/{vid}.html'.format(
                     vid=vid)
             curtime = item['ctime']
             #print TimeUtility.getuniformtime(curtime)
             title = item['name']
             if self.compareNow(curtime):
                 if self.checktitle(query, title):
                     #Logger.getlogging().info(title)
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
     # Store the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #26
    def step2(self, params):
        """Build the remaining comment-page urls."""
        try:
            comments = json.loads(params.content)
            topic_id = comments['topic_id']
            curcmtnum = float(comments.get('cmt_sum', -1))
            #clicknum = float(comments.get('participation_sum',-1))
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
            #NewsStorage.setclicknum(params.originalurl, clicknum)

            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return
            page_num = int(
                math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1):
                if self.r.search('http[s]{0,1}://.*tv\.sohu.com/.*',
                                 params.originalurl):
                    url = self.COMMENTS_URL.format(self.tv_client_id, topic_id,
                                                   page, self.tv_page_size)
                else:
                    url = self.COMMENTS_URL.format(self.client_id, topic_id,
                                                   page, self.page_size)
                self.storeurl(url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except:
            Logger.printexception()
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
Example #27
    def step2(self, params):
        keyword = params.customized['keyword']
        query = Common.urldec(keyword)
        jsondata = json.loads(params.content)
        # Pull the result html out of the json response
        html = jsondata['html']
        soup = bs(html, 'html5lib')
        videoUrlList = []

        videoList = soup.select('li.video')
        for video in videoList:
            try:
                videoUrl = 'https:' + video.select_one('a').get('href')
                videoUrl = videoUrl.split('?')[0] + '/'
                title = video.select_one('a').get('title')
                pubtime = video.find(attrs={
                    'class': 'so-icon time'
                }).get_text().strip()
                if self.compareNow(TimeUtility.getuniformtime(pubtime)):
                    if self.checktitle(query, title):
                        videoUrlList.append(videoUrl)
                        self.__storeurl__(videoUrl, pubtime,
                                          SPIDER_S2_WEBSITE_VIDEO)
                    else:
                        Logger.log(videoUrl,
                                   constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                Logger.printexception()
Example #28
    def process(self, proparam):
        Logger.getlogging().info(proparam.url)
        try:
            if proparam.step == LeiphoneComments.STEP_1:
                # Extract the article id from the page
                articleId = self.r.getid('data-article_id', proparam.content)
                comments_url = LeiphoneComments.COMMENTS_URL % (articleId)
                self.storeurl(comments_url, proparam.originalurl,
                              LeiphoneComments.STEP_2,
                              {'articleId': articleId})

            elif proparam.step == LeiphoneComments.STEP_2:
                articleId = proparam.customized['articleId']
                comments = proparam.content[
                    proparam.content.index('{'):proparam.content.rindex('}') +
                    1]
                comments = json.loads(comments)
                comments_count = float(comments['allCount']['num'])
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
                # Check the comment count
                if int(comments_count) == 0:
                    return

                # Incremental check
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return

                # Extract the comments
                self.geturlcomments(proparam)

                # Build the comment url
                # comments_url = LeiphoneComments.COMMENTS_URL % (articleId)
                # self.storeurl(comments_url, proparam.originalurl, LeiphoneComments.STEP_3)

            elif proparam.step == LeiphoneComments.STEP_3:
                return
                # # Regex for extracting the comments
                # comments = re.findall(r'content":"(.+?)","paragraph_id"', proparam.content)
                # commentsInfo = []
                # commentsTime = self.r.parse(r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
                # # Extract the comments
                # index = 0
                # for comment in comments:
                #     comment = eval('u"' + comment + '"')
                #     cmti = CommentInfo()
                #     cmti.content = comment.encode('utf-8')
                #     if URLStorage.storeupdatetime(proparam.originalurl, getuniformtime(commentsTime[index])):
                #         commentsInfo.append(cmti)
                #     index = index + 1
                #
                # # Save the extracted comments
                # if len(commentsInfo) > 0:
                #     self.commentstorage.store(proparam.originalurl, commentsInfo)
            else:
                return

        except:
            Logger.printexception()
Example #29
 def set_click(self, params):
     try:
         vid = params.customized['vid']
         data = json.loads(params.content)
         clicknum = data['data'][vid]
         NewsStorage.setclicknum(params.originalurl, clicknum)
     except:
         Logger.printexception()
Example #30
 def process(self, params):
     try:
         if params.step == self.STEP_PAGES:
             self.step1(params)
         elif params.step == self.STEP_VIDEOS:
             self.step2(params)
     except:
         Logger.printexception()