def step2_ifeng_xiaobg(self, params):
    """Store click/comment counts from the JSON summary and queue comment pages.

    Reads ``join_count`` and ``count`` from the JSON in ``params.content``
    and writes them through ``NewsStorage``. When the reported count is
    larger than what ``CMTStorage.getcount`` returns, page 1 is handed to
    ``ifengnews_step3`` and one POST request is queued per further page,
    capped at ``self.maxpages``.

    Exceptions are logged through ``Logger.printexception`` and not re-raised.
    """
    try:
        jsoncontent = json.loads(params.content)
        clicknum = float(jsoncontent.get('join_count', '-1'))
        if clicknum > 0:
            NewsStorage.setclicknum(params.originalurl, clicknum)
        # Coerce to a number so the comparison and subtraction below stay
        # numeric even when the API delivers the count as a string.
        curcmtnum = int(jsoncontent['count'])
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            return
        # Number of pages needed to cover the comments not stored yet.
        pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
        if pages >= self.maxpages:
            pages = self.maxpages
        for index in range(1, pages + 1):
            if index == 1:
                # The first page is handled directly by step 3.
                self.ifengnews_step3(params)
                continue
            # NOTE(review): the page number is written to self.post_data while
            # IfengNewsComments.post_data is what gets passed on; presumably
            # both refer to the same dict -- confirm.
            self.post_data['p'] = index
            self.storeposturl(self.post_url, params.originalurl,
                              self.IFENG_NEWS_NEXT_PAGE,
                              IfengNewsComments.post_data)
    except Exception:
        Logger.printexception()
def setclick(self, params):
    """Store the play count and the up-vote count found in the page content.

    Both values are looked up with ``self.r.getid`` (keys ``play_count`` and
    ``up``); each is written to ``NewsStorage`` only when it is truthy.
    """
    content = params.content
    plays = self.r.getid('play_count', content)
    ups = self.r.getid('up', content)
    if plays:
        NewsStorage.setclicknum(params.originalurl, plays)
    if ups:
        NewsStorage.setvotenum(params.originalurl, ups)
def ifengnews_step2(self, params):
    """Store counts from the JSON summary and queue the remaining comment pages.

    ``params.customized['oriurl']`` supplies the URL inserted into
    ``IfengNewsComments.COMMENTS_URL``. Page 1 is passed to
    ``ifengnews_step3``; pages 2..N are queued with ``storeurl``. Any
    exception is logged via ``Logger.printexception`` and swallowed.
    """
    try:
        source_url = params.customized['oriurl']
        summary = json.loads(params.content)

        joined = float(summary.get('join_count', '-1'))
        if joined > 0:
            NewsStorage.setclicknum(params.originalurl, joined)

        site_total = float(summary['count'])
        NewsStorage.setcmtnum(params.originalurl, site_total)
        stored_total = CMTStorage.getcount(params.originalurl, True)
        if stored_total >= site_total:
            return

        # Pages needed for the comments that are not stored yet, capped.
        missing = float(site_total - stored_total)
        page_total = int(math.ceil(missing / self.page_size))
        if page_total >= self.maxpages:
            page_total = self.maxpages

        # The first page is processed directly from the current response.
        if page_total >= 1:
            self.ifengnews_step3(params)
        # Build the URL of every page after the first one.
        for page_no in range(2, page_total + 1):
            next_url = IfengNewsComments.COMMENTS_URL.format(
                oriurl=source_url, pg=page_no, ps=self.page_size)
            self.storeurl(next_url, params.originalurl,
                          IfengNewsComments.IFENG_NEWS_NEXT_PAGE)
    except:
        Logger.printexception()
def setclick(self, params):
    """Extract the play count from the page HTML and store it as the click number.

    When the page has an episode list (``.mod_episode > .item``) the total
    read from ``#mod_cover_playnum`` is divided by the number of episodes.
    Otherwise the ``figure_list`` / ``figures_list`` entries are scanned for
    the item whose link carries the same id as ``params.originalurl`` and
    that item's play-count text is stored.

    Items without a usable link, and an episode page without a total
    counter, are skipped instead of raising.
    """
    soup = BeautifulSoup(params.content, 'html5lib')
    # TV series: average the cover total over the episodes.
    itemcount = soup.select('.mod_episode > .item')
    if itemcount:
        totalobj = soup.select_one('#mod_cover_playnum')
        if totalobj is None:
            return
        total = self.str2num(totalobj.get_text())
        clicknum = total / len(itemcount)
        NewsStorage.setclicknum(params.originalurl, clicknum)
        return
    # Everything else: find the list item that links to this very page.
    parentid = params.originalurl.split('.')[-2].split('/')[-1]
    # Compile once instead of on every loop iteration.
    list_class = re.compile('^figures?_list$')
    item_class = re.compile('list_item')
    num_class = re.compile('num _video_playnum|figure_num')
    for fitem in soup.find_all(attrs={'class': list_class}):
        for item in fitem.find_all(attrs={'class': item_class}):
            link = item.select_one('a')
            childurl = link.get('href', None) if link is not None else None
            if not childurl:
                continue
            urlparts = childurl.split('.')
            if len(urlparts) < 2:
                # No extension-style suffix, so no id can be derived.
                continue
            childid = urlparts[-2].split('/')[-1]
            if childid == parentid:
                numobj = item.find(attrs={'class': num_class})
                if not numobj:
                    continue
                clicknum = self.str2num(numobj.get_text())
                NewsStorage.setclicknum(params.originalurl, clicknum)
                return
def step1(self, params):
    """Store the popularity figure and queue the first comment page.

    The click number is taken from the page HTML only when
    ``NewsStorage.getclicknum`` is not positive yet. The comic id is parsed
    from ``params.originalurl``; if the URL does not match, or the id is 0,
    nothing is queued.
    """
    Logger.getlogging().info("MkzhanComments.STEP_1")
    if NewsStorage.getclicknum(params.originalurl) <= 0:
        click_pattern = r'<span>人气:\s<b>(.*?)<\/b>'
        if self.r.search(click_pattern, params.content):
            clicknum = self.r.parse(click_pattern, params.content)[0]
            NewsStorage.setclicknum(params.originalurl, clicknum)
    # Get the comic id; check the match before indexing so that a URL of a
    # different shape does not raise IndexError.
    ids = self.r.parse(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*',
                       params.originalurl)
    if not ids:
        Logger.getlogging().warning(
            '{url}: no comic id in url'.format(url=params.originalurl))
        return
    comic_id = int(ids[0])
    if not comic_id:
        return
    # Build the URL of the first comment page.
    comments_url = MkzhanComments.COMMENTS_URL % (comic_id, 1, self.PAGE_SIZE)
    self.storeurl(comments_url, params.originalurl, MkzhanComments.STEP_2,
                  {'comic_id': comic_id})
def geturlplaycount(self, params):
    """Look up the play count keyed by the video id and store it.

    ``params.customized['tvid']`` is used both as the search pattern and as
    the key for ``self.r.getid``. If it does not occur in ``params.content``
    a warning code is logged for ``params.url`` and nothing is stored.
    """
    video_key = params.customized['tvid']
    if self.r.search(video_key, params.content):
        plays = self.r.getid(video_key, params.content)
        if plays is not None:
            NewsStorage.setclicknum(params.originalurl, plays)
    else:
        Logger.log(params.url, constant.ERRORCODE_WARNNING_OTHERS)
def setclick(self, proparam):
    """Parse the counter next to the ``icon-a-collect`` icon and store it.

    The first regex match in ``proparam.content`` is converted with
    ``self.str2num`` and written as the click number. If the markup is not
    present nothing is stored. Exceptions are logged and swallowed.
    """
    try:
        matches = re.findall(
            r'"jm-icon icon-a-collect"></i><span>(.*?)</span>',
            proparam.content)
        if not matches:
            # Previously the repr of the (possibly empty) list was converted.
            return
        clicknum = self.str2num(matches[0])
        Logger.getlogging().debug('{url} clicknum:{clicknum}'.format(
            url=proparam.originalurl, clicknum=clicknum))
        NewsStorage.setclicknum(proparam.originalurl, clicknum)
    except Exception:
        Logger.printexception()
def set_click(self, params):
    """Store the click number found under ``data[vid]`` in the JSON response.

    ``vid`` comes from ``params.customized``. Any exception is logged via
    ``Logger.printexception`` and swallowed.
    """
    try:
        video_key = params.customized['vid']
        per_video = json.loads(params.content)['data']
        NewsStorage.setclicknum(params.originalurl, per_video[video_key])
    except:
        Logger.printexception()
def setclicknum(self,params):
    """Store ``cmtVote`` as click number and ``createTime`` as publish date.

    Both keys are read from the JSON in ``params.content`` before anything is
    written; the publish date goes through ``TimeUtility.getuniformtime``.
    Any exception is logged via ``Logger.printexception`` and swallowed.
    """
    try:
        payload = json.loads(params.content)
        votes, created = payload['cmtVote'], payload['createTime']
        NewsStorage.setclicknum(params.originalurl, votes)
        uniform_date = TimeUtility.getuniformtime(created)
        NewsStorage.setpublishdate(params.originalurl, uniform_date)
    except:
        Logger.printexception()
def getclicknum(self, params):
    """Store the integer at ``data.video_id[<video_id>]`` as click number.

    ``video_id`` is taken from ``params.customized``. If the JSON cannot be
    parsed or the value is missing / not an integer, a ``30001`` warning is
    logged and nothing is stored.
    """
    try:
        payload = json.loads(params.content)
        key = params.customized['video_id']
        clicks = int(payload['data']['video_id'][key])
    except:
        Logger.getlogging().warning('{}:30001'.format(params.originalurl))
    else:
        NewsStorage.setclicknum(params.originalurl, clicks)
def setclick(self, params):
    """Parse the play count from the ``<li>播放:...</li>`` element and store it.

    The text between the label and the first closing ``</li>`` is converted
    with ``self.str2num``. Nothing is stored when the element is absent.
    Exceptions are logged and swallowed.
    """
    try:
        # Non-greedy capture up to the first closing tag. The former
        # '(.*)</li>?' was greedy and ran on to the last '</li' on the line.
        pattern = u'<li>播放:(.*?)</li>'
        if self.r.search(pattern, params.content):
            clicknum = self.r.parse(pattern, params.content)[0]
            clicknum = self.str2num(clicknum)
            NewsStorage.setclicknum(params.originalurl, clicknum)
    except Exception:
        Logger.printexception()
def step_click(self, params):
    """Store play count and publish date of the entry whose id equals ``sid``.

    ``params.content`` is a JSON list of entries; the first one whose ``id``
    equals ``str(params.customized['sid'])`` supplies ``playtimes`` (converted
    with ``self.str2num``) and ``adddate`` (converted with
    ``TimeUtility.getuniformtime``). Later entries are ignored.
    """
    sid = params.customized['sid']
    for entry in json.loads(params.content):
        if entry['id'] != str(sid):
            continue
        published = TimeUtility.getuniformtime(entry['adddate'])
        plays = self.str2num(entry['playtimes'])
        NewsStorage.setclicknum(params.originalurl, plays)
        NewsStorage.setpublishdate(params.originalurl, published)
        break
def getclick(self, params):
    """Store the number inside ``<click>...</click>`` as the click number.

    If the tag is not found, ``ERRORCODE_SITE_NOGET_XPATHVALUE`` is logged
    for ``params.url``.
    """
    # The flattened response goes to the debug log instead of stdout.
    Logger.getlogging().debug(
        params.content.replace('\n', ' ').replace('\r', ''))
    # A second, identical pattern and its unreachable branch were dropped.
    pattern = r'<click>(\d+)</click>'
    if self.r.search(pattern, params.content):
        click = self.r.parse(pattern, params.content)[0]
        NewsStorage.setclicknum(params.originalurl, int(click))
    else:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
def setclicknum(self, params):
    """Store the ``G_PLAY_VV`` total, divided by the subtype count if present.

    The total is read from the ``var G_PLAY_VV = { total:"..." }`` snippet in
    ``params.content`` with thousands separators removed. When a ``subtype``
    array is also present and ``self.count`` of it is non-zero, the total is
    divided by that count. Nothing is stored when the total is missing.
    """
    pattern = r'var\sG_PLAY_VV\s=\s\{\stotal:"(.*)"\s\}'
    if not self.r.search(pattern, params.content):
        return
    playcount = self.r.parse(pattern, params.content)[0]
    playcount = playcount.replace(',', '')
    pattern2 = r'subtype:\s(\[.*\])?,'
    if self.r.search(pattern2, params.content):
        subtype = self.r.parse(pattern2, params.content)[0]
        # The group is optional, so the capture can be empty; eval('')
        # would raise SyntaxError.
        if subtype:
            # SECURITY(review): eval() runs text taken from a downloaded
            # page. Consider ast.literal_eval once the array format is
            # confirmed to be a plain literal.
            subtype = eval(subtype)
            num = self.count(subtype)
            if num != 0:
                playcount = int(playcount) / num
    NewsStorage.setclicknum(params.originalurl, playcount)
def yuedu_step1(self, params):
    """Queue the first comment page(s) for a NetEase Yuedu URL and store clicks.

    For ``/source/`` URLs one request (``types='2'``) is queued. For all
    other URLs two requests are queued: ``types='1'`` with the first
    ``word_digits`` id found in the URL, then ``types='0'`` with the second.
    Afterwards the click number is parsed from the page; failures in that
    last part are logged and swallowed.
    """
    page_url = params.originalurl
    field = self.r.parse('^http://.*yuedu.163.com/(.*?)/.*', page_url)[0]
    if field == "source":
        source_id = self.r.parse(
            r'^http://.*yuedu.163.com/source/(\w+_\d)', page_url)[0]
        targets = [('2', source_id)]
    else:
        found = re.findall(r'(\w+_\d+)', page_url)
        second_id = found[1]
        first_id = found[0]
        targets = [('1', first_id), ('0', second_id)]

    for types, idvalue in targets:
        comment_url = YueduComments.COMMENT_URL.format(
            types=types, id=idvalue, pageno='1')
        self.storeurl(comment_url, page_url, YueduComments.YUEDU_STEP_2,
                      {'id': idvalue, 'field': 'yuedu', 'types': types})

    try:
        raw_clicks = self.r.parse(u'<td>点击:</td><td>(.*?)</td>',
                                  params.content)[0]
        clicks = self.str2num(raw_clicks)
        Logger.getlogging().debug('{url} clicknum:{clicknum}'.format(
            url=page_url, clicknum=clicks))
        NewsStorage.setclicknum(page_url, clicks)
    except:
        Logger.printexception()
def step2(self, params):
    """Store metadata from the first comment response and queue further pages.

    For video URLs the publish date is stored; for other sina.com URLs the
    total count is stored as click number when none is stored yet. The
    ``show`` count is stored as comment number. If it exceeds what
    ``CMTStorage.getcount`` returns, page 1 is handled by ``self.step3`` and
    the remaining pages (capped at ``self.maxpages``) are queued.
    """
    custom = params.customized
    newsid = custom['newsid']
    channel = custom['channel']
    group = custom['group']
    page_url = params.originalurl

    comments = json.loads(params.content)
    if not self.isvalid(comments):
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=page_url))
        return

    if self.r.search(r'http[s]{0,1}://.*video\.sina\.com.*', page_url):
        # Video pages: take the publish date.
        published = comments['result']['news']['time']
        NewsStorage.setpublishdate(page_url,
                                   TimeUtility.getuniformtime(published))
    elif self.r.search(r'http[s]{0,1}://.*\.sina\.com.*', page_url):
        # News pages: take the click number if none is stored yet.
        if NewsStorage.getclicknum(page_url) <= 0:
            try:
                NewsStorage.setclicknum(
                    page_url, comments['result']['count']['total'])
            except:
                Logger.printexception()

    # Store the comment number.
    shown = int(comments['result']['count']['show'])
    NewsStorage.setcmtnum(page_url, shown)
    stored = CMTStorage.getcount(page_url, True)
    if stored >= shown:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=page_url))
        return

    page_total = int(math.ceil(float(shown - stored) / self.DEFAULT_PAGE_SIE))
    if page_total >= self.maxpages:
        page_total = self.maxpages

    # Page 1 is already in hand; only later pages need a download.
    if page_total >= 1:
        self.step3(params)
    for page_no in range(2, page_total + 1):
        next_url = CommonComments.SINA_COMMENTS_URL.format(
            channel=channel, newsid=newsid, pn=page_no,
            ps=SinaComments.DEFAULT_PAGE_SIE)
        if group:
            next_url = next_url + '&group=' + group
        self.storeurl(next_url, page_url, SinaComments.STEP_COMMENT_NEXT_PAGE)
def step1(self, params):
    """Work out domain/path for a Duowan page and queue the follow-up request.

    For ``video`` pages the domain and path are taken from variables embedded
    in the page when present; otherwise they are cut out of
    ``params.originalurl`` (text after the first 7 characters, up to and
    including ``.html``). ``tu`` pages queue the gallery request, all others
    the comment-count request. Finally the page is checked for the new video
    layout and for a click counter.
    """
    page_url = params.originalurl
    field = self.r.parse(r'^http://(\w+)\.duowan\.com*', page_url)[0]

    embedded = []
    if field == 'video':
        embedded = self.r.parse(
            r'qianwangbieyongzhegebianliang_domain\s*=\s*"(.*?)";',
            params.content)
    if len(embedded) > 0:
        domain = embedded[0]
        pUrl = self.r.parse(
            r'qianwangbieyongzhegebianliang_url\s*=\s*"(.*?)";',
            params.content)[0]
    else:
        domainUrl = page_url[7:]
        first_slash = domainUrl.index('/')
        domain = domainUrl[0:first_slash]
        pUrl = domainUrl[first_slash:domainUrl.index('.html') + 5]

    if field == 'tu':
        # Picture galleries.
        gallery_id = domainUrl[domainUrl.rfind('/') + 1:domainUrl.index('.html')]
        getByGallery_url = ('http://tu.duowan.com/index.php?r=show/getByGallery/&gid='
                            + gallery_id)
        Logger.getlogging().debug(getByGallery_url)
        self.storeurl(getByGallery_url, page_url, DuowanComments.STEP_2_TU,
                      {'domain': domain, 'pUrl': pUrl})
    else:
        uniqid = self.r.getid('comment3Uniqid', params.content, r'\s*=\s*')
        # Build the URL that returns the total comment count.
        comment_counts_url = DuowanComments.COMMENT_COUNTS_URL % (
            uniqid, domain, pUrl)
        self.storeurl(comment_counts_url, page_url, DuowanComments.STEP_2,
                      {'uniqid': uniqid, 'domain': domain, 'pUrl': pUrl})

    # New video layout: the real page lives under '.cn' instead of '.com'.
    soup = BeautifulSoup(params.content, 'html5lib')
    if soup.select_one('#dw-video-wrap'):
        real_originalurl = page_url.replace('.com', '.cn')
        self.storeurl(real_originalurl, page_url, DuowanComments.STEP_1)
    counter = soup.select_one('.vcol-main-hd > strong')
    if counter:
        NewsStorage.setclicknum(page_url, self.str2num(counter.get_text()))
def process(self, params):
    """Drive the Youku comment crawl for one downloaded page.

    * ``STEP_1``: read ``videoId`` from the page, queue the first comment
      page and the play-info request; for ``v.laifeng.com`` URLs store
      ``onlineNum`` as click number.
    * ``STEP_2``: either store play/up counts from the play-info response,
      or read the comment totals and queue one ``STEP_3`` request per
      comment page (at most ``self.maxpages``).
    * ``STEP_3``: store every comment that ``CMTStorage.exist`` does not
      report as already present.

    Exceptions are logged through ``Logger.printexception`` and not re-raised.
    """
    try:
        if params.step == YoukuComments.STEP_1:
            # Parameters needed to build the comment URL.
            objectId = self.r.getid('videoId', params.content, r'\s*:\s*"')
            pTime = str(int(time.mktime(
                datetime.datetime.now().timetuple()) * 1000))
            # Signature expected by the comment API.
            sign = MD5().m(
                '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
            # URL of the first comment page.
            comments_url = YoukuComments.COMMENTS_URL % (
                objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
            self.storeurl(comments_url, params.originalurl,
                          YoukuComments.STEP_2, {'objectId': objectId})
            # Laifeng live streams: viewer count serves as click number.
            if self.r.search(r'^http://v\.laifeng\.com/\d+',
                             params.originalurl):
                clicknum = int(self.r.getid('onlineNum', params.content))
                NewsStorage.setclicknum(params.originalurl, clicknum)
            if objectId:
                playinfo_url = YoukuComments.PLAYINFO_URL.format(vid=objectId)
                self.storeurl(playinfo_url, params.originalurl,
                              YoukuComments.STEP_2, {'objectId': objectId})
        elif params.step == YoukuComments.STEP_2:
            if re.findall(r'getVideoPlayInfo\?vid', params.url):
                # Play-info response: slice off the 20-character prefix and
                # the 2-character suffix around the JSON body.
                playinfo = json.loads((params.content)[20:-2])
                clicknum = int(
                    playinfo['data']['stat']['vv'].replace(',', ''))
                votenum = int(
                    playinfo['data']['updown']['up'].replace(',', ''))
                NewsStorage.setclicknum(params.originalurl, clicknum)
                NewsStorage.setvotenum(params.originalurl, votenum)
            else:
                objectId = params.customized['objectId']
                pTime = str(int(time.mktime(
                    datetime.datetime.now().timetuple()) * 1000))
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                comments = json.loads(params.content)
                # A missing and an empty 'data' entry are treated alike.
                if not comments.get('data'):
                    Logger.getlogging().warning(
                        "{url}:30000 No comments!".format(
                            url=params.originalurl))
                    return
                # Incremental check: convert the count itself, not the
                # result of the comparison.
                comments_count = int(comments['data']['totalSize'])
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if comments_count <= cmtnum:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)
                comments_pages = int(comments['data']['totalPage'])
                if comments_pages == 0:
                    return
                # Too many comments: only take the first maxpages pages.
                if comments_pages >= self.maxpages:
                    comments_pages = self.maxpages
                # Pages are 1-based; request exactly 1..comments_pages.
                for page in range(1, comments_pages + 1):
                    commentUrl = YoukuComments.COMMENTS_URL % (
                        objectId, page, YoukuComments.PAGE_SIZE, sign, pTime)
                    self.storeurl(commentUrl, params.originalurl,
                                  YoukuComments.STEP_3,
                                  {'objectId': objectId})
        elif params.step == YoukuComments.STEP_3:
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['data']['comment']:
                # NOTE(review): str() on non-ASCII unicode raises under
                # Python 2 unless the default encoding was changed -- confirm.
                content = str(comment['content'])
                curtime = TimeUtility.getuniformtime(
                    int(comment['createTime']))
                nick = comment['user']['userName']
                if not CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)
    except Exception:
        Logger.printexception()
def process(self, params):
    """Drive the fun.tv comment crawl for one downloaded page.

    * ``step is None``: parse the gallery id from ``params.originalurl``,
      queue the first comment page, and for pages with a ``torrent-panel``
      store the play count divided by the number of listed entries.
    * ``STEP_2``: store the comment total and, when it exceeds the stored
      count, pass page 1 to ``self.step3`` and queue the remaining pages.
    * ``STEP_3``: delegate to ``self.step3``.

    Any exception is logged via ``Logger.printexception`` and swallowed.
    """
    try:
        page_url = params.originalurl
        if params.step is None:
            vplay_pattern = (
                r'^http[s]{0,1}://www\.fun\.tv/vplay/\w-(\d+)(\.\w-\d+)?/$')
            if not self.r.search(vplay_pattern, page_url):
                return
            galleryid = self.r.parse(vplay_pattern, page_url)[0][0]
            # Queue the first comment page.
            first_url = FunComments.COMMENTS_URL % (galleryid, 1)
            self.storeurl(first_url, page_url, FunComments.STEP_2,
                          {'galleryid': galleryid})
            # Read the count straight from the page; only TV series list
            # several entries on one page and need the division below.
            doc = XPathUtility(params.content)
            if doc.xpath('//*[@class="torrent-panel"]'):
                entries = doc.xpath('//*[@class="torrent-panel"]/ul/li')
                entry_total = len(entries)
                if entry_total == 0:
                    return
                counters = doc.xpath(
                    '//*[@class="playInfo crumbs"]/div/a[@class="exp-num"]')
                if counters:
                    plays = self.str2num(counters[0].text)
                    NewsStorage.setclicknum(page_url,
                                            int(plays) / entry_total)
        elif params.step == FunComments.STEP_2:
            galleryid = params.customized['galleryid']
            payload = json.loads(params.content)
            # Compare the stored comment count with the one just fetched.
            site_total = int(payload['data']['total_num'])
            NewsStorage.setcmtnum(page_url, site_total)
            stored_total = CMTStorage.getcount(page_url, True)
            if stored_total >= site_total:
                return
            page_total = int(math.ceil(
                float(site_total - stored_total) / self.PAGE_SIZE))
            if page_total >= self.maxpages:
                page_total = self.maxpages
            # Page 1 is the current response; later pages are queued.
            if page_total >= 1:
                self.step3(params)
            for page_no in range(2, page_total + 1):
                next_url = FunComments.COMMENTS_URL % (galleryid, page_no)
                self.storeurl(next_url, page_url, FunComments.STEP_3,
                              {'galleryid': galleryid})
        elif params.step == FunComments.STEP_3:
            # Parse the comment data.
            self.step3(params)
    except:
        Logger.printexception()
def setclicknum_mytv(self, params):
    """Store the value that ``self.r.getid`` finds for ``count`` as click number."""
    NewsStorage.setclicknum(params.originalurl,
                            self.r.getid("count", params.content))
def setclicknum_tv(self, params):
    """Store the value that ``self.r.getid`` finds for ``total`` as click number.

    The lookup is made with ``split=':'``.
    """
    NewsStorage.setclicknum(
        params.originalurl,
        self.r.getid("total", params.content, split=':'))
def setclicknum_tv(self, proparam):
    """Parse ``proparam.content`` as JSON and store a fixed click number.

    NOTE(review): the parsed JSON is not used and the stored value is the
    constant 222 -- this looks like a placeholder; confirm which field
    should be read.
    """
    # The result is discarded, but malformed content still raises here.
    json.loads(proparam.content)
    NewsStorage.setclicknum(proparam.originalurl, 222)
def process(self, params):
    """Drive the Qidian comment crawl for one downloaded page.

    * ``STEP_1``: determine the book id (from the page for ``yuncheng``
      hosts, otherwise from ``params.url``), queue the first comment page,
      and for ``qidian.com/info/`` URLs store the click number read from the
      ``book-info`` block.
    * ``STEP_3``: read forum id and comment total, store the total, and when
      it exceeds the stored count handle page 1 via ``self.getcontents`` and
      queue the remaining pages (capped at ``self.maxpages``).
    * ``STEP_4``: delegate to ``self.getcontents``.

    Exceptions are logged through ``Logger.printexception`` and not re-raised.
    """
    try:
        # Steps are compared by value; identity ('is') only works by
        # accident of interning.
        if params.step == QidianComments.STEP_1:
            field = self.r.parse(r'^http://\w+\.(\w+)\.com*', params.url)[0]
            if field == 'yuncheng':
                html = etree.HTML(params.content)
                bookid = html.xpath(
                    '//div[@class="operatebtn"]/ul/li/a/@href')[0]
                bookid = self.r.parse(r'\d+', bookid)[0]
            else:
                bookid = self.r.parse(r'\d+', params.url)[0]
            comments_url = QidianComments.COMMENTS_URL % (bookid, '1')
            self.storeurl(comments_url, params.originalurl,
                          QidianComments.STEP_3,
                          {'bookid': bookid, 'pageno': '1'})
            if self.r.search(r'http[s]{0,1}://.*\.qidian\.com/info/.*',
                             params.originalurl):
                # Correct the click number from the book-info block.
                if self.r.search('class="book-info "', params.content):
                    soup = BeautifulSoup(params.content, 'html5lib')
                    clicknumstr = str(soup.select('div.book-info p em')[1])
                    # Strip the 4 leading and 5 trailing characters of the
                    # serialised tag ("<em>" / "</em>").
                    clicknumstr = clicknumstr[4:-5]
                    if self.r.search(r'\.', clicknumstr):
                        # A value with a decimal point gets the 万 unit
                        # appended before conversion.
                        clicknum = self.str2num(clicknumstr + '万')
                    else:
                        clicknum = clicknumstr
                    NewsStorage.setclicknum(params.originalurl, clicknum)
        elif params.step == QidianComments.STEP_3:
            threadid = int(
                self.r.parse(u"forumId = \\'(.*?)\\'", params.content)[0])
            comment_counts = self.r.parse(
                u'class="nav-tab.*act">全部(.*?)</a>',
                params.content)[0].strip()
            # Drop the first and last character around the number.
            comment_counts = int(comment_counts[1:-1])
            # Store the comment number.
            if comment_counts:
                NewsStorage.setcmtnum(params.originalurl, comment_counts)
            # Incremental check against what is already stored.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comment_counts:
                return
            page_num = int(math.ceil(
                float(comment_counts - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1):
                if page == 1:
                    # The first page is the response already in hand.
                    self.getcontents(params)
                    continue
                commentUrl = QidianComments.COMMENTS_URL_PAGE % (
                    threadid, page)
                self.storeurl(commentUrl, params.originalurl,
                              QidianComments.STEP_4)
        elif params.step == QidianComments.STEP_4:
            self.getcontents(params)
    except Exception:
        Logger.printexception()