def step1(self, params):
    """Mkzhan step 1: record the page's click count (if missing) and queue
    the first page of the comments API for this comic.

    params: crawler params object carrying .content (page html) and
    .originalurl (the comic page url).
    """
    Logger.getlogging().info("MkzhanComments.STEP_1")
    # Only store the click count when none has been recorded yet.
    if NewsStorage.getclicknum(params.originalurl) <= 0:
        if self.r.search('<span>人气:\s<b>(.*?)<\/b>', params.content):
            clicknum = self.r.parse('<span>人气:\s<b>(.*?)<\/b>', params.content)[0]
            NewsStorage.setclicknum(params.originalurl, clicknum)
    # Extract comic_id from the url. Guard with search() first so a
    # non-matching url cannot make parse(...)[0] raise IndexError
    # (the original checked comic_id only AFTER indexing).
    if not self.r.search(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*', params.originalurl):
        return
    comic_id = int(
        self.r.parse(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*', params.originalurl)[0])
    if not comic_id:
        return
    # Queue the first comments page (page 1, default page size).
    comments_url = MkzhanComments.COMMENTS_URL % (comic_id, 1, self.PAGE_SIZE)
    self.storeurl(comments_url, params.originalurl, MkzhanComments.STEP_2,
                  {'comic_id': comic_id})
def common_step1(self, params):
    """163.com common step 1: queue the comments API url; when the click
    count is missing, also queue a click-count url built from the page's
    productKey and docId.

    params: crawler params with .content (html), .url (request url) and
    .originalurl (article url).
    """
    Logger.getlogging().info(params.originalurl)
    try:
        # field is the channel sub-domain, e.g. "news" from news.163.com
        field = params.originalurl.split('//')[-1].split('.163')[0].split('.')[-1]
        sid = params.originalurl.split('/')[-1].split('.')[0]
        productkey = self.r.getid('productKey', params.content)
        commentinfo_url = Comments163.COMMENTS_URL.format(
            field=field, sid=sid, productkey=productkey,
            itemnum=0, itemlimit=self.limit)
        self.storeurl(commentinfo_url, params.originalurl, Comments163.STEP_2,
                      {'field': field, 'sid': sid, 'productkey': productkey})
    except:
        Logger.printexception()
    # Nothing more to do when a click count is already stored.
    if NewsStorage.getclicknum(params.originalurl) > 0:
        return
    # Re-derive field from the request url (comment.xxx.163.com or xxx.163.com).
    if self.r.search('^http[s]{0,1}://comment\.(\w+)\..*', params.url):
        field = self.r.parse('^http[s]{0,1}://comment\.(\w+)\..*', params.url)[0]
    else:
        field = self.r.parse('^http[s]{0,1}://(\w+)\..*', params.url)[0]
    # Several sub-domains share the news/sports comment channels.
    if field == 'discovery' or field == 'data' or field == 'view':
        field = 'news'
    if field == 'cai':
        field = 'sports'
    # Repeated patterns hoisted; the strings are byte-identical to the originals.
    productkey_pattern = '[\s\'\"]{1}productKey[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}'
    docid_pattern = '[\s\'\"]{1}docId[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}'
    if field != 'gongyi':
        if not self.r.search(productkey_pattern, params.content):
            Logger.getlogging().warning('{url} Errorcode:40000 No productKey'.format(url=params.url))
            return
        productKey = self.r.parse(productkey_pattern, params.content)[0]
        if not self.r.search(docid_pattern, params.content):
            Logger.getlogging().warning('{url} Errorcode:40000 No docId'.format(url=params.url))
            return
        docId = self.r.parse(docid_pattern, params.content)[0]
    else:
        if self.r.search(productkey_pattern, params.content):
            productKey = self.r.parse(productkey_pattern, params.content)[0]
        else:
            # This literal had been split mid-string by extraction; restored
            # to a single line. Falls back to a hard-coded gongyi productKey.
            Logger.getlogging().warning('{url}:40000 No productKey'.format(url=params.url))
            productKey = 'a2869674571f77b5a0867c3d71db5856'
        if self.r.search(docid_pattern, params.content):
            docId = self.r.parse(docid_pattern, params.content)[0]
        elif self.r.search('^http[s]{0,1}://.*\.163\.com/.*/(\w+).html', params.originalurl):
            docId = self.r.parse('/(\w+).html', params.originalurl)[0]
        else:
            Logger.getlogging().warning('{url}:40000 No docId'.format(url=params.url))
            return
    clickurl = self.CLICKNUM_URL.format(key=productKey, docid=docId)
    self.storeurl(clickurl, params.originalurl, self.STEP_4)
def step2(self, params):
    """Sina step 2: parse the first comments API response, record the
    publish date / click count as appropriate, then queue the remaining
    comment pages for incremental extraction.

    params: crawler params with .content (json payload), .originalurl and
    .customized carrying 'newsid', 'channel' and 'group'.
    """
    newsid = params.customized['newsid']
    channel = params.customized['channel']
    group = params.customized['group']
    comments = json.loads(params.content)
    if not self.isvalid(comments):
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    if self.r.search('http[s]{0,1}://.*video\.sina\.com.*', params.originalurl):
        # Video pages carry their publish time inside the comments payload.
        publishdate = comments['result']['news']['time']
        NewsStorage.setpublishdate(params.originalurl,
                                   TimeUtility.getuniformtime(publishdate))
    elif self.r.search('http[s]{0,1}://.*\.sina\.com.*', params.originalurl):
        # News pages: best-effort click count from the payload.
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            try:
                NewsStorage.setclicknum(params.originalurl,
                                        comments['result']['count']['total'])
            except:
                Logger.printexception()
    comments_count = int(comments['result']['count']['show'])
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= comments_count:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Only fetch the delta, capped at maxpages.
    pages = int(math.ceil(float(comments_count - stored) / self.DEFAULT_PAGE_SIE))
    pages = min(pages, self.maxpages)
    for page in range(1, pages + 1):
        if page == 1:
            # The first page's content is already in hand; extract directly.
            self.step3(params)
            continue
        url = CommonComments.SINA_COMMENTS_URL.format(channel=channel, newsid=newsid,
                                                      pn=page, ps=SinaComments.DEFAULT_PAGE_SIE)
        if group:
            url = url + '&group=' + group
        self.storeurl(url, params.originalurl, SinaComments.STEP_COMMENT_NEXT_PAGE)
def process(self, params): Logger.getlogging().info(params.url) try: if params.step is poocgNewsComments.STEP_1: #Step1: 通过得到docurl,得到获取评论的首页url参数。 articleId = self.r.parse('^http[s]?://www\.poocg\.com/works/view/(\d+)', params.originalurl)[0] # 取得总件数 comment_count = float(self.r.parse(ur'<p><strong>(\d+)</strong><span>评论</span></p>', params.content)[0]) NewsStorage.setcmtnum(params.originalurl, int(comment_count)) if comment_count == 0: return # 判断增量 cmtnum = CMTStorage.getcount(params.originalurl, True) if cmtnum >= comment_count: return # 获取页数 page_num = int(math.ceil(float(comment_count - cmtnum) / poocgNewsComments.PAGE_SIZE)) if page_num >= self.maxpages: page_num = self.maxpages # 获得url列表 for page in range(1, page_num + 1, 1): url = poocgNewsComments.COMMENT_URL % (articleId, page) self.storeurl(url, params.originalurl, poocgNewsComments.STEP_3) if NewsStorage.getclicknum(params.originalurl) <= 0: clicknum = int(self.r.parse(ur'<p><strong>(\d+)</strong><span>浏览</span></p>', params.content)[0]) NewsStorage.setpublishdate(params.originalurl, clicknum) if NewsStorage.getfansnum(params.originalurl) <= 0: fansnum = int(self.r.parse(ur'<p><strong>(\d+)</strong><span>喜欢</span></p>', params.content)[0]) NewsStorage.setpublishdate(params.originalurl, fansnum) publishdate = str(self.r.parse(ur'<p.*class="signed">(.*?)</p>', params.content)[0]) NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(publishdate)) elif params.step == poocgNewsComments.STEP_3: # Step3: 通过Step2设置的url,得到所有评论,抽取评论 Logger.getlogging().info("params.step == 3") xparser = XPathUtility(params.content) # 取得所有评论 soup = BeautifulSoup(params.content, 'html.parser') comments = soup.select('.p2') nicks = soup.select('.name') # 取得所有评论时间 times = soup.select('.contentbox .time') commentsInfo = [] # 取得所有评论 for index in range(0, int(len(comments)), 1): # 提取时间 # year = TimeUtility.getcurrentdate()[0:4] # publictime= year + '年' + commenttimes[index].text try: if len(times)>0: publictime = 
times[index].get_text() curtime = TimeUtility.getuniformtime(publictime) else: curtime = '' except: curtime ='' content = comments[index].get_text() try: nick = str(nicks[index].get_text()) except: nick = 'nickname' if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) # if URLStorage.storeupdatetime(params.originalurl, tm): # cmti = CommentInfo() # cmti.content = comments[index].get_text() # commentsInfo.append(cmti) # # # 保存获取的评论 # if len(commentsInfo) > 0: # self.commentstorage.store(params.originalurl, commentsInfo) else: Logger.getlogging().error('proparam.step == {step}'.format(step = params.step)) except Exception,e: traceback.print_exc()
def step1(self, params):
    """Sohu step 1: queue the comments first-page url; when the click count
    is missing, also queue a click-count API url; for my.tv pages, record
    the upload time as the publish date.

    params: crawler params with .content (html), .url and .originalurl.
    """
    try:
        comment_source_url = ''
        if self.r.search('http[s]{0,1}://.*tv\.sohu\.com.*', params.originalurl):
            # The topic_source_id lives in different page fields depending
            # on which kind of tv.sohu.com url this is.
            if self.r.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml', params.originalurl):
                matches = self.r.parse('var[\s]*vid[\s]*=[\s]*\"(.+?)\"', params.content)
                if not matches:
                    Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
                    return
                topic_source_id = matches[0]
            elif self.r.search('^http://my\.tv\.sohu\.com/.*.shtml', params.originalurl):
                topic_source_id = 'bk' + self.r.parse('\d{1,}', params.originalurl)[-1]
            else:
                playlist = self.r.getid('PLAYLIST_ID', params.content)
                if not playlist:
                    playlist = self.r.getid('playlistId', params.content)
                if not playlist:
                    Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
                    return
                topic_source_id = 'vp' + playlist
            comment_source_url = self.TV_COMMENTS_SOURCE_URL.format(
                self.tv_client_id, params.originalurl, topic_source_id, self.tv_page_size)
        elif self.r.parse('group', params.originalurl):
            gid = self.r.parse('http[s]{0,1}://.*\.sohu\.com/group-(\d+)\.shtml.*',
                               params.originalurl)[0]
            comment_source_url = self.COMMENTS_SOURCE_URL.format(
                self.client_id, self.group_mark + gid, self.page_size)
        else:
            nid = self.r.parse('http[s]{0,1}://.*\.sohu\.com/\d{8}/n(\d+)\.shtml.*',
                               params.originalurl)[0]
            comment_source_url = self.COMMENTS_SOURCE_URL.format(
                self.client_id, nid, self.page_size)
        self.storeurl(comment_source_url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
        #http://tv.sohu.com/20170831/n600133376.shtml
        #http://tv.sohu.com/s2015/newslist/?vid=4016103 暂无法取得
        # Fall back to the click-count API when xpath captured nothing.
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            if re.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml', params.originalurl):
                vid = self.r.getid('vid', params.content, split='=')
                self.storeurl(self.TVCLICKURL.format(vid=vid),
                              params.originalurl, self.STEP_TVCLICK)
            elif re.search('^http://tv\.sohu\.com/.*vid=(\d+)', params.originalurl):
                vid = self.r.parse('^http://tv\.sohu\.com/.*vid=(\d+)', params.originalurl)[0]
                self.storeurl(self.TVCLICKURL.format(vid=vid),
                              params.originalurl, self.STEP_TVCLICK)
            elif re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$', params.originalurl):
                myvid = params.originalurl.split('/')[-1].split('.')[0]
                self.storeurl(self.MYTVCLICKURL.format(vid=myvid),
                              params.originalurl, self.STEP_MYTVCLICK)
        # my.tv pages expose their upload time in the page source.
        if re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$', params.originalurl):
            if not params.content:
                Logger.getlogging().debug("no params.content")
            if not self.r.search('uploadTime: \'(.*)?\'', params.content):
                Logger.getlogging().debug("no params.content uploadTime")
            if self.r.search('uploadTime: \'(.*)?\'', params.content):
                publishdate = self.r.parse('uploadTime: \'(.*)?\'', params.content)[0]
                NewsStorage.setpublishdate(params.originalurl,
                                           TimeUtility.getuniformtime(publishdate))
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))