def process(self, params):
    """Dispatch a crawl task to the handler matching params.step."""
    Logger.getlogging().info(params.url)
    try:
        if params.step == self.STEP_PAGES:
            self.step1(params)
        elif params.step == self.STEP_CMTS:
            self.step2(params)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except:
        # Bug fix: the original referenced Logger.printexception without
        # parentheses, so it was never called and exceptions went unlogged.
        Logger.printexception()
def dmzjvideo_step3(self, params):
    """Extract every comment from a dmzj video page and persist it."""
    page = BeautifulSoup(params.content, 'html5lib')
    for node in page.select('#comment_list_div > .online_anim_debate_mr'):
        try:
            posted = node.select_one('.anim_debate_mr_right_title').get_text()
            text = node.select_one('.anim_debate_mr_right_mr').get_text()
            CMTStorage.storecmt(params.originalurl, text,
                                TimeUtility.getuniformtime(posted), '')
        except:
            # A malformed comment node must not abort the whole page.
            Logger.printexception()
def getcomments_step1(self, params):
    """Build and queue the first comment-list POST for a zongheng book."""
    book_id = int(self.r.parse(
        '^http://pub\.zongheng\.com/book/(\d+).html$', params.url)[0])
    Logger.getlogging().debug(book_id)
    first_page_url = PubComments.COMMENTS_URL
    self.storeposturl(first_page_url, params.originalurl,
                      PubComments.STEP_2,
                      {'bookId': book_id, 'pageNum': '1'},
                      {'bookId': book_id})
def process(self, params): Logger.getlogging().info(params.url) try: # 初始化内部子类对象 self.createobject() # 论坛评论取得 if self.r.match('http://bbs.onlylady.com/.*', params.originalurl): # self.onlyladyBbs.process(params) # bbs获取评论调用共通方法,onlyladyBdsComments已测试通过 CommenComments(self).process(params) except Exception, e: traceback.print_exc()
def process(self, params):
    """Dispatch an iqiyi crawl task to the step handler for params.step."""
    try:
        # Bug fix: the first branch compared steps with 'is' (object
        # identity), which only works by accident for small interned
        # ints; compare with '==' like the other branches.
        if params.step == IqiyiComments.STEP_1:
            self.step1(params)
        elif params.step == IqiyiComments.STEP_2:
            self.step2(params)
        elif params.step == IqiyiComments.STEP_3:
            self.step3(params)
        elif params.step == IqiyiComments.STEP_PLAYCOUNT:
            self.geturlplaycount(params)
    except:
        Logger.printexception()
def query(self, info):
    """Issue the first search-results request for keyword *info*."""
    if self.post_url not in BBSS2PostQuery.post_urllist:
        Logger.debug('{}:Not in tasking'.format(self.post_url))
        return
    # GBK boards need the keyword transcoded before it is posted.
    if BBSS2PostQuery.isgbk_posturl(self.post_url):
        info = Common.trydecode(info)
    BBSS2PostQuery.POST_DATA['srchtxt'] = info
    self.queryinfo = info
    self.__storeqeuryurl__(self.post_url,
                           BBSS2PostQuery.S2QUERY_FIRST_PAGE,
                           BBSS2PostQuery.POST_DATA,
                           {'info': info})
def ls(self, host, port, username, pwd, lsPath): list = [] # 实例化SSHClient client = paramiko.SSHClient() # 自动添加策略,保存服务器的主机名和密钥信息 client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) # 连接SSH服务端,以用户名和密码进行认证 try: client.connect(host, port, username=username, password=pwd) except Exception, e: Logger.getlogging().error('ssh连接失败{Exception}:{error}'.format(Exception=Exception, error=e)) return list
def step1(self, params):
    """Derive the comment front page for a flash8 work and queue it."""
    Logger.getlogging().info("Flash8Comments.STEP_1")
    # The numeric document id is embedded in the original article url.
    doc_id = self.r.parse('^http://www\.flash8\.net\/flash\/(\d+)\.shtml',
                          params.originalurl)[0]
    front_page = ('http://www.flash8.net/newgbook/list_iframe.aspx'
                  '?nsort=flash&iid={docurl}&page=1').format(docurl=doc_id)
    self.storeurl(front_page, params.originalurl, Flash8Comments.STEP_2,
                  {'docurl': doc_id})
def step1(self, params):
    """Queue the first comment page for the news id found in params.url."""
    try:
        # The news id is the last run of 3+ digits in the url.
        news_id = self.r.parse('\d{3,}', params.url)[-1]
        first_url = self.COMMENTS_URL.format(self.pageno, self.page_size,
                                             news_id)
        self.storeurl(first_url, params.originalurl,
                      self.STEP_COMMENT_FIRST_PAGE,
                      {'newsId': news_id, 'pageno': self.pageno})
    except:
        Logger.printexception()
def process(self, params):
    """Route a crawl task to the handler registered for params.step."""
    try:
        # Dispatch table replaces the original if/elif ladder.
        handlers = {
            self.STEP_PAGES: self.step1,
            self.STEP_2: self.step2,
            self.STEP_3: self.step3,
            self.STEP_4: self.setplayinfo,
        }
        handler = handlers.get(params.step)
        if handler is not None:
            handler(params)
    except:
        Logger.printexception()
def rename(self, host, port, username, pwd, before, after): # 实例化SSHClient client = paramiko.SSHClient() # 自动添加策略,保存服务器的主机名和密钥信息 client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) # 连接SSH服务端,以用户名和密码进行认证 try: client.connect(host, port, username=username, password=pwd) # 打开一个Channel并执行命令 client.exec_command('mv ' + before + ' ' + after) except Exception, e: Logger.getlogging().error('ssh连接失败{Exception}:{error}'.format(Exception=Exception, error=e))
def step3(self, params):
    """Parse the toutiao comment JSON and store any unseen comments."""
    Logger.getlogging().info("ToutiaoNewsComments.STEP_3")
    payload = json.loads(params.content)
    if not payload:
        return
    if len(payload['data']) == 0:
        return
    for entry in payload['data']:
        text = entry['comment']['text']
        posted = TimeUtility.getuniformtime(entry['comment']['create_time'])
        # Skip comments already present in storage.
        if CMTStorage.exist(params.originalurl, text, posted, ''):
            continue
        CMTStorage.storecmt(params.originalurl, text, posted, '')
def step2(self, params):
    """Store every reply found in the comment-reply JSON payload."""
    replies = json.loads(params.content)['data']['replies']
    for reply in replies:
        try:
            message = reply['content']['message']
            created = reply['ctime']
            # Author is unknown in this feed; store with an empty user.
            CMTStorage.storecmt(params.originalurl, message, created, '')
        except:
            Logger.printexception()
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    """Download *targetFilePath* from a remote host into *localPath*.

    The file is fetched under a ``.tmp`` name and renamed only after the
    transfer completes, so readers never observe a half-written file.
    Returns True on success, False when the SSH connection fails.
    """
    Logger.getlogging().info('scp -P {port} {username}@{host}:{file} {path}'.format(port=port, username=username, host=host, file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if not ssh.connect():
        return False
    # Idiom fix: take the last path segment directly instead of the
    # original split/len(...)-1 indexing; remote paths are '/'-separated.
    fileName = targetFilePath.split('/')[-1]
    ssh.download(targetFilePath, localPath + fileName + '.tmp')
    ssh.close()
    FileUtility.move(localPath + fileName + '.tmp', localPath + fileName)
    return True
def step3_club(self, params):
    """Extract club comments embedded as JSON inside params.content."""
    raw = params.content
    # The JSON object may be wrapped in other text; slice from the first
    # '{' through the last '}' before decoding.
    payload = json.loads(raw[raw.index('{'):raw.rindex('}') + 1])
    for entry in payload['l']:
        try:
            posted = entry['cd']
            body_html = entry['nr']
            text = XPathUtility(body_html).getstring('//p')
            CMTStorage.storecmt(params.originalurl, text, posted, '')
        except:
            Logger.printexception()
def step1(self, params):
    """Build the comment front-page url for a dm5 manga and queue it."""
    Logger.getlogging().info("Dm5Commnets.STEP_1")
    # The manga slug is the path segment following 'manhua-'.
    slug = self.r.parse('^http://www\.dm5\.com/manhua-(.*)/',
                        params.originalurl)[0]
    front_page = 'http://www.dm5.com/manhua-{docurl}'.format(docurl=slug)
    self.storeurl(front_page, params.originalurl, Dm5Commnets.STEP_2_BBS,
                  {'docurl': slug})
def process(self, params):
    """Route a crawl task to the step handler matching params.step."""
    try:
        # Dispatch table built inside the try so attribute errors are
        # swallowed exactly as the original if/elif ladder's were.
        dispatch = {
            self.STEP_COUNT: self.step0,
            self.STEP_PAGES: self.step1,
            self.STEP_CMTS: self.step2,
            self.STEP_PLAY: self.getclick,
        }
        handler = dispatch.get(params.step)
        if handler is not None:
            handler(params)
    except:
        Logger.printexception()
def updatedb(self):
    """Purge spider records whose timestamp predates the valid period."""
    records = SpiderDao().getall()
    if not records:
        return
    cutoff = TimeUtility.getuniformdatebefore(
        SpiderConfigure.getinstance().getvalidperiod())
    expired = []
    for key, value in records.items():
        info = URLCommentInfo.fromstring(value)
        if info.timestamp < cutoff:
            Logger.getlogging().debug(value)
            expired.append(key)
    SpiderDao().remove(expired)
def getclick(self, params):
    """Read the <click> counter out of params.content into NewsStorage.

    Logs a no-xpath-value error when the page carries no counter.
    """
    # Bug fix: the original tested two *byte-identical* patterns in an
    # if/elif chain (the second branch was unreachable) and leaked a raw
    # debug `print` of the whole page body to stdout.
    pattern = '<click>(\d+)</click>'
    if self.r.search(pattern, params.content):
        click = self.r.parse(pattern, params.content)[0]
        NewsStorage.setclicknum(params.originalurl, int(click))
    else:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
def step3_yunqi(self, params):
    """Scrape the yunqi comment list and store each comment found."""
    page = BeautifulSoup(params.content, 'html5lib')
    for item in page.select('#commentList> li'):
        try:
            text = item.select_one('.textBox').get_text()
            posted = item.select_one('.userName > span').get_text()
            CMTStorage.storecmt(params.originalurl, text, posted, '')
        except:
            # One malformed list item must not abort the whole page.
            Logger.printexception()
def processVideo(self, params): try: if params.step is MofangComments.STEP_1: if not self.r.search('data-flag=\"(.*?)\">', params.content): return cmsid = self.r.parse('data-flag=\"(.*?)\">', params.content)[0] comments_url = MofangComments.COMMENTS_URL % (cmsid, '4') self.storeurl(comments_url, params.originalurl, MofangComments.STEP_2, { 'cmsid': cmsid, 'pagesize': '4' }) elif params.step is MofangComments.STEP_2: comments = json.loads(params.content) pagesize = comments['data']['total'] comments_url = MofangComments.COMMENTS_URL % ( params.customized['cmsid'], pagesize) self.storeurl(comments_url, params.originalurl, MofangComments.STEP_3, { 'cmsid': params.customized['cmsid'], 'pagesize': pagesize }) elif params.step is MofangComments.STEP_3: comments = json.loads(params.content) if params.customized['pagesize'] <> '0': pcontent = [] ptime = [] for key in range(0, int(params.customized['pagesize'])): ptime.append( TimeUtility.getuniformtime2( comments['data']['list'][key]['create_time'])) pcontent.append( comments['data']['list'][key]['html_content']) if ptime <> []: index = 0 comments = [] complete = False for comment in pcontent: cmti = CommentInfo() cmti.content = comment #只判断时间段为新增时间段的情况下,才写入增量list中 if URLStorage.storeupdatetime( params.originalurl, str(ptime[index])): comments.append(cmti) index += 1 else: #更新数据库时间 complete = True break self.commentstorage.store(params.originalurl, comments) except Exception, e: Logger.printexception()
def step1(self, params):
    """Build the changyan 'liteload' comment url for a page and queue it.

    Sohu pages are pre-screened via the on-page comment counter; other
    sites are screened via the 'no comments' prompt element. The changyan
    client_id is chosen per site family, falling back to the page's own
    'appid'.
    """
    if re.search('http://.*\.sohu\.com/', params.originalurl):
        # Sohu: read the visible comment counter to decide whether to
        # bother fetching comments at all.
        cmttext = XPathUtility(params.content).getstring('//*[@class="c-num-red"][2]|//*[@id="changyan_parti_unit"]|//*[@class="remark-tit"]')
        if cmttext:
            try:
                cmtnum = re.findall('\d+', cmttext)[0]
            except:
                cmtnum = -1  # counter text present but not numeric
        else:
            cmtnum = -1  # counter element missing: count unknown
        #cmtnum = NewsStorage.getcmtnum(params.originalurl)
        if int(cmtnum) == -1:
            pass  # unknown count: fall through and fetch anyway
        elif int(cmtnum) == 0:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
    else:
        # NOTE(review): reconstructed from a collapsed one-line source;
        # this 'else' is taken to pair with the outer sohu check —
        # non-sohu pages are screened via the 'no comments' prompt.
        cmttext = XPathUtility(params.content).xpath('//*[@class="prompt-null-w"]')
        if cmttext:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
    liteloadApi = ChangyanComments.liteloadApi
    commentsApi = ChangyanComments.commentsApi
    # Pick the changyan client_id for the site family.
    if re.match('http://\w+\.sohu\.com.*',params.originalurl):
        client_id = 'cyqemw6s1'
    elif re.match(r'^http://\w+\.(17173|shouyou|yeyou)\.com/.*',params.originalurl):
        client_id = 'cyqvqDTV5'
    elif re.match(r'^http://sports\.le\.com/.*', params.originalurl):
        client_id = 'cyrJ22d8v'
    # Special-case for zdface.com
    elif re.match(r'^http://\w+\.zdface\.com.*', params.originalurl):
        client_id = 'cyrJOywnM'
    # e.g. http://xx.yzz.cn/xiuba/201609/1017135.shtml
    elif re.match(r'^http://\w+\.yzz\.cn.*', params.originalurl):
        client_id = 'cyrtYf3sa'
    elif re.match(r'^http://\w+\.178\.com.*', params.originalurl):
        client_id = 'cysrntF12'
    elif re.match(r'^http://.*\.cyol\.com/.*', params.originalurl):
        client_id = 'cys3X3zo9'
    else:
        # Unknown site: scrape the appid straight off the page.
        client_id = self.r.getid('appid', params.content)
    topic_url = urllib.quote_plus(params.originalurl)
    #LITELOAD_URL = 'http://changyan.sohu.com/api/{liteloadApi}/topic/liteload?client_id={client_id}&topic_url={topic_url}&topic_source_id={topic_source_id}'
    # The topic source id lives under either 'sid' or 'data-widget-sid'.
    topic_source_id = self.r.getid('sid',params.content)
    if not topic_source_id:
        topic_source_id = self.r.getid('data-widget-sid', params.content)
    comment_url = ChangyanComments.LITELOAD_URL.format(liteloadApi=liteloadApi, client_id=client_id, topic_url=topic_url,
                                                       topic_source_id=topic_source_id)
    self.storeurl(comment_url, params.originalurl, ChangyanComments.STEP_2, {'client_id': client_id, 'liteloadApi':liteloadApi, 'topic_url':topic_url, 'commentsApi':commentsApi})
def step2_news(self, params):
    """Store one page of news comments and queue the next page.

    Reads paging context from params.customized, stores every comment on
    this page, records the total count on page 1, and stops when the
    oldest comment on the page is no longer new, or when either the
    site's page count or self.maxpages is reached.
    """
    objectid = params.customized['objectid']
    channel = params.customized['channel']
    # NOTE: 'type' shadows the builtin; kept for byte-identical behavior.
    type = params.customized['type']
    clienttype = params.customized['clienttype']
    key = params.customized['key']
    pageno = params.customized['pageno']
    content = params.content
    try:
        # The JSON object may be wrapped in other text; slice from the
        # first '{' through the last '}' before decoding.
        data = content[content.index('{'):content.rindex('}') + 1]
    except:
        return Logger.printexception()
    data = json.loads(data)
    datalist = data['list']
    if not datalist:
        return
    timelist = []
    for item in datalist:
        curtime = item['createTime']
        content = item['content']
        CMTStorage.storecmt(params.originalurl, content, curtime, '')
        timelist.append(TimeUtility.getuniformtime(curtime))
    curcmtnum = data['cnum']
    # Record the site-reported total only once, on the first page.
    if pageno == 1:
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    # Stop paging once the oldest comment on this page is already known.
    if not self.isnewesttime(params.originalurl, min(timelist)):
        return
    #dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    #pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.news_pagesize))
    pages = int(math.ceil(float(curcmtnum) / self.news_pagesize))
    if pageno >= self.maxpages or pageno >= pages:
        return
    # The next request is keyed off the last comment id on this page.
    lastcmtid = data['list'][-1]['id']
    pageno = pageno + 1
    comment_url = self.new_commonurl.format(objectid=objectid, channel=channel, type=type, clienttype=clienttype, key=key, pageno=pageno, lastcmtid=lastcmtid)
    self.storeurl(
        comment_url, params.originalurl, self.STEP_COMMENT_EACH_PAGE, {
            'objectid': objectid,
            'channel': channel,
            'type': type,
            'clienttype': clienttype,
            'key': key,
            'pageno': pageno
        })
def process(self, params): Logger.getlogging().info(params.url) try: # 初始化内部子类对象 self.createobject() # 论坛评论取得 if self.r.match('http://bbs.dm123.cn/.*', params.originalurl): self.dm123Bbs.process(params) # 新闻评论取得 elif self.r.match('http://www.dm123.cn/.*', params.originalurl): self.dm123News.process(params) except Exception, e: traceback.print_exc()
def step3(self, params):
    """Store every unseen comment from the xinhua comment JSON."""
    Logger.getlogging().info("xinhuaComments.STEP_3")
    payload = json.loads(params.content)
    for record in payload['contentAll']:
        posted = TimeUtility.getuniformtime(record['commentTime'])
        text = record['content']
        author = record['nickName']
        # Skip comments already present in storage.
        if not CMTStorage.exist(params.originalurl, text, posted, author):
            CMTStorage.storecmt(params.originalurl, text, posted, author)
def process(self, params):
    """Dispatch a baidu tieba task; ignore other baidu sub-domains."""
    # Extract the sub-domain from the original url; only tieba is handled.
    field = self.r.parse('^http[s]{0,1}://(\w+)\.baidu\.com.*',
                         params.originalurl)[0]
    if not field == 'tieba':
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
        return
    # Bug fix: the first step was compared with 'is' (object identity),
    # which only works by accident for small interned ints; use '=='.
    if params.step == BaiduTiebaComments.BAIDU_STEP1:
        self.getcomments_step1(params)
    elif params.step == BaiduTiebaComments.BAIDU_TIEBA_EACH_PAGE:
        self.getpagecomments_step2(params)
    elif params.step == BaiduTiebaComments.BAIDU_TIEBA_HUIFU_PAGE:
        self.get_comment_reply_step3(params)
def process(self, params):
    """Route a search-query step to the matching sub-query handler."""
    try:
        step = params.step
        # Deliberately independent 'if's (not elif), matching the original
        # behavior in case step constants overlap across the two classes.
        if step == DmzjS2Query.S2QUERY_FIRST_PAGE:
            self.getQuery.step2(params)
        if step == DmzjS2Query.S2QUERY_EACH_PAGE:
            self.getQuery.pageprocess(params)
        if step == BBSS2PostQuery.S2QUERY_FIRST_PAGE:
            self.postQuery.step1(params)
        if step == BBSS2PostQuery.S2QUERY_EACH_PAGE:
            self.postQuery.step2(params)
    except:
        Logger.printexception()
def process(self, params): try: if params.step is Kr36Comments.STEP_1: self.step1(params) elif params.step == Kr36Comments.STEP_2: self.step2(params) elif params.step == Kr36Comments.STEP_3: self.step3(params) else: Logger.getlogging().error('proparam.step == {step}'.format(step=params.step)) return except Exception,e: traceback.print_exc()
def step4bbs(self, params): Logger.getlogging().info("Dm5Commnets.STEP_4") id = params.customized['id'] # 获取所有的评论url hrefs = self.r.parse(ur'/tiezi-\d+-p(\d+)/"', params.content) hrefs = list(set(hrefs)) comment_url = Dm5Commnets.COMMENT_URL_PAGE.format(id=id) self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_5_BBS) for href in hrefs: comment_url = Dm5Commnets.COMMENT_URL_PAGE_2.format(id=id, page=href) self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_5_BBS)
def step1(self, params):
    """Queue the first DEFAULT_PAGES search-result urls for *key*."""
    key = params.customized['key']
    urls = [self.QUERY_TEMPLATE.format(key=key, page=page)
            for page in range(1, self.DEFAULT_PAGES + 1)]
    if urls:
        self.__storeqeuryurllist__(urls, self.EACH, {'key': key})
    else:
        Logger.getlogging().warning(
            '{url}:40000 No results'.format(url=params.originalurl))