def step2(self, params):
    info = Common.urldec(params.customized['info'])
    soup = BeautifulSoup(params.content, 'html5lib')
    text_divs = soup.select('.s_r_txt')
    urllist = []
    if text_divs:
        for item in text_divs:
            title = item.select_one('h3 > a').get_text()
            url = item.select_one('h3 > a').get('href')
            curtime = item.select('p')[-1].get_text().strip()
            try:
                if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                          self.querylastdays):
                    if Common.checktitle(info, title):
                        urllist.append(url)
                    else:
                        Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                urllist.append(url)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def getclick(self, params):
    pattern = 'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
    if re.search(pattern, params.originalurl):
        if self.r.search(pattern, params.originalurl):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl, LeComments.STEP_PALY)
    if NewsStorage.getpublishdate(params.originalurl) == TimeUtility.getintformtime(0):
        if self.r.search('https?://sports\.le\.com/video/\d+\.html', params.originalurl):
            # Extract the publish time only for the sports channel
            pubTime = XPathUtility(params.content).getstring('//*[@class="live-vedio-infor"]')
            publishdate = TimeUtility.getuniformtime(pubTime)
            NewsStorage.setpublishdate(params.originalurl, publishdate)
        else:
            # Extract the publish time only for the variety-show channel
            title = XPathUtility(params.content).getstring('//h1[@class="j-video-name video-name"]')
            if title:
                if re.search('\d{8}', title):
                    publishdate = re.findall('\d{8}', title)[0]
                    NewsStorage.setpublishdate(params.originalurl, publishdate)
def step3(self, params): """通过评论的url获取评论""" #相对之前的版本,本次更新变动: #comments存储的接口为CMTStorage.storecmt(),参数为originalurl, 评论内容, 评论发布时间, 用户 #存储的内容增加了 评论发布时间, 用户 try: jsondata = json.loads(params.content) if jsondata['comments']: for comment in jsondata['comments']: content = comment['content'] curtime = TimeUtility.getuniformtime( comment['create_time']) nick = comment['passport']['nickname'] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) reply = comment['comments'] while reply: for comment in comment['comments']: content = comment['content'] curtime = TimeUtility.getuniformtime( comment['create_time']) nick = comment['passport'].get( 'nickname', 'anonymous') if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) reply = comment['comments'] except: Logger.printexception() Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                       TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(0, len(titles), 1):
        # The title must contain the query keyword
        # if titles[index].find(info) > -1:
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(pubtimestr,
                                                 TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # The publish time must fall within the configured period
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time, so once one entry falls outside the
                # period, all following entries will too.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def analysis(self, line, post=False):
    param = ProcessParam()
    js = json.loads(line)
    param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
    param.url = Common.urldec(js['foundin'])
    param.content = js['html']
    if post:
        param.data = js['data']
    if js['html'][:3] == constant.GZIP_CODE:
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    if 'property' in js:
        for property in js['property']:
            if not property.has_key('result'):
                continue
            if property['property_name'] == u'page_body':
                param.page_body = Common.trydecode(
                    Common.urldec(property['result'][0]['text']),
                    constant.CHARSET_GBK)
            elif property['property_name'] == u'page_title':
                param.page_title = Common.trydecode(
                    Common.urldec(property['result'][0]['text']),
                    constant.CHARSET_GBK)
            elif property['property_name'] == u'html_time':
                param.html_time = TimeUtility.getuniformtime2(property['result'][0]['text'])
    return param
def waibuetl(self):
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)
    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning('{waibufile} not generate!'.format(waibufile=waibufile))
        return
    outtime = 0
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info('sleeping {sec}s......'.format(sec=self.waitingperiod))
            #time.sleep(self.waitingperiod)
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except:
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
                continueflag = False
                break
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
            #    FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(sec=outtime))
                continueflag = False
                break
def __init__(self):
    self.database = SpiderDao()
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    self.difffile = '{path}/{dt}/{file}'.format(
        path=SpiderConfigure.getinstance().getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                     const.SPIDER_OUTPUT_PATH),
        dt=TimeUtility.getcurrentdate(),
        file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix, ts=ts))
def seturlinfos(params):
    id = NewsStorage.getid(params.url)
    if NewsStorage.exist(params.url):
        doc = NewsStorage.getdoc(params.url)
        data = {}
        #data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
        data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(params.title)
        if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
            data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
        if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                   TimeUtility.getintformtime(0)) == TimeUtility.getintformtime(0):
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = TimeUtility.getuniformtime(params.pubtime)
        data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
        data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
        data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
        data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
        data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                    data)
    else:
        data = {}
        data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
        data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(params.title)
        if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
            data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
        data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = TimeUtility.getuniformtime(params.pubtime)
        data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
        data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
        data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
        data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
        data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
        data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
        data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
        data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
        data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
        data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data))
def step3(self, params):
    jsondata = json.loads(params.content)
    comments = []
    for comment in jsondata:
        cmti = CommentInfo()
        curcomtime = int(comment['created'])
        # Update the latest crawled-comment time if necessary; the first
        # comment's time is the newest one
        if URLStorage.storeupdatetime(params.originalurl,
                                      TimeUtility.getuniformdate2(curcomtime)):
            cmti.content = comment['contents']
            comments.append(cmti)
        # Check whether the comment has replies
        if int(comment['comment_reply_total']) > 0:
            reply = comment['reply']
            # Collect all replies
            for num in range(0, int(comment['comment_reply_total']), 1):
                recmti = CommentInfo()
                recmti.content = reply[num]['contents']
                comments.append(recmti)
    if len(comments) > 0:
        # Store the collected comments
        self.commentstorage.store(params.originalurl, comments)
def getid(url, content, pubdate, user):
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    return Common.md5(Common.urlenc(url) + Common.urlenc(content) + pubdate +
                      Common.urlenc(user))
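# Illustration (not part of the storage class): the id above is an MD5 over the
# URL-encoded url, content and user plus the normalised publish time, so the
# same comment always hashes to the same key and duplicates are filtered out.
# A standalone sketch with the standard library, assuming Common.urlenc and
# Common.md5 behave like urllib.quote and hashlib.md5:
#
#   import hashlib, urllib
#   def comment_id(url, content, pubdate, user):
#       key = urllib.quote(url) + urllib.quote(content) + pubdate + urllib.quote(user)
#       return hashlib.md5(key).hexdigest()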
def step3news(self, params): Logger.getlogging().info("ZolbbsComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="commli"]/p') commentstime = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="published-time"]') commentsnick = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="user-name"]') # 获取评论,设置实际的评论量 for index in range(0, len(commentstime), 1): # 提取时间 tm = commentstime[index].strip() try: curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M') except Exception, e: curtime = getuniformtime(tm) # 提取评论内容 content = commentsinfo[index] nick = commentsnick[index] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(path=path,
                                                                taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is jiemianComments.STEP_1:
            # Get the article id from the url
            articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)',
                                   proparam.url).__getitem__(0)
            # Set the click count
            self.setclick(proparam)
            # Get the comment count
            comments_count = float(re.findall(r'"comment_count">(\d+)</span>',
                                              proparam.content).__getitem__(0))
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            # Stop if there are no comments
            if int(comments_count) == 0:
                return
            # Incremental check against the comments already stored
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # Queue the comment page urls
            for page in range(1, page_num + 1, 1):
                url = jiemianComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
        elif proparam.step == jiemianComments.STEP_3:
            # proparam.content = proparam.content.replace('\\','')
            # soup = BeautifulSoup(proparam.content, 'html5lib')
            # items = soup.select('.comment-post')
            # for item in items:
            #     content = item.select_one('.comment-main > p').get_text().encode('utf-8')
            #     curtime = TimeUtility.getuniformtime(item.select_one('.date').get_text())
            #     nick = item.select_one('.author-name').get_text().decode('utf-8').encode('utf-8')
            # Get the vote (like) count
            votenum = self.r.getid('ding', proparam.content)
            if votenum == '':
                Logger.getlogging().debug("Unable to get votenum")
            else:
                NewsStorage.setvotenum(proparam.originalurl, votenum)
            # Extract the comments with regular expressions
            comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
            ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
            nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
            # Store the comments
            for index in range(0, len(comments)):
                tm = ctime[index].replace('\\', '')
                curtime = TimeUtility.getuniformtime(tm)
                content = eval('u"' + comments[index] + '"').encode('utf-8')
                nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
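# Worked example of the paging arithmetic above (hypothetical numbers): with
# comments_count = 95, cmtnum = 10 already stored and PAGE_SIZE = 30,
# page_num = ceil((95 - 10) / 30.0) = ceil(2.83) = 3, so comment pages 1..3 are
# queued; page_num is then capped at self.maxpages.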
def step3bbs(self, params): Logger.getlogging().info("JoyComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 try: commentsinfo = json.loads(params.content) commentsinfo['result']['mainreplys']['rows'] except: Logger.getlogging().warning( '{url} Errorcode:40000'.format(url=params.originalurl)) Logger.printexception() return # 获取评论 for index in range( 0, int(len(commentsinfo['result']['mainreplys']['rows'])), 1): # 提取时间 # cmti = CommentInfo() content = commentsinfo['result']['mainreplys']['rows'][index][ 'reply']['reply']['body']['text'] curtime = TimeUtility.getuniformtime( str(commentsinfo['result']['mainreplys']['rows'][index] ['reply']['reply']['post_time'])) nick = commentsinfo['result']['mainreplys']['rows'][index][ 'reply']['user']['name'] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def __init__(self):
    self.factory = SiteFactory()
    self.conf = SpiderConfigure.getinstance()
    self.urlbackuppath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
    self.period = int(SpiderConfigure.getinstance().getlastdays())
def __init__(self):
    self.id = ''
    # query
    self.query = SpiderConfigure.getinstance().getquery()
    # channel
    self.channel = SpiderConfigure.getinstance().getchannel()
    # type
    self.type = ''
    # URL
    self.url = ''
    # title
    self.title = ''
    # body / original post
    self.body = ''
    # comment (content) / reply (content)
    # comment count
    self.cmtnum = -1
    # read count / play count (incremental)
    self.clicknum = -1
    # vote (like) count
    self.votenum = -1
    # fan count / subscriber count
    self.fansnum = -1
    # publish time
    self.pubtime = TimeUtility.getintformtime(0)
    # createtime
    self.createtime = SpiderConfigure.getinstance().starttime()
def storecmt(url, content, pubdate, user):
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    if not CMTStorage.exist(url, content, pubdate, user):
        Logger.getlogging().debug(
            'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.format(
                url=url, content=content, pubdate=pubdate, user=user))
        id = CMTStorage.getid(url, content, pubdate, user)
        data = {
            SQLDAO.SPIDER_TABLE_COMMENTS_ID: id,
            SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
            SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
            SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
            SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
            SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        SQLDAO.getinstance().insert(
            SQLDAO.SPIDER_TABLE_COMMENTS,
            SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
            SQLDAO.getvaluesfromkeys(data, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
def flush():
    # dump s1 download failed url
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump none url got from website for query
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1
    #
    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query],
                                    True)
        else:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum,
                                    True)
    #
    # report
    filename = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_INFO_REPORT_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(filename)
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL',
        query='QUERY',
        type='TYPE',
        v1='UPLOAD',
        v2='DOWNLOAD',
        v3='NO_TEMPLATE',
        v4='NO_SITE',
        v5='WITH_CMT',
        v6='FAILED'))
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(
            rate=Common.float2percent(rate),
            th=Common.float2percent(threshold))
        SpiderNotify.notify(param)
def bbs_step3(self, params):
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']
        comments = []
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        for times in mid_times:
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        if page == 0:
            mid_index = 1
        elif page > 0:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents), 1):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                nick = nicks[index].split('于')[0].split('☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception, e:
        traceback.print_exc()
def step2(self, params):
    keyword = params.customized['keyword']
    query = Common.urldec(keyword)
    jsondata = json.loads(params.content)
    # Get the paged result html
    html = jsondata['html']
    soup = bs(html, 'html5lib')
    videoUrlList = []
    videoList = soup.select('li.video')
    for video in videoList:
        try:
            videoUrl = 'https:' + video.select_one('a').get('href')
            videoUrl = videoUrl.split('?')[0] + '/'
            title = video.select_one('a').get('title')
            pubtime = video.find(attrs={'class': 'so-icon time'}).get_text().strip()
            if self.compareNow(TimeUtility.getuniformtime(pubtime)):
                if self.checktitle(query, title):
                    videoUrlList.append(videoUrl)
                    self.__storeurl__(videoUrl, pubtime, SPIDER_S2_WEBSITE_VIDEO)
                else:
                    Logger.log(videoUrl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(videoUrl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        except:
            Logger.printexception()
def geturlcomments(self, params):
    xparser = XPathUtility(params.content)
    # Fetch all comments
    page = params.customized['page']
    if page == 1:
        commentstimes = xparser.getcomments('//table[position()>1]/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table[position()>1]/tbody/tr[2]/td[@class="post-main"]')
        commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    else:
        commentstimes = xparser.getcomments('//table/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table/tbody/tr[2]/td[@class="post-main"]')
        commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    # Store the actual comments
    for index in range(0, len(commentscontents), 1):
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        # Extract the comment content
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def geturlcomments(self, proparam):
    # soup = BeautifulSoup(proparam.content, 'html5lib')
    # lis = soup.select('.comment-say')
    # for li in lis:
    #     content = li.select_one('.des').get_text()
    #     curtime = li.select_one('.time').get_text()
    #     nick = li.select_one('.name replyName').get_text()
    #     if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
    #         CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
    # Extract the comments with regular expressions
    comments = re.findall(r'content":"(.+?)","paragraph_id"', proparam.content)
    commentsTime = self.r.parse(r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
    nicks = self.r.parse(r'"nickname":"(.*?)","is_hot"', proparam.content)
    # Store the comments
    index = 0
    for comment in comments:
        comment = eval('u"' + comment + '"')
        content = comment.encode('utf-8')
        curtime = TimeUtility.getuniformtime(commentsTime[index])
        nick = eval('u"' + nicks[index] + '"')
        nick = nick.encode('utf-8')
        if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
            CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        index = index + 1
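# The eval('u"' + ... + '"') calls above turn \uXXXX escapes in the raw payload
# into unicode by executing them as Python source.  A hypothetical helper that
# does the same decode without eval, assuming the captured fragments only
# contain standard JSON string escapes:
def decodefragment(self, fragment):
    # '\uXXXX' sequences become unicode characters; escaped slashes '\/' become '/'
    return fragment.decode('unicode_escape').replace('\\/', '/')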
def analysis(self, line, method):
    try:
        js = json.loads(line)
        param = ProcessParam()
        param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
        param.url = Common.urldec(js['foundin'])
        param.content = js['html']
        if method == constant.REQUEST_TYPE_POST:
            param.data = js['data']
        if js['html'][:3] == constant.GZIP_CODE:
            param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
        # decode
        content = Common.urldec(param.content)
        charset = RegexUtility.getid('charset', content)
        content = Common.trydecode(content, charset)
        param.content = content
        return param
    except:
        line = line.replace('\n', '').strip()
        if not line or line[0] == '#':
            return
        Logger.getlogging().debug(line)
        param = ProcessParam()
        param.url = line
        if method == constant.REQUEST_TYPE_POST:
            js = json.loads(line)
            param.url = js['url']
            param.data = js['data']
        param.content = HttpCacher.getcontent(line, method)
        if param.content is None:
            return
        return param
def get_url_id(self, params): """只适用在腾讯视频的部分""" "cid是电视剧\合集\电影,vid单集" CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+).html' CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}' VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+).html' VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html' VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}' if self.r.search(CID_PATTERN, params.originalurl): cid = self.r.parse(CID_PATTERN, params.originalurl)[0] url = CID_URL.format(cid=cid) self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE) elif self.r.search(VID_PATTERN1, params.originalurl): vid = self.r.parse(VID_PATTERN1, params.originalurl)[0] url = VID_URL.format(vid=vid) self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE) elif self.r.search(VID_PATTERN2, params.originalurl): vid = self.r.parse(VID_PATTERN2, params.originalurl)[0] url = VID_URL.format(vid=vid) self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE) #publish_date publish_date = self.r.getid('publish_date', params.content, split=':') if not publish_date: publish_date = XPathUtility(params.content).getstring( '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]' ) publish_date = TimeUtility.getuniformtime(publish_date) if publish_date: NewsStorage.setpublishdate(params.originalurl, publish_date) self.setclick(params)
def step3_ebook(self, params):
    try:
        jsoncontent = json.loads(params.content)
        if not jsoncontent.has_key('data'):
            return
        html = jsoncontent['data']['listHtml']
        if not html:
            return
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.select('div.cf')
        if not divs:
            return
        for div in divs:
            # commentList > dl:nth-child(1) > div.cf > dd > p:nth-child(2)
            content = div.select('dd > p')[1].get_text()
            curtime = TimeUtility.getuniformtime(
                div.select('dd > p')[0].get_text().split('|')[-1])
            nick = div.select('dd > p')[0].get_text().split('|')[0]
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception, e:
        Logger.printexception()
def step2(self, params): """""" print params.content try: jsondata = json.loads(params.content) comments_total = int(jsondata['comments_total']) comments_data = jsondata['comments'] except: Logger.getlogging().warning( '{url}:30000 No comments'.format(url=params.originalurl)) return #cmtnum = URLStorage.getcmtnum(params.originalurl) #if cmtnum >= comments_total: #return #URLStorage.setcmtnum(params.originalurl, comments_total) comments = [] for comment in comments_data: cmti = CommentInfo() cmti.content = comment['txtcontent'] tm = comment['addtime'] if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) if len(comments) > 0: # 保存获取的评论 self.commentstorage.store(params.originalurl, comments) self.post_data['p'] = str(int(self.data['p'] + self.page_size)) self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S') self.storeposturl(self.post_url, params.originalurl, self.STEP_2, self.post_data)
def loop(self):
    # Loop over the URLs, covering both S1 and S2
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            # check time out
            if self.istimeout():
                param = NotifyParam()
                param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                SpiderNotify.notify(param)
                continueflag = False
                break
            if self.downloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.downloader.download()
                self.upload()
                if len(downloadfiles) > 0:
                    break
                else:
                    Logger.getlogging().info('sleeping {0}s......'.format(self.waitingperiod))
                    time.sleep(self.waitingperiod)
            except:
                Logger.printexception()
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.processfile(dfile)
            logstring = 'PROCESSFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
            if self.istimeout():
                param = NotifyParam()
                param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                SpiderNotify.notify(param)
                continueflag = False
                break
            self.upload()
def dmzjnews_step3(self, params):
    params.content = params.content[params.content.index('['):params.content.rindex(']') + 1]
    commentsinfo = json.loads(params.content)
    for index in range(0, len(commentsinfo), 1):
        # Extract the comment content and publish time
        content = commentsinfo[index]['content']
        curtime = TimeUtility.getuniformtime(commentsinfo[index]['create_time'])
        CMTStorage.storecmt(params.originalurl, content, curtime, '')
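# The index/rindex slice above strips a JSONP-style wrapper so json.loads only
# sees the array.  Hypothetical illustration:
#   'callback([{"content": "hi", "create_time": "2017-01-01"}])'
#   -> '[{"content": "hi", "create_time": "2017-01-01"}]'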
def process_book(self, params):
    try:
        if params.step == Comments.STEP_1:
            # Get the parameter used to build the comment url from the article url
            bookId = self.r.parse('^http://www\.17k\.com/book/(\w+).html$', params.originalurl)[0]
            # Build the first comment page url
            comments_url = Comments.COMMENTS_URL % (bookId, 1, Comments.PAGE_SIZE)
            # Ask the download platform to fetch the first comment page
            self.storeurl(comments_url, params.originalurl, Comments.STEP_2, {'bookId': bookId})
        # Parse the first comment page and queue the urls of all comment pages
        elif params.step == Comments.STEP_2:
            bookId = params.customized['bookId']
            # The comment payload is JSON
            comments = json.loads(params.content)
            comments_count = int(comments['page']['count'])
            # Incremental check against the comments already stored
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            # Last publish time of the stored comments
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)
            # Number of comment pages
            page_count = int(comments['page']['pagecount'])
            if page_count == 0:
                return
            if page_count >= self.maxpages:
                page_count = self.maxpages
            # Queue every comment page url on the download platform
            for page in range(1, page_count + 1, 1):
                commentUrl = Comments.COMMENTS_URL % (bookId, page, Comments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl, Comments.STEP_3, {'bookId': bookId})
        # Parse the comment data
        elif params.step == Comments.STEP_3:
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['page']['result']:
                curtime = TimeUtility.getuniformtime(comment['creationDate'])
                content = comment['summary']
                nick = comment['marks']['nikeName']
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception, e:
        traceback.print_exc()
def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            aid = re.findall("\d+", params.url.split("/")[-1])[0]
            aid_url = AllComments.AID_URL % (aid)
            self.storeurl(aid_url, params.originalurl, AllComments.STEP_2, {'aid': aid})
        elif params.step is AllComments.STEP_2:
            cms_id = re.findall('appidArr \= \[\"cms\|(.+?)",', str(params.content))[0]
            cms_url = AllComments.KEYID_URL % (cms_id, params.customized['aid'], params.originalurl)
            self.storeurl(cms_url, params.originalurl, AllComments.STEP_3,
                          {'aid': params.customized['aid'], 'cmsid': cms_id})
        elif params.step is AllComments.STEP_3:
            comments = json.loads(params.content)
            sid = comments['data']['_id']
            comment_url = AllComments.COMMENTS_URL % (sid, '1', params.customized['cmsid'])
            self.storeurl(comment_url, params.originalurl, AllComments.STEP_4,
                          {'sid': sid, 'page': '1', 'cmsid': params.customized['cmsid']})
        elif params.step is AllComments.STEP_4:
            comments = json.loads(params.content)
            try:
                comment = []
                index = 0
                for index in range(0, len(comments['data'])):
                    ctime = TimeUtility.getuniformtime2(comments['data'][index]['ctime'])
                    if URLStorage.storeupdatetime(params.originalurl, str(ctime)):
                        cmti = CommentInfo()
                        cmti.content = comments['data'][index]['content']
                        comment.append(cmti)
                self.commentstorage.store(params.originalurl, comment)
                comment_url = AllComments.COMMENTS_URL % (
                    params.customized['sid'],
                    str(int(params.customized['page']) + 1),
                    params.customized['cmsid'])
                self.storeurl(comment_url, params.originalurl, AllComments.STEP_4,
                              {'sid': params.customized['sid'],
                               'page': str(int(params.customized['page']) + 1),
                               'cmsid': params.customized['cmsid']})
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)