def __upload__(self, filepath):
    flag = True
    FileUtility.mkdirs(self.urlbackuppath)
    FileUtility.copy(filepath, self.urlbackuppath)
    self.upload_file_list[FileUtility.getfilename(filepath)] = []
    # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
    #     if self.limpls:
    #         if self.limplsindex >= len(self.limpls):
    #             self.limplsindex = 0
    #         flag = self.limpls[self.limplsindex].upload(filepath)
    #         self.limplsindex += 1
    if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
        if self.wimpls:
            if self.wimplsindoex >= len(self.wimpls):
                self.wimplsindoex = 0
            self.wimpls[self.wimplsindoex].upload(filepath)
            self.wimplsindoex += 1
    elif self.impls:
        if self.implsindex >= len(self.impls):
            self.implsindex = 0
        flag = self.impls[self.implsindex].upload(filepath)
        self.implsindex += 1
    else:
        flag = False
        Logger.getlogging().warning('No taskid or download platform!')
    return flag
def step2(self, params): try: Logger.getlogging().info("xinhuaComments.STEP_2") # 将STEP_1中的commentinfo_url传下来 newsId = params.customized['newsId'] comments_info = json.loads(params.content) comments_count = comments_info['totalRows'] NewsStorage.setcmtnum(params.originalurl, comments_count) page_count = comments_info['totalPage'] # 判断增量 cmtnum = CMTStorage.getcount(params.originalurl, True) if cmtnum >= comments_count: return # 判断增量 if page_count >= self.maxpages: page_count = self.maxpages for index in range(0, int(page_count)): commentinfo_url = xinhuaNewsComments.COMMENTS_URL_NEWS.format( newsId=newsId, pid=(index + 1)) self.storeurl(commentinfo_url, params.originalurl, xinhuaNewsComments.STEP_3) except: Logger.printexception()
def substep1(self, params, formats):
    value = self.r.parse(formats, params.url)[0]
    Logger.getlogging().debug(value)
    type = value[0]
    sid = int(value[1])
    Logger.getlogging().debug(type)
    Logger.getlogging().debug(sid)
    # Fetch the play count; 'youxi' pages have no play count
    others = ['video', 'yule']
    if type in others:
        # Albums under the 'yule' channel need special handling
        if params.originalurl.find('album') > 0:
            sid = int(self.albumfilter(params))
        url = self.CLICK_URL.format(type=type, id1=str(sid)[:3], id2=sid)
        self.storeurl(url, params.originalurl, KanKanComments.STEP_CLICK,
                      {'sid': sid})
    else:
        Logger.getlogging().warning(
            '{url} :40000 Sorry, {type} maybe others!'.format(
                url=params.url, type=type))
    # Convert the type for the comment request
    type = self.typeconvert(value[0], params.url)
    commentinfo_url = KanKanComments.COMMENTS_URL2 % (type, sid, 1, self.PERPAGE)
    Logger.getlogging().debug(commentinfo_url)
    self.storeurl(commentinfo_url, params.originalurl, KanKanComments.STEP_2,
                  {'type': type, 'sid': sid})
def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            try:
                threadid = self.r.parse('data-thread-key=\"(.*?)\"', params.content)[0]
                comments_url = AllComments.COMMENTS_URL % (threadid, 1)
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_2,
                              {'threadid': threadid, 'pageno': 1})
            except:
                return
        elif params.step is AllComments.STEP_2:
            try:
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
                comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'],
                                                           params.customized['pageno'])
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_3,
                              {'threadid': params.customized['threadid'],
                               'pageno': params.customized['pageno'],
                               'totalpage': pagetotal})
            except:
                return
        elif params.step is AllComments.STEP_3:
            try:
                if params.customized['pageno'] <= params.customized['totalpage']:
                    comments = json.loads(params.content)
                    roll = len(comments['response'])
                    ptimer = []
                    pcontent = []
                    for key in comments['parentPosts'].keys():
                        ptime = comments['parentPosts'][key]['created_at']
                        ptime = ptime.split("+")[0]
                        ptime = ptime.replace("T", " ")
                        ptimer.append(datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M:%S'))
                        pcontent.append(comments['parentPosts'][key]['message'])
                    for ctime in range(0, len(ptimer)):
                        ptimer[ctime] = datetime.datetime.strptime(str(ptimer[ctime]),
                                                                   '%Y-%m-%d %H:%M:%S')
                    index = 0
                    comments = []
                    complete = False
                    for comment in pcontent:
                        cmti = CommentInfo()
                        cmti.content = comment
                        if URLStorage.storeupdatetime(params.originalurl, str(ptimer[index])):
                            comments.append(cmti)
                        else:
                            complete = True
                            break
                        index = index + 1
                    self.commentstorage.store(params.originalurl, comments)
                    if not complete:
                        comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'],
                                                                   params.customized['pageno'] + 1)
                        self.storeurl(comments_url, params.originalurl, AllComments.STEP_2,
                                      {'threadid': params.customized['threadid'],
                                       'pageno': params.customized['pageno'] + 1,
                                       'totalpage': params.customized['totalpage']})
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)
def download(self):
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.info.donepath)
    srclist = self.sshls(self.info.donepath)
    for donefile in srclist:
        donefile = donefile.strip()
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.mkdirs(self.info.localdonepath)
                    self.sshdownload(donefile)
                    dfile = self.info.localdonepath + FileUtility.getfilename(donefile)
                    if self.info.jsonpath:
                        dfile = self.bin2json(dfile)
                    files.append(dfile)
                    self.download_time = int(time.time())
                    self.upload_file_list.pop(upfile)
                    self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(dfile):
                        Logger.getlogging().error(
                            'no json file generate from done file:{done}'.format(done=dfile))
                    break
    return files
def query(self, info):
    Logger.getlogging().info("AngeeksS2Query.query")
    keyvalue = Common.urlenc(info)
    # Step 1: build the query url below from the key
    if int(self.querylastdays) <= 7:
        datevalue = self.WEEKLY
    elif int(self.querylastdays) <= 30:
        datevalue = self.MONTHLY
    else:
        datevalue = None
    if datevalue is None:
        urls = [AngeeksS2Query.QUERY_TEMPLATE_ALL.format(key=keyvalue, page=0)]
    else:
        urls = [AngeeksS2Query.QUERY_TEMPLATE.format(key=keyvalue, page=0, date=datevalue)]
    Logger.getlogging().debug(urls[0])
    self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE,
                               {'query': info, 'date': datevalue})
def get(self, url):
    saveJson = {}
    try:
        Logger.getlogging().debug('Downloading: {url}'.format(url=url))
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request, timeout=self.timeout)
        code = response.getcode()
        info = response.info()
        # Check the response code; return None if it is not 200
        if code == 200:
            html = response.read()
            if ("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip"):
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            Logger.getlogging().debug('Request Succeeded: {url}'.format(url=url))
        else:
            Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
            Logger.getlogging().error('Request Failed: {url}'.format(url=url))
            return None
    except:
        Logger.getlogging().error('Request Failed: {url}'.format(url=url))
        Logger.printexception()
        return None
    charset = RegexUtility.getid('charset', html)
    html = Common.trydecode(html, charset)
    saveJson['foundin'] = Common.urlenc(url)
    saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
    saveJson['crawler_time'] = int(time.time())
    jsonStr = json.dumps(saveJson)
    return jsonStr
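# A minimal standalone sketch (not part of the crawler classes above, name is
# illustrative) showing why get() calls zlib.decompress(html, 16 + zlib.MAX_WBITS):
# a window size of 16 + MAX_WBITS tells zlib to expect (or emit) a gzip wrapper,
# which matches responses served with "Content-Encoding: gzip".
def _gzip_roundtrip_demo(raw_bytes):
    import zlib
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    gzipped = compressor.compress(raw_bytes) + compressor.flush()
    # Decompressing with the same window value recovers the original bytes
    return zlib.decompress(gzipped, 16 + zlib.MAX_WBITS)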
def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            key = int(re.findall("\d+", params.url.split("/")[-1])[0])
            comments_url = AllComments.COMMENTS_URL % (key)
            self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'key': key})
        elif params.step is AllComments.STEP_2:
            jsoncontent = self.r.parse('data\((.*?)\)', params.content)[0]
            comments = json.loads(jsoncontent)
            pcontent = []
            ptime = []
            index = 0
            for index in range(0, len(comments['comments'])):
                pcontent.append(comments['comments'][index]['comment_content'])
                ptime.append(comments['comments'][index]['comment_date'])
            dataresult = {}
            for i in range(len(pcontent)):
                dataresult[ptime[i]] = pcontent[i]
            comments = []
            dataresult = sorted(dataresult.iteritems(),
                                key=lambda dataresult: dataresult[0],
                                reverse=True)
            for k in range(0, len(dataresult)):
                if URLStorage.storeupdatetime(params.originalurl, dataresult[k][0]):
                    cmti = CommentInfo()
                    cmti.content = dataresult[k][1]
                    comments.append(cmti)
            self.commentstorage.store(params.originalurl, comments)
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)
def step2_2(self, params):
    """"""
    try:
        jsondata = json.loads(params.content)
        data = jsondata['data']
        soup = BeautifulSoup(data, 'html5lib')
        divs = soup.select('.comment')
    except:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    #comments_total = len(divs)
    #cmtnum = URLStorage.getcmtnum(params.originalurl)
    #if cmtnum >= comments_total:
    #    return
    #URLStorage.setcmtnum(params.originalurl, comments_total)
    comments = []
    #divs.reverse()
    for div in divs:
        cmti = CommentInfo()
        cmti.content = div.find(attrs={'style': re.compile('padding-top')}).get_text().strip()
        tm = div.select_one('.show-time').get_text()
        tm = getuniformtime(tm)
        if not tm:
            continue
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    if len(comments) > 0:
        # Store the collected comments
        self.commentstorage.store(params.originalurl, comments)
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
            sitelist = []
            for site in self.factory.getall():
                if site.exists2():
                    sitelist.append(site)
            SpiderReport.loadquery(querylist)
            SpiderReport.loadsites(sitelist)
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history (backup) paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def step1(self, params):
    info = params.customized['info']
    pages_num = params.customized['pages_num']
    soup = BeautifulSoup(params.content, 'html5lib')
    #print soup
    if soup.find(attrs={"id": re.compile('noresult_part._container')}) and int(pages_num) == 1:
        Logger.getlogging().warning('{0}:40000 No urllist!'.format(params.url))
        return
    pages = soup.find_all(attrs={'id': re.compile('sogou_page_.*')})
    if not pages and int(pages_num) == 1:
        self.step2(params)
        return
    nexted = soup.select_one('#sogou_next')
    temp = pages_num
    # Refresh the page count from the newest result page
    if nexted:
        pages_num = int(pages[-1].get_text())
    elif not soup.find(attrs={"id": re.compile('noresult_part._container')}):
        pages_num = int(pages[-1].get_text())
    if pages_num <= temp:
        pages_num = temp
    if pages_num >= self.maxpages:
        pages_num = self.maxpages
    querylist = []
    # The first page exposes at most 10 result pages; each later page exposes up to 5 more
    maxpage = 10 + int(math.ceil(float(pages_num - 10) / 5)) * 5
    if not nexted or pages_num == self.maxpages or (nexted and pages_num < max(pages_num, 10)):
        for page in range(1, pages_num + 1):
            querylist.append(Newstencent.COMMON_URL.format(info=info, page=page))
        self.__storeqeuryurllist__(querylist, self.NEWS_EACH)
        return
    querylist.append(Newstencent.COMMON_URL.format(info=info, page=pages_num))
    self.__storeqeuryurllist__(querylist, self.NEWS_FIRST, {'info': info, 'pages_num': pages_num})
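# A small worked check (function name is illustrative, not from the project) of
# the paging arithmetic in step1() above: with 10 result-page links visible on
# the first page and up to 5 more exposed per subsequent page, the expression
# 10 + ceil((pages_num - 10) / 5) * 5 rounds pages_num up to the next reachable
# boundary, e.g. pages_num = 23 -> 10 + ceil(13/5) * 5 = 25.
def _reachable_maxpage_demo(pages_num):
    import math
    if pages_num <= 10:
        return 10
    return 10 + int(math.ceil(float(pages_num - 10) / 5)) * 5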
def step3(self, params):
    soup = BeautifulSoup(params.content, 'html5lib')
    if soup.find(attrs={"id": re.compile('noresult_part._container')}):
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    results = soup.select('.results > .vrwrap')
    if not results:
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    urllist = []
    for item in results:
        try:
            if not item.select_one('h3.vrTitle > a'):
                continue
            if item.select_one('#hint_container'):
                continue
            title = item.select_one('h3.vrTitle > a').get_text()
            href = item.select_one('h3.vrTitle > a').get('href')
            timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
            times = getuniformtime(timestr)
            Logger.getlogging().debug('title:' + title)
            Logger.getlogging().debug('time:' + times)
            if compareNow(times, self.querylastdays):
                Logger.getlogging().debug('href:' + href)
                urllist.append(href)
        except:
            Logger.printexception()
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
def flush():
    # dump s1 download failed url
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump none url got from website for query
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1
    #
    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query],
                                    True)
        else:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum,
                                    True)
    #
    # report
    filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                         const.SPIDER_INFO_REPORT_FILE).format(
                                             date=TimeUtility.getcurrentdate())
    FileUtility.remove(filename)
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL',
        query='QUERY',
        type='TYPE',
        v1='UPLOAD',
        v2='DOWNLOAD',
        v3='NO_TEMPLATE',
        v4='NO_SITE',
        v5='WITH_CMT',
        v6='FAILED'
    ))
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(
            rate=Common.float2percent(rate),
            th=Common.float2percent(threshold))
        SpiderNotify.notify(param)
def analysis(self, line, method):
    try:
        js = json.loads(line)
        param = ProcessParam()
        param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
        param.url = Common.urldec(js['foundin'])
        param.content = js['html']
        if method == constant.REQUEST_TYPE_POST:
            param.data = js['data']
        if js['html'][:3] == constant.GZIP_CODE:
            param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
        # decode
        content = Common.urldec(param.content)
        charset = RegexUtility.getid('charset', content)
        content = Common.trydecode(content, charset)
        param.content = content
        return param
    except:
        line = line.replace('\n', '').strip()
        if not line or line[0] == '#':
            return
        Logger.getlogging().debug(line)
        param = ProcessParam()
        param.url = line
        if method == constant.REQUEST_TYPE_POST:
            js = json.loads(line)
            param.url = js['url']
            param.data = js['data']
        param.content = HttpCacher.getcontent(line, method)
        if param.content is None:
            return
        return param
def step2(self, params): """""" print params.content try: jsondata = json.loads(params.content) comments_total = int(jsondata['comments_total']) comments_data = jsondata['comments'] except: Logger.getlogging().warning( '{url}:30000 No comments'.format(url=params.originalurl)) return #cmtnum = URLStorage.getcmtnum(params.originalurl) #if cmtnum >= comments_total: #return #URLStorage.setcmtnum(params.originalurl, comments_total) comments = [] for comment in comments_data: cmti = CommentInfo() cmti.content = comment['txtcontent'] tm = comment['addtime'] if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) if len(comments) > 0: # 保存获取的评论 self.commentstorage.store(params.originalurl, comments) self.post_data['p'] = str(int(self.data['p'] + self.page_size)) self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S') self.storeposturl(self.post_url, params.originalurl, self.STEP_2, self.post_data)
def j_step2(self, proparam):
    Logger.getlogging().info("Comments163.STEP_1_5")
    # http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/C22OL63405178D8P
    productKey = proparam.customized['productKey']
    docId = proparam.customized['docId']
    field = proparam.customized['field']
    try:
        commentsinfo = json.loads(proparam.content)
        ugcarr = commentsinfo['ugc'].split(',')
    except:
        Logger.getlogging().warning('{0}:30000 No comments'.format(proparam.originalurl))
        return
    if len(ugcarr) < 3:
        if ugcarr[0].strip() == 'comment_bbs':
            # No comments; return directly
            return
        ugcval = ugcarr[0].split('_')
        field = ugcval[0].strip()
    else:
        field = ugcarr[2].strip()
    commentinfo_url = 'http://sdk.comment.163.com/api/v1/products/{key}/threads/{docid}'.format(
        key=productKey, docid=docId)
    self.storeurl(commentinfo_url, proparam.originalurl, JComments.J_STEP_3,
                  {'productKey': productKey, 'docId': docId, 'field': field})
def step2bbs(self, params): Logger.getlogging().info("Dm5Commnets.STEP_2") # 将STEP_1中的docurl传下来 docurl = params.customized['docurl'] comments_count = self.r.parse(ur'(\d+)个回复', params.content)[0] # 判断增量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) # 总数除以page_size,然后加1,可得到评论总页数comments_count pagenum = 0 xparser = XPathUtility(params.content) if not xparser.xpath('//*[@class="inkk ma5"]'): Logger.getlogging().warning('{0}:30001'.format(params.originalurl)) return pageList = xparser.xpath('//*[@id="search_fy"]/a/text()') if not pageList: pagenum = 1 else: pagenum = int(pageList[-2]) for page in range(1, pagenum + 1, 1): comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_3_BBS)
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty temp file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)
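# A minimal sketch (function name and arguments are illustrative, not from the
# project) of the write-to-temp-then-rename pattern used by download() above:
# the '.done' file only appears once it is fully written, and on POSIX systems
# os.rename within the same filesystem is atomic, so consumers watching the
# done path never see a half-written file.
def _temp_then_rename_demo(donefile_path, lines):
    import os
    tmp_path = donefile_path + '.temp'
    with open(tmp_path, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    # Publish the finished file in a single step
    os.rename(tmp_path, donefile_path)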
def step2(self, params): try: Logger.getlogging().info("Kr36Comments.STEP_2") # 将STEP_1中的cid传下来 cid = params.customized['cid'] jsoncontent = json.loads(params.content) comments_count = jsoncontent['data']['total_items'] page_count = jsoncontent['data']['total_pages'] # 判断增量 cmtnum = CMTStorage.getcount(params.originalurl) if cmtnum >= comments_count: return #最多只取十页评论 # page_num = int(math.ceil(float(comments_count - cmtnum) / self.page_size)) if page_count >= self.maxpages: page_count = self.maxpages lasttime = CMTStorage.getlastpublish(params.originalurl,True) for page in range(1, page_count+1, 1): commentinfo_url = Kr36Comments.COMMENT_URL.format(cid, self.page_size, page) self.storeurl(commentinfo_url, params.originalurl, Kr36Comments.STEP_3,lasttime) except: Logger.printexception()
def process(self, params):
    # Build the search result page URLs from the result count on the first search page
    if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
        # Get the first-page url parameters
        KEY = params.customized['KEY']
        time = params.customized['time']
        # Get the total page count
        xparser = XPathUtility(params.content)
        pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
        if len(pageCounts) > 0:
            page = str(pageCounts[0]).split('/')[1]
            # Process the search results on the first page
            self.pageprocess(params)
            if int(page) > 1:
                if int(page) >= self.maxpages:
                    page = self.maxpages
                querylist = []
                # Build the query list from the total page count
                # (page 1 has already been fetched, so start from page 2)
                for pages in range(2, int(page) + 1, 1):
                    url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(
                        KEY=KEY, pn=pages, time=time)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE,
                                           {'KEY': KEY})
        else:
            Logger.getlogging().debug('Sorry, no posts related to ' + KEY + ' were found')
    # Get the video URLs from the result pages
    elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
        self.pageprocess(params)
def process(self, params):
    # Build the search result page URLs from the result count on the first search page
    if params.step == LeQuery.LETV_S2QUERY_FIRST_PAGE:
        # Get the first-page url parameters
        q = params.customized['query']
        content = json.loads(params.content)
        count = content['video_count']
        if int(count) == 0:
            Logger.getlogging().info('count:{count}'.format(count=count))
            return
        # Query list for all remaining pages
        querylist = []
        if count > 510:
            totalpage = 17
        else:
            totalpage = int(math.ceil(float(count) / LeQuery.DEFAULT_PAGE_SIZE))
        # Process the search results on the first page
        self.gets2url(params)
        if totalpage > self.maxpages:
            totalpage = self.maxpages
        # Build the query list from the total page count
        # (page 1 has already been fetched, so start from page 2)
        for page in range(2, totalpage + 1, 1):
            url = LeQuery.LETV_QUERY_TEMPLATE.format(pn=page, q=params.customized['query'])
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, LeQuery.LETV_S2QUERY_EACH_PAGE, {'query': q})
    # Get the video URLs from the result pages
    elif params.step == LeQuery.LETV_S2QUERY_EACH_PAGE:
        self.gets2url(params)
def step3bbs(self, params): Logger.getlogging().info("Tmtpostcommnets.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 commentsinfo = json.loads(params.content) comments = [] #for index in range(0, int(len(commentsinfo['data'])), 1): ## 提取时间 #cmti = CommentInfo() #cmti.content = commentsinfo['data'][index]['comment'] #tm = TimeUtility.getuniformtime(commentsinfo['data'][index]['time_created'], u'%Y-%m-%d %H:%M') #if URLStorage.storeupdatetime(params.originalurl, tm): #comments.append(cmti) jsondata = commentsinfo['data'] if not jsondata: return for data in jsondata: cmti = CommentInfo() cmti.content = data['comment'] tm = gettimeutil.getuniformtime(data['time_created']) if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) # 保存获取的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def step2(self, params): """""" try: key = params.customized['key'] soup = BeautifulSoup(params.content, 'html5lib') #print soup #searchListOne = soup.select('.searchListOne > ul') searchListOne = soup.select('.searchListOne > ul > li > div') if not searchListOne: Logger.getlogging().warning('{}:40000 No urllist'.format( params.originalurl)) return lis = soup.select( '.searchListOne > ul > li' )[:-1] #最后一个<li id=search_msg style="display:none"></li>,过滤掉 urllist = [] for li in lis: url = li.select_one('h3 > a').get('href') #print '*********',url tm = li.select('.source > span')[0].get_text() tm = getuniformtime(tm) now = getuniformtime(str(time.time())) cmt_num = li.select('.source > span')[-1].get_text() title = li.select_one('h3').get_text() if Common.checktitle(Common.urldec(key), title): if compareNow(tm, self.querylastdays): urllist.append(url) if len(urllist) > 0: self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA) except: #traceback.print_exc() Logger.printexception() Logger.getlogging().error( 'extract comment error from {site}'.format(site=params.url))
def pageprocess(self, params):
    # Step 3: from the returned html, get the result titles via
    # //*[@class="scout_anim_titletext"] and the result urls via
    # //*[@class="scout_anim_title"]/div/a/@href
    #Logger.getlogging().debug(params.content)
    indexstart = params.content.find('(')
    indexstop = params.content.rfind(')')
    if indexstart > -1 and indexstop > -1:
        jsonvalue = params.content[indexstart + 1:indexstop]
        jsondata = json.loads(jsonvalue)
        info = params.customized['query']
        soup = BeautifulSoup(jsondata['content'], 'html5lib')
        uls = soup.select('.scout_anim_odd > .scout_anim_odd_ul')
        if uls:
            for ul in uls:
                #titles = ul.select_one('.scout_anim_titletext')
                titles = ul.select_one('.scout_anim_titletext').get_text()
                Logger.getlogging().debug(titles)
                # if info not in titles:
                if not Common.checktitle(info, titles):
                    return
                content = ul.select('.scout_anim_content > div > ul > li')
                if content:
                    if len(content) > 3:
                        content = content[-3:]
                    urllist = ['https://donghua.dmzj.com' + item.find('a').get('href')
                               for item in content]
                    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
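# A minimal sketch (function name is illustrative) of the JSONP unwrapping done
# at the top of pageprocess() above: the response body looks like callback({...}),
# so everything between the first '(' and the last ')' is the JSON payload.
def _unwrap_jsonp_demo(text):
    import json
    start = text.find('(')
    stop = text.rfind(')')
    if start > -1 and stop > -1:
        return json.loads(text[start + 1:stop])
    return None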
def step2(self, params): """""" q = params.customized['query'] soup = BeautifulSoup(params.content, 'html5lib') divs = soup.select('.videobox') if not divs: Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS) return urllist = [] for div in divs: title = div.select_one('.title').get_text() #print title tm = getuniformtime(div.select_one('.date').get_text()) url = div.select_one('.title > a').get('href') Logger.getlogging().debug(title) if not compareNow(tm, self.querylastdays): Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME) continue if not Common.checktitle(Common.urldec(q), title): Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE) continue urllist.append(url) #获取最终url列表 if len(urllist) > 0: self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def upload(self, upfile):
    cmd = self.UPLOADCMD.format(appId=self.appid, token=self.token,
                                times=self.times, path=upfile)
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        Logger.getlogging().debug(cmd)
        self.jobid = 'test'
        return True
    exedata = self.execute(cmd)
    code = exedata.get('code', 0)
    if int(code) == 1:
        self.jobid = exedata['jobId']
        return True
    # Retry with exponential backoff
    secs = 5
    for count in range(0, self.RETRYTIMES):
        time.sleep(secs)
        secs *= 2
        exedata = self.execute(cmd)
        code = exedata.get('code', 0)
        if int(code) == 1:
            self.jobid = exedata['jobId']
            return True
    else:
        # All retries failed: notify and report failure
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
        param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
            file=upfile, taskid=self.appid)
        SpiderNotify.notify(param)
        return False
def common_step3(self, proparam):
    # Handling for NetEase (163) non-cloud-reader pages
    try:
        commentsinfo = json.loads(proparam.content)
    except:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=proparam.originalurl))
        return
    #commentsinfo = json.loads(proparam.content)
    comments = []
    # Extract the comments; stop at the first comment that is already stored
    key_comments = 'comments'
    if key_comments in commentsinfo:
        for key in commentsinfo[key_comments].keys():
            try:
                nickname = commentsinfo[key_comments][key]['user']['nickname']
            except:
                nickname = 'anonymous'
            if not CMTStorage.exist(proparam.originalurl,
                                    commentsinfo[key_comments][key]['content'],
                                    commentsinfo[key_comments][key]['createTime'],
                                    nickname):
                CMTStorage.storecmt(proparam.originalurl,
                                    commentsinfo[key_comments][key]['content'],
                                    commentsinfo[key_comments][key]['createTime'],
                                    nickname)
            else:
                break
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step is KanKanComments.STEP_1:
            # Fetch the play count (not every video has one)
            self.setclicknum(params)
            if self.r.match(self.TYPE1, params.originalurl):
                # Step 1: get the movieid from the original url and build the first comment-page url
                movieid = self.r.parse(self.TYPE1, params.url)[0]
                Logger.getlogging().debug(movieid)
                commentinfo_url = KanKanComments.COMMENTS_URL1.format(
                    movieid=movieid, page=1, perpage=self.PERPAGE)
                self.storeurl(commentinfo_url, params.originalurl, KanKanComments.STEP_2,
                              {'movieid': movieid})
            elif self.r.match(self.TYPE2, params.originalurl):
                # Step 1: get type and sid from the original url and build the first comment-page url
                self.substep1(params, self.TYPE2)
            elif self.r.match(self.TYPE3, params.originalurl):
                # Step 1: get type, sid and vchannel from the original url and build the first comment-page url
                self.substep1(params, self.TYPE3)
        elif params.step == KanKanComments.STEP_2:
            self.step2(params)
        elif params.step == KanKanComments.STEP_3:
            self.step3(params)
        elif params.step == KanKanComments.STEP_CLICK:
            self.step_click(params)
    except:
        Logger.printexception()
def storecmt(url, content, pubdate, user):
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    if not CMTStorage.exist(url, content, pubdate, user):
        Logger.getlogging().debug(
            'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.format(
                url=url, content=content, pubdate=pubdate, user=user))
        id = CMTStorage.getid(url, content, pubdate, user)
        data = {
            SQLDAO.SPIDER_TABLE_COMMENTS_ID: id,
            SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
            SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
            SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
            SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
            SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_COMMENTS,
                                    SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
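# A minimal sketch (hypothetical helper, not the project's CMTStorage.getid) of
# the dedup-key idea used by storecmt() above: a stable id derived from the
# comment's identifying fields lets exist()/insert() skip duplicate rows.
def _comment_id_demo(url, content, pubdate, user):
    import hashlib
    raw = u'|'.join([url, content, pubdate, user]).encode('utf-8')
    return hashlib.md5(raw).hexdigest()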