def parse_follow(self, response):
    """Yield user-page requests for the first uids of a follow-list page.

    The follow list is extracted from ``response.body`` with ``Analyzer``.
    ``getinfo.get_followflag(WeiboSpider.filename)`` supplies ``oldflag``
    (compared against each uid) and ``stopflag`` (compared against the
    string ``'False'``).  When the request URL says this is page 1, the
    first uid of the list is stored via ``getinfo.set_followflag`` with
    the flag value ``'False'``.

    At most the first two uids are visited.  The walk stops at the first
    uid equal to ``oldflag`` or as soon as ``stopflag`` is not ``'False'``.
    For every uid that passes, the flag is rewritten with ``'True'`` and
    one ``Request`` is yielded for the last page when
    ``GetWeibopage.data['pagebar']`` equals 1.

    NOTE(review): SOURCE contains a second ``parse_follow`` definition
    after this one; if both live in the same scope the later definition
    replaces this one -- confirm which is intended.

    :param response: response for a follow-list page; its request URL is
        expected to contain ``_page=<digits>``.
    :returns: generator of ``Request`` objects whose callback is
        ``self.get_userurl`` and whose ``meta`` carries ``cookiejar``,
        ``item``, ``uid`` and ``followlist``.
    :raises AttributeError: if the request URL has no ``_page=<digits>``.
    """
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_followhtml(response.body)
    followlist = analyzer.get_follow(total_pq)
    oldflag, stopflag = getinfo.get_followflag(WeiboSpider.filename)

    # Page number of the follow-list page being parsed.  The group must be
    # "\d+" rather than "\d": with a single digit, pages 10-19 would be read
    # as page 1 and the stored flag would be reset by mistake.
    page_pattern = re.compile(r'.*_page=(\d+).*', re.S)
    current_page = page_pattern.search(response.request.url).group(1)

    if int(current_page) == 1:
        # An empty follow list has no first uid to record, so skip the write
        # instead of failing with IndexError.
        if followlist:
            getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'False')
        print('page is equal 1 ')
    else:
        print('page is NOT equal 1')

    for follow_uid in followlist[:2]:
        # Single-argument print form: same text as the original statement.
        print('%s %s' % ('%%%%%%%%%%%%%%%%%%%%%%%%%%', follow_uid))

        # Incremental crawl: stop at the first uid equal to the stored flag
        # instead of re-crawling it.
        if follow_uid == oldflag:
            break
        if stopflag != 'False':
            break

        # Crawl the weibo content of this uid's home page.
        getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'True')
        mainpageurl = ('http://weibo.com/u/' + str(follow_uid)
                       + '?from=otherprofile&wvr=3.6&loc=tagweibo')
        GetWeibopage.data['uid'] = follow_uid
        getweibopage = GetWeibopage()

        for page in range(WeiboSpider.page_num):
            GetWeibopage.data['page'] = page + 1
            # URL for the third load of the current page.
            thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
            if int(GetWeibopage.data['pagebar']) == 1 and page == WeiboSpider.page_num - 1:
                # Last load of the last page: request the user's basic info.
                print('%s %s' % ('hhhhhhhhhhhhhhhhhhhh', followlist))
                yield Request(
                    url=thirdloadurl,
                    meta={
                        'cookiejar': response.meta['cookiejar'],
                        'item': item,
                        'uid': follow_uid,
                        'followlist': followlist,
                    },
                    callback=self.get_userurl,
                )
def parse_follow(self, response):
    """Return an item holding the follow-uid list parsed from ``response``.

    ``response.body`` is passed through ``Analyzer.get_followhtml`` and the
    result through ``Analyzer.get_follow``; whatever that returns is stored
    under the ``'followuidlist'`` key of a new ``WeibospiderItem``.

    NOTE(review): SOURCE contains another ``parse_follow`` definition
    before this one; if both live in the same scope, this later definition
    is the one that stays bound -- confirm which is intended.

    :param response: response whose ``body`` holds the follow-list page.
    :returns: the populated ``WeibospiderItem``.
    """
    follow_item = WeibospiderItem()
    page_analyzer = Analyzer()
    follow_item['followuidlist'] = page_analyzer.get_follow(
        page_analyzer.get_followhtml(response.body)
    )
    return follow_item