示例#1
0
    def parse_follow(self,response):
        """Schedule user-info requests for uids found on a follow-list page.

        The followed-user uids are extracted from ``response.body`` with
        ``Analyzer``.  When the request URL is page 1 of the follow list, the
        first uid is stored through ``getinfo.set_followflag`` with the flag
        ``'False'``.  Then, for at most the first two uids, one ``Request``
        for the third-load URL of the last weibo page is yielded with
        ``self.get_userurl`` as callback.

        Iteration stops at the first uid equal to the previously stored flag
        uid (incremental crawl: already crawled uids are not fetched again)
        or as soon as the stored stop flag is not ``'False'``.

        :param response: Scrapy response of a follow-list page; its request
            URL must contain ``_page=<number>`` and its ``meta`` must carry
            a ``'cookiejar'`` entry.
        :returns: generator of ``Request`` objects.
        """
        item = WeibospiderItem()
        analyzer = Analyzer()
        total_pq = analyzer.get_followhtml(response.body)
        followlist = analyzer.get_follow(total_pq)
        # Flags are read BEFORE the page-1 update below, as in the original flow.
        oldflag, stopflag = getinfo.get_followflag(WeiboSpider.filename)

        # Page number of the follow-list page being parsed.  `\d+` is needed:
        # a single `\d` would read pages 10-19 as page 1 and wrongly reset
        # the follow flag on those pages.
        page_pattern = re.compile(r'.*_page=(\d+).*', re.S)
        current_page = int(page_pattern.search(response.request.url).group(1))

        if current_page == 1:
            # An empty follow list has no first uid to record; skip the
            # update instead of raising IndexError.
            if followlist:
                getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'False')
            print('page is equal 1 ')
        else:
            print('page is NOT equal 1')

        for follow_uid in followlist[:2]:
            print('%%%%%%%%%%%%%%%%%%%%%%%%%%' + ' ' + str(follow_uid))
            if follow_uid == oldflag:
                # Reached the uid recorded by a previous run: everything from
                # here on was already crawled (incremental crawl).
                break
            if stopflag != 'False':
                break

            # Crawl the weibo content of this uid's home page.
            getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'True')
            mainpageurl = 'http://weibo.com/u/'+str(follow_uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
            GetWeibopage.data['uid'] = follow_uid
            getweibopage = GetWeibopage()
            for page in range(WeiboSpider.page_num):
                GetWeibopage.data['page'] = page+1
                # Only the third load of each page is built here.
                # NOTE(review): 'pagebar' is read right after this call, so
                # get_thirdloadurl() presumably sets it - keep this order.
                thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                # On the last load of the last page, fetch the user's basic info.
                if int(GetWeibopage.data['pagebar']) == 1 and page == WeiboSpider.page_num-1:
                    print('hhhhhhhhhhhhhhhhhhhh' + ' ' + str(followlist))
                    yield Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid,'followlist':followlist},callback=self.get_userurl)
 def parse_follow(self,response):
     """Return an item whose 'followuidlist' holds the uids parsed from ``response.body``.

     The body is first converted by ``Analyzer.get_followhtml`` and the
     result is handed to ``Analyzer.get_follow``.
     """
     follow_item = WeibospiderItem()
     page_analyzer = Analyzer()
     follow_item['followuidlist'] = page_analyzer.get_follow(
         page_analyzer.get_followhtml(response.body)
     )
     return follow_item