def parse_load(self, response):
    """Parse one loaded chunk of a user's weibo page.

    On the first load of the first page (``pre_page=0`` and ``page=1`` in
    the request URL) the ``t_user_info`` table is checked for ``self.uid``.
    When no row exists and the account is not a public account
    (``icon_verify_co_v``), a request for the user's profile page is
    yielded.  A ``WeibospiderItem`` built from the response body is then
    yielded, followed by one uid-lookup request per @-mentioned nickname.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'`` and ``'cookiejar'``.

    Yields:
        ``Request`` objects and one ``WeibospiderItem``.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'&pre_page=(\d).*&page=(\d)', response.request.url)
    # A URL without the paging parameters is treated as "not the first
    # load" instead of failing with AttributeError on ``match.group``.
    is_first_load = (match is not None
                     and int(match.group(1)) == 0
                     and int(match.group(2)) == 1)

    if is_first_load:
        db = OracleStore()
        conn = db.get_connection()
        # NOTE(review): the uid is interpolated straight into the SQL text;
        # switch to a bind variable if select_operation supports one.
        sql = "select count(*) from t_user_info where userID='%s'" % self.uid
        cursor = db.select_operation(conn, sql)
        try:
            count = cursor.fetchone()
        finally:
            # Release the connection before any ``yield`` so it is neither
            # held while the generator is suspended nor leaked on error.
            db.close_connection(conn, cursor)

        if not count[0]:
            # This uid has not been crawled yet: fetch the basic profile.
            analyzer = Analyzer()
            total_pq = analyzer.get_html(
                response.body, 'script:contains("PCD_person_info")')
            user_property = analyzer.get_userproperty(total_pq)
            # Public accounts are not handled for now (the original note
            # says the database needs foreign keys set up first).
            if user_property != 'icon_verify_co_v':
                userinfo_url = analyzer.get_userinfohref(total_pq)
                yield Request(url=userinfo_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': response.meta['uid'],
                                  'user_property': user_property
                              },
                              callback=self.parse_userinfo)

    # Weibo content plus @-user / repost-user information.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(
        total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    # One uid lookup per @-mentioned nickname; empty groups are skipped.
    for atuser_inlist in atuser_list:
        if not atuser_inlist:
            continue
        for atuser in atuser_inlist:
            uid_url = ("http://s.weibo.com/user/"
                       + quote(quote(str(atuser)))
                       + "&Refer=SUer_box")
            yield Request(url=uid_url,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid,
                              'atuser_nickname': atuser
                          },
                          callback=self.parse_atuser_uid)
def parse_load(self, response):
    """Parse one loaded chunk of a user's weibo page.

    On the first load of the first page (``pre_page=0`` and ``page=1`` in
    the request URL), and only when ``response.meta['is_search']`` is
    falsy, a request for the main user's profile page is yielded unless the
    account is a public account (``icon_verify_co_v``).  A
    ``WeibospiderItem`` built from the response body is then yielded,
    followed by one uid-lookup request per non-empty repost-user alias.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'`` and ``'cookiejar'``, plus ``'is_search'`` on the
            first load.

    Yields:
        ``Request`` objects and one ``WeibospiderItem``.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'&pre_page=(\d).*&page=(\d)', response.request.url)
    is_first_load = (match is not None
                     and int(match.group(1)) == 0
                     and int(match.group(2)) == 1)

    # Main-user profile (i.e. not an @-user or a repost user).  'is_search'
    # is only read on the first load, exactly as before.
    if is_first_load and not response.meta['is_search']:
        analyzer = Analyzer()
        total_pq = analyzer.get_html(
            response.body, 'script:contains("PCD_person_info")')
        user_property = analyzer.get_userproperty(total_pq)
        if user_property != 'icon_verify_co_v':  # not a public account
            userinfo_url = analyzer.get_userinfohref(total_pq)
            yield Request(url=userinfo_url,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': response.meta['uid'],
                              'is_friend': 0
                          },
                          callback=self.parse_userinfo)

    # Weibo content plus @-user / repost-user information.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_html(response.body,
                                 'script:contains("WB_feed WB_feed_v3")')
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(
        total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    # The @-user crawl (is_friend=1) is disabled in this version; only
    # repost users (is_friend=2) are followed up.
    frc_analyzer = friendcircle_analyzer()
    for repostuser_alias in item['repost_user']:
        if not repostuser_alias:  # empty alias: no repost user
            continue
        friend_url = frc_analyzer.get_frienduid_url(repostuser_alias)
        yield Request(url=friend_url,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'uid': response.meta['uid'],
                          'is_friend': 2
                      },
                      callback=self.parse_friend_uid)
def parse_load(self, response):
    """Build one ``WeibospiderItem`` from the response body and yield it.

    The item receives the uid from ``response.meta['uid']`` together with
    the content, time, timestamp, repost users and parsed @-user nickname
    list extracted from the page.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'``.

    Yields:
        A single ``WeibospiderItem``.
    """
    weibo_item = WeibospiderItem()
    parser = Analyzer()
    circle = FriendCircle()
    page_pq = parser.get_mainhtml(response.body)

    weibo_item['uid'] = response.meta['uid']
    weibo_item['content'] = parser.get_content(page_pq)

    posted_at, posted_ts = parser.get_time(page_pq)
    weibo_item['time'] = posted_at
    weibo_item['timestamp'] = posted_ts

    mentions, reposters = parser.get_atuser_repostuser(page_pq)
    weibo_item['repost_user'] = reposters
    weibo_item['atuser_nickname_list'] = circle.atuser_parser(mentions)

    yield weibo_item
def parse_load(self, response):
    """Extract the weibo fields of the response into one item and yield it.

    Fields set on the item: ``uid`` (taken from ``response.meta``),
    ``content``, ``time``, ``timestamp``, ``repost_user`` and
    ``atuser_nickname_list``.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'``.

    Yields:
        A single ``WeibospiderItem``.
    """
    result = WeibospiderItem()
    html_analyzer = Analyzer()
    circle_parser = FriendCircle()
    doc = html_analyzer.get_mainhtml(response.body)

    result['uid'] = response.meta['uid']
    result['content'] = html_analyzer.get_content(doc)

    time_pair = html_analyzer.get_time(doc)
    result['time'], result['timestamp'] = time_pair

    at_and_repost = html_analyzer.get_atuser_repostuser(doc)
    raw_atusers, result['repost_user'] = at_and_repost

    nickname_groups = circle_parser.atuser_parser(raw_atusers)
    result['atuser_nickname_list'] = nickname_groups
    yield result
def parse_load(self, response):
    """Parse one loaded chunk of a user's weibo page.

    On the first load of the first page (``pre_page=0`` and ``page=1`` in
    the request URL), and only when ``response.meta['is_search']`` is
    falsy, a request for the main user's profile page is yielded unless the
    account is a public account (``icon_verify_co_v``).  A
    ``WeibospiderItem`` (content, time, repost/comment/like counts, @-user
    and repost-user data) is then yielded, followed by uid-lookup requests
    for every @-user alias and every non-empty repost-user alias.  Each
    request is sent with a cookie set picked at random from ``COOKIES``.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'``, plus ``'is_search'`` on the first load.

    Yields:
        ``Request`` objects and one ``WeibospiderItem``.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'&pre_page=(\d).*&page=(\d)', response.request.url)
    is_first_load = (match is not None
                     and int(match.group(1)) == 0
                     and int(match.group(2)) == 1)

    # Main-user profile (i.e. not an @-user or a repost user).  'is_search'
    # is 0/falsy when the main user has not been searched yet; it is only
    # read on the first load, exactly as before.
    if is_first_load and not response.meta['is_search']:
        analyzer = Analyzer()
        total_pq = analyzer.get_html(
            response.body, 'script:contains("PCD_person_info")')
        user_property = analyzer.get_userproperty(total_pq)
        if user_property != 'icon_verify_co_v':  # not a public account
            userinfo_url = analyzer.get_userinfohref(total_pq)
            yield Request(url=userinfo_url,
                          cookies=random.choice(COOKIES),
                          meta={'uid': response.meta['uid'], 'is_friend': 0},
                          callback=self.parse_userinfo)

    # Weibo content plus @-user / repost information.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_html(response.body,
                                 'script:contains("WB_feed WB_feed_v3")')
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    weibo_analyzer = weibocontent_analyzer()
    (item['repost_nums'],
     item['comment_nums'],
     item['like_nums']) = weibo_analyzer.get_weibo_relative_args(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(
        total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    frc_analyzer = friendcircle_analyzer()

    # @-users: is_friend=1 marks a request for an @-user's basic info.
    atuser_set = self.get_atuser_set(atuser_list)
    for atuser_alias in atuser_set:
        friend_url = frc_analyzer.get_frienduid_url(atuser_alias)
        yield Request(url=friend_url,
                      cookies=random.choice(COOKIES),
                      meta={'uid': response.meta['uid'], 'is_friend': 1},
                      callback=self.parse_friend_uid)

    # Repost users: is_friend=2 marks a request for a repost user's info.
    for repostuser_alias in item['repost_user']:
        if not repostuser_alias:  # empty alias: no repost user
            continue
        friend_url = frc_analyzer.get_frienduid_url(repostuser_alias)
        yield Request(url=friend_url,
                      cookies=random.choice(COOKIES),
                      meta={'uid': response.meta['uid'], 'is_friend': 2},
                      callback=self.parse_friend_uid)
def parse_load(self, response):
    """Parse one loaded chunk of a user's weibo page.

    On the first load of the first page (``pre_page=0`` and ``page=1`` in
    the request URL) the ``t_user_info`` table is checked for ``self.uid``.
    When no row exists and the account is not a public account
    (``icon_verify_co_v``), a request for the user's profile page is
    yielded.  A ``WeibospiderItem`` built from the response body is then
    yielded, followed by one uid-lookup request per @-mentioned nickname.

    Args:
        response: the response being parsed; ``response.meta`` must carry
            ``'uid'`` and ``'cookiejar'``.

    Yields:
        ``Request`` objects and one ``WeibospiderItem``.
    """
    request_url = response.request.url
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'&pre_page=(\d).*&page=(\d)', request_url)

    # ``match`` is None when the URL carries no paging parameters; such a
    # response is simply not a first load (previously: AttributeError).
    if (match is not None
            and int(match.group(1)) == 0
            and int(match.group(2)) == 1):
        db = OracleStore()
        conn = db.get_connection()
        # NOTE(review): the uid is interpolated straight into the SQL text;
        # switch to a bind variable if select_operation supports one.
        sql = "select count(*) from t_user_info where userID='%s'" % self.uid
        cursor = db.select_operation(conn, sql)
        try:
            count = cursor.fetchone()
        finally:
            # Close before yielding anything: the connection must not stay
            # open while the generator is suspended, nor leak on error.
            db.close_connection(conn, cursor)

        if not count[0]:
            # This uid has not been crawled yet: fetch the basic profile.
            analyzer = Analyzer()
            total_pq = analyzer.get_html(
                response.body, 'script:contains("PCD_person_info")')
            user_property = analyzer.get_userproperty(total_pq)
            # Public accounts are not handled for now (the original note
            # says the database needs foreign keys set up first).
            if user_property != 'icon_verify_co_v':
                userinfo_url = analyzer.get_userinfohref(total_pq)
                yield Request(
                    url=userinfo_url,
                    meta={'cookiejar': response.meta['cookiejar'],
                          'uid': response.meta['uid'],
                          'user_property': user_property},
                    callback=self.parse_userinfo)

    # Weibo content plus @-user / repost-user information.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    # One uid lookup per @-mentioned nickname; empty groups are skipped.
    for atuser_inlist in atuser_list:
        if not atuser_inlist:
            continue
        for atuser in atuser_inlist:
            uid_url = ("http://s.weibo.com/user/"
                       + quote(quote(str(atuser)))
                       + "&Refer=SUer_box")
            yield Request(
                url=uid_url,
                meta={'cookiejar': response.meta['cookiejar'],
                      'uid': self.uid,
                      'atuser_nickname': atuser},
                callback=self.parse_atuser_uid)