def parse_user(self, response):
    """Extract user profile info from the JSON profile response.

    :param response:
    :return:
    """
    items = WeibospiderItem()
    data = json.loads(response.text)  # avoid shadowing the response object
    user_fields = ('id', 'screen_name', 'profile_image_url', 'profile_url',
                   'verified_reason', 'description', 'followers_count',
                   'follow_count', 'avatar_hd')
    for field in user_fields:
        items[field] = data['data']['userInfo'][field]
    yield items
    yield Request(self.follow_url.format(uid=items.get('id'), page=1),
                  callback=self.parse_follower,
                  meta={'uid': items.get('id'), 'page': 1})
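# Hedged sketch (not project code): parse_user above assumes the profile
# endpoint returns {"data": {"userInfo": {...}}}. A standalone check of the
# field-copy logic against a fabricated payload:
def _demo_parse_user_fields():
    import json
    sample = json.loads('{"data": {"userInfo": {"id": 1, '
                        '"screen_name": "demo", "followers_count": 0}}}')
    fields = ('id', 'screen_name', 'followers_count')
    return {f: sample['data']['userInfo'][f] for f in fields}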
def parse_userinfo(self, response):
    """Parse profile info for a personal (non-organization) account."""
    item = WeibospiderItem()
    analyzer = Analyzer()
    try:
        total_pq1 = analyzer.get_html(response.body, 'script:contains("pf_photo")')
        item['image_urls'] = (analyzer.get_userphoto_url(total_pq1)
                              + "?uid=" + str(response.meta['uid']))
        total_pq2 = analyzer.get_html(response.body, 'script:contains("PCD_text_b")')
        total_pq3 = analyzer.get_html(response.body, 'script:contains("PCD_counter")')
        if response.meta['is_friend'] == 0:
            # Profile of the main user, not a friend-circle user.
            item['userinfo'] = analyzer.get_userinfo(total_pq2, total_pq3)
        elif response.meta['is_friend'] == 1:
            # Profile of an @-mentioned user.
            item['atuser_userinfo'] = analyzer.get_userinfo(total_pq2, total_pq3)
        else:
            # Profile of a reposting user.
            item['repostuser_userinfo'] = analyzer.get_userinfo(total_pq2, total_pq3)
    except Exception:
        # Fall back to an empty record when the profile page cannot be parsed.
        item['userinfo'] = {}.fromkeys(
            (u'昵称:', u'所在地:', u'性别:', u'博客:', u'个性域名:', u'简介:',
             u'生日:', u'注册时间:', 'follow_num', 'follower_num'), '')
        item['atuser_userinfo'] = item['userinfo']
        item['repostuser_userinfo'] = item['userinfo']
        item['image_urls'] = None
    return item
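# Hedged sketch of the Analyzer.get_html contract assumed throughout these
# callbacks (a guess at the helper, not the project's implementation): old
# weibo.com pages ship their markup as FM.view({... "html": "..."}) JSON
# inside <script> tags, so the helper selects the matching script, pulls the
# "html" field out of the JSON, and wraps it in PyQuery.
import json
import re
from pyquery import PyQuery as pq

def get_html_sketch(body, selector):
    doc = pq(body)
    script_text = doc(selector).text()
    match = re.search(r'FM\.view\((\{.*\})\)', script_text)
    if match is None:
        return None
    return pq(json.loads(match.group(1)).get('html', ''))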
def parse_follower(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_follower_pq = analyzer.get_followerhtml(response.body)
    item['uid'] = response.meta['uid']
    item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
    item['follow_uid_list'] = []
    yield item

    # Crawl the follows and fans of each second-level (follower) user.
    if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
        db = OracleStore()
        conn = db.get_connection()
        for follower_uid in item['follower_uid_list']:
            # Users this follower follows.
            sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid)
            cursor1 = db.select_operation(conn, sql1)
            count1 = cursor1.fetchone()
            follow_scraped = count1[0]
            cursor1.close()
            if not follow_scraped:  # Zero rows: this account has not been scraped yet.
                follow_url = 'http://weibo.com/%s/follow?page=1' % str(follower_uid)
                yield Request(url=follow_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': follower_uid},
                              dont_filter=True,
                              callback=self.parse_based_follownum)
            else:
                print('follow_uid existed!', follower_uid)
                yield None

            # Fans of this follower.
            sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid)
            cursor2 = db.select_operation(conn, sql2)
            count2 = cursor2.fetchone()
            follower_scraped = count2[0]
            cursor2.close()
            if not follower_scraped:  # Zero rows: this account has not been scraped yet.
                follower_url = 'http://weibo.com/%s/fans?page=1' % str(follower_uid)
                yield Request(url=follower_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': follower_uid},
                              dont_filter=True,
                              callback=self.parse_based_followernum)
            else:
                print('follower_uid existed!', follower_uid)
                yield None
        conn.close()
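# Note on the existence checks above: interpolating follower_uid into the SQL
# string only works because uids are numeric, and it is injection-prone in
# general. A hedged alternative using a bind variable (sketched with cx_Oracle
# directly, since OracleStore's API is not shown in this file):
import cx_Oracle  # assumed driver; the project wraps it in OracleStore

def already_scraped(conn, table, uid):
    """Return True when a row for uid exists in table (table name is trusted)."""
    cursor = conn.cursor()
    try:
        cursor.execute("select count(*) from %s where userID = :uid" % table,
                       uid=str(uid))
        return cursor.fetchone()[0] > 0
    finally:
        cursor.close()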
def atuser_uid_parser(self, response):
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_html(response.body, 'script:contains("W_face_radius")')
    uid = friendcircle.get_user_uid(total_pq)
    self.atuser_dict[response.meta['atuser_nickname']] = uid
def parse_load(self, response):
    request_url = response.request.url
    # Decide whether this is the first load of the first page.
    p = re.compile(r'&pre_page=(\d+).*&page=(\d+)')
    match = p.search(request_url)
    if match and int(match.group(1)) == 0 and int(match.group(2)) == 1:
        # Fetch the user's profile info.
        db = OracleStore()
        conn = db.get_connection()
        sql = "select count(*) from t_user_info where userID='%s'" % self.uid
        cursor = db.select_operation(conn, sql)
        count = cursor.fetchone()
        if not count[0]:  # The uid has not been crawled yet, so fetch the profile.
            analyzer = Analyzer()
            total_pq = analyzer.get_html(response.body, 'script:contains("PCD_person_info")')
            user_property = analyzer.get_userproperty(total_pq)
            if user_property == 'icon_verify_co_v':  # A verified organization account.
                public_userinfo_url = analyzer.get_public_userinfohref(total_pq)
                # Organization accounts are skipped for now; handling them needs a
                # foreign key in the database:
                # yield Request(url=public_userinfo_url,
                #               meta={'cookiejar': response.meta['cookiejar'],
                #                     'uid': response.meta['uid'],
                #                     'user_property': user_property},
                #               callback=self.parse_public_userinfo)
            else:
                userinfo_url = analyzer.get_userinfohref(total_pq)
                yield Request(url=userinfo_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': response.meta['uid'],
                                    'user_property': user_property},
                              callback=self.parse_userinfo)
        db.close_connection(conn, cursor)

    item = WeibospiderItem()
    # Extract the user's weibo posts and @-mentioned users.
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    for atuser_inlist in atuser_list:
        if atuser_inlist:
            for atuser in atuser_inlist:
                uid_url = ("http://s.weibo.com/user/"
                           + quote(quote(str(atuser))) + "&Refer=SUer_box")
                yield Request(url=uid_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': self.uid,
                                    'atuser_nickname': atuser},
                              callback=self.parse_atuser_uid)
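# Quick illustration of the first-load check in parse_load: the lazy-load URL
# carries pre_page and page counters, and only the very first request has
# pre_page=0 with page=1. (Sample URL below is fabricated for the demo.)
import re

_first_load = re.compile(r'&pre_page=(\d+).*&page=(\d+)')

def is_first_load(url):
    m = _first_load.search(url)
    return bool(m) and int(m.group(1)) == 0 and int(m.group(2)) == 1

# is_first_load('http://weibo.com/p/aj/mblog?&pre_page=0&domain=100505&page=1')
# returns True; any later page or preloaded chunk returns False.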
def parse_keyuser(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body, 'script:contains("feed_content wbcon")')
    (item['keyword_uid'], item['keyword_alias'], item['keyword_time'],
     item['keyword_timestamp']) = analyzer.get_keyuser(total_pq)
    item['keyword'] = response.meta['keyword']
    return item
def parse_load(self, response):
    request_url = response.request.url
    # Decide whether this is the first load of the first page.
    p = re.compile(r'&pre_page=(\d+).*&page=(\d+)')
    match = p.search(request_url)
    if match:
        if int(match.group(1)) == 0 and int(match.group(2)) == 1:
            # Fetch the current main user's profile (not an @-user or reposter).
            is_search = response.meta['is_search']
            if not is_search:  # The main user has not been searched yet.
                analyzer = Analyzer()
                total_pq = analyzer.get_html(response.body, 'script:contains("PCD_person_info")')
                user_property = analyzer.get_userproperty(total_pq)
                if user_property != 'icon_verify_co_v':  # Not an organization account.
                    userinfo_url = analyzer.get_userinfohref(total_pq)
                    yield Request(url=userinfo_url,
                                  meta={'cookiejar': response.meta['cookiejar'],
                                        'uid': response.meta['uid'],
                                        'is_friend': 0},
                                  callback=self.parse_userinfo)

    item = WeibospiderItem()
    # Extract the user's weibo posts and @-mentioned users.
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_html(response.body, 'script:contains("WB_feed WB_feed_v3")')
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item

    frc_analyzer = friendcircle_analyzer()
    # Resolve uids and profile info of @-mentioned users.
    atuser_set = self.get_atuser_set(atuser_list)
    # for atuser_alias in atuser_set:
    #     friend_url = frc_analyzer.get_frienduid_url(atuser_alias)
    #     # is_friend=1 marks crawling an @-user's profile info.
    #     yield Request(url=friend_url,
    #                   meta={'cookiejar': response.meta['cookiejar'],
    #                         'uid': response.meta['uid'], 'is_friend': 1},
    #                   callback=self.parse_friend_uid)

    # Resolve uids and profile info of reposting users.
    for repostuser_alias in item['repost_user']:
        if repostuser_alias:  # Non-empty alias: the post has a reposting user.
            friend_url = frc_analyzer.get_frienduid_url(repostuser_alias)
            # is_friend=2 marks crawling a reposting user's profile info.
            yield Request(url=friend_url,
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': response.meta['uid'],
                                'is_friend': 2},
                          callback=self.parse_friend_uid)
def parse_thirdload(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'] = analyzer.get_time(total_pq)
    item['atuser'], item['repostuser'] = analyzer.get_atuser_repostuser(total_pq)
    return item
def parse_load(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    yield item
def parse_atuser_uid(self, response):
    """Resolve the uid of the corresponding @-mentioned user."""
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    atuser_nickname = response.meta['atuser_nickname']
    total_pq = analyzer.get_html(response.body, 'script:contains("W_face_radius")')
    atuser_uid = friendcircle.get_user_uid2(atuser_nickname, total_pq)
    self.atuser_dict[atuser_nickname] = atuser_uid
def parse_atuser_uid(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    item['atuser_nickname'] = response.meta['atuser_nickname']
    total_pq = analyzer.get_html(response.body, 'script:contains("W_face_radius")')
    item['atuser_uid'] = friendcircle.get_user_uid2(item['atuser_nickname'], total_pq)
    item['uid'] = response.meta['uid']
    yield item
def parse_load(self, response):
    item = WeibospiderItem()
    # Extract the user's weibo content.
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body, 'script:contains("WB_feed WB_feed_v3")')
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    weibo_analyzer = weibocontent_analyzer()
    (item['repost_nums'], item['comment_nums'],
     item['like_nums']) = weibo_analyzer.get_weibo_relative_args(total_pq)
    yield item
def parse_userinfo(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq1 = analyzer.get_html(response.body, 'script:contains("pf_photo")')
    item['image_urls'] = analyzer.get_userphoto_url(total_pq1)
    # The PCD_text_b userinfo extraction and its try/except fallback are
    # disabled in this variant; see the fuller parse_userinfo above.
    item['uid'] = response.meta['uid']
    return item
def parse_keyword_info(self, response):
    """Extract info from keyword search results."""
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body, 'script:contains("feed_content wbcon")')
    keyword_analyzer = keyword_info_analyzer()
    if total_pq is not None:
        (item['keyword_uid'], item['keyword_alias'], item['keyword_content'],
         item['keyword_publish_time']) = keyword_analyzer.get_keyword_info(total_pq)
        item['keyword'] = response.meta['keyword']
        if item['keyword_uid']:  # Non-empty uid means the page was parsed.
            yield item
def parse_public_userinfo(self, response):
    """Parse profile info for a verified organization account."""
    item = WeibospiderItem()
    analyzer = Analyzer()
    try:
        total_pq1 = analyzer.get_html(response.body, 'script:contains("pf_photo")')
        item['image_urls'] = analyzer.get_userphoto_url(total_pq1)
        total_pq2 = analyzer.get_html(response.body, 'script:contains("PCD_text_b")')
        item['userinfo'] = analyzer.get_public_userinfo(total_pq2)
    except Exception:
        # Fall back to an empty record when the profile page cannot be parsed.
        item['userinfo'] = {}.fromkeys(
            (u'联系人:', u'电话:', u'邮箱:', u'友情链接:'), '')
        item['image_urls'] = None
    return item
def parse_follower(self, response):
    item = WeibospiderItem()
    analyzer = Analyzer()
    getweibopage = GetWeibopage()
    total_follower_pq = analyzer.get_followerhtml(response.body)
    item['uid'] = response.meta['uid']
    item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
    item['follow_uid_list'] = []
    yield item

    if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
        db = OracleStore()
        conn = db.get_connection()
        for follower_uid in item['follower_uid_list']:
            # Users this follower follows.
            sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid)
            cursor1 = db.select_operation(conn, sql1)
            count1 = cursor1.fetchone()
            follow_scraped = count1[0]
            cursor1.close()
            if not follow_scraped:  # Zero rows: not scraped yet.
                for page in range(WeiboSpider.follow_page_num, 0, -1):
                    GetWeibopage.relation_data['page'] = page
                    follow_url = (getinfo.get_follow_mainurl(follower_uid)
                                  + getweibopage.get_relation_paramurl())
                    yield Request(url=follow_url,
                                  meta={'cookiejar': response.meta['cookiejar'],
                                        'uid': follower_uid},
                                  callback=self.parse_follow)
            else:
                print('follow_uid existed!', follower_uid)
                yield None

            # Fans of this follower.
            sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid)
            cursor2 = db.select_operation(conn, sql2)
            count2 = cursor2.fetchone()
            follower_scraped = count2[0]
            cursor2.close()
            if not follower_scraped:  # Zero rows: not scraped yet.
                for page in range(WeiboSpider.follower_page_num, 0, -1):
                    GetWeibopage.relation_data['page'] = page
                    follower_url = (getinfo.get_follower_mainurl(follower_uid)
                                    + getweibopage.get_relation_paramurl())
                    yield Request(url=follower_url,
                                  meta={'cookiejar': response.meta['cookiejar'],
                                        'uid': follower_uid},
                                  callback=self.parse_follower)
            else:
                print('follower_uid existed!', follower_uid)
                yield None
        conn.close()
def parse_based_followernum(self, response):
    analyzer = Analyzer()
    total_follower_pq = analyzer.get_followerhtml(response.body)
    follower_page_num = analyzer.get_relation_pagenum(total_follower_pq)
    if follower_page_num != "" and int(follower_page_num) >= 5:
        # Cap crawling at the first five follower pages.
        for page in range(5, 0, -1):
            GetWeibopage.relation_data['page'] = page
            follower_url = (getinfo.get_follower_mainurl(response.meta['uid'])
                            + WeiboSpider.getweibopage.get_relation_paramurl())
            yield Request(url=follower_url,
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': response.meta['uid']},
                          callback=self.parse_follower)
    elif follower_page_num == "":
        # Page count could not be parsed; fall back to the first fans page.
        follower_url = 'http://weibo.com/%s/fans?page=1' % response.meta['uid']
        yield Request(url=follower_url,
                      meta={'cookiejar': 1,
                            'uid': response.meta['uid']},
                      callback=self.parse_follower)
    else:
        for page in range(int(follower_page_num), 0, -1):
            GetWeibopage.relation_data['page'] = page
            follower_url = (getinfo.get_follower_mainurl(response.meta['uid'])
                            + WeiboSpider.getweibopage.get_relation_paramurl())
            yield Request(url=follower_url,
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': response.meta['uid']},
                          callback=self.parse_follower)
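# The three branches above reduce to one rule: crawl min(page_count, 5) pages,
# falling back to a single page when the count cannot be parsed. A compact
# sketch of that rule (illustrative only; the spider keeps explicit branches
# so the fallback request can use a different cookiejar):
def follower_page_range(raw_page_num, cap=5):
    if raw_page_num == "":
        return [1]
    return list(range(min(int(raw_page_num), cap), 0, -1))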
def _extractCardType9Info(self, card):
    _p_urls = []
    try:
        pics = card['mblog']['pics']
        for _p in pics:
            _p_urls.append(_p['url'])
    except Exception:
        pass  # The post has no pictures.
    _item = WeibospiderItem()
    _item['scheme'] = card['scheme']
    _item['mblog_text'] = card['mblog']['text']
    _item['created_at'] = card['mblog']['created_at']
    _item['user_id'] = card['mblog']['user']['id']
    _item['user_screen_name'] = card['mblog']['user']['screen_name']
    _item['user_gender'] = card['mblog']['user']['gender']
    _item['user_followers_count'] = card['mblog']['user']['followers_count']
    _item['attitudes_count'] = card['mblog']['attitudes_count']
    _item['comments_count'] = card['mblog']['comments_count']
    _item['image_urls'] = _p_urls
    return _item
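# Usage sketch for _extractCardType9Info with a fabricated card (field values
# invented for illustration; in the spider, cards come from the getIndex API):
# sample_card = {
#     'scheme': 'https://m.weibo.cn/status/1',
#     'mblog': {'text': 'hi', 'created_at': '2019-01-01',
#               'attitudes_count': 0, 'comments_count': 0,
#               'user': {'id': 1, 'screen_name': 'demo', 'gender': 'f',
#                        'followers_count': 0}},
# }
# item = spider._extractCardType9Info(sample_card)  # item['image_urls'] == []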
def parseblog(self, response):
    self.log('###### Start parsing the weibo feed page ######', logging.INFO)
    bloglist = response.xpath(
        '//div[@class="WB_cardwrap S_bg2 clearfix"]/div[@action-type="feed_list_item"]')
    for blog in bloglist:
        wbitem = WeibospiderItem()
        userName = blog.xpath(
            './/div[@class="feed_content wbcon"]/a[@class="W_texta W_fb"]/text()'
        ).extract()[0].strip()
        self.log('###### Parsing user ' + userName + ' ######', logging.INFO)
        certifyl = blog.xpath(
            './/div[@class="feed_content wbcon"]/a[2]/@alt').extract()
        certifyStyle = certifyl[0] if certifyl else '无'
        contentall = blog.xpath(
            './/div[@class="feed_content wbcon"]/p[@class="comment_txt"][last()]'
        ).xpath('string(.)').extract()[0].replace('\u200b', '').strip()
        contentlist = contentall.split('|')
        content = contentlist[0]
        blogLocation = contentlist[1] if len(contentlist) > 1 else '无'
        medial = blog.xpath(
            './/div[@class="feed_content wbcon"]//div[@class="media_box"]')
        media = '是' if len(medial) >= 1 else '否'
        transNum = blog.xpath(
            './/div[@class="feed_action clearfix"]//a[@action-type="feed_list_forward"]'
        ).xpath('string(.)').extract()[0].replace('转发', '')
        commentNum = blog.xpath(
            './/div[@class="feed_action clearfix"]//a[@action-type="feed_list_comment"]'
        ).xpath('string(.)').extract()[0].replace('评论', '')
        likesNum = blog.xpath(
            './/div[@class="feed_action clearfix"]//a[@action-type="feed_list_like"]'
        ).xpath('string(.)').extract()[0]
        link = 'https://' + blog.xpath(
            './/div[@class="feed_content wbcon"]/a[@class="W_texta W_fb"]/@href'
        ).extract()[0]
        issueTime = blog.xpath(
            './/div[@class="feed_from W_textb"]/a[1]/@title').extract()[0]
        try:
            terminal = blog.xpath(
                './/div[@class="feed_from W_textb"]/a[2]/text()').extract()[0]
        except IndexError:
            terminal = ''
        wbitem['issueTime'] = issueTime
        wbitem['terminal'] = terminal
        wbitem['userName'] = userName
        wbitem['certifyStyle'] = certifyStyle
        wbitem['content'] = content
        wbitem['blogLocation'] = blogLocation
        wbitem['media'] = media
        wbitem['transNum'] = transNum
        wbitem['commentNum'] = commentNum
        wbitem['likesNum'] = likesNum
        yield scrapy.Request(url=link, meta={'item': wbitem}, callback=self.personparse)
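# The .extract()[0] chains above raise IndexError whenever a node is missing.
# A hedged alternative using Scrapy's extract_first with a default (a sketch,
# not a change to the spider itself):
from scrapy.selector import Selector

def first_text(selector, xpath):
    """Return the first text node matched by xpath, or '' when absent."""
    return selector.xpath(xpath).extract_first(default='').strip()

# first_text(Selector(text='<a class="W_texta W_fb"> demo </a>'),
#            '//a[@class="W_texta W_fb"]/text()') returns 'demo'.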
def parse(self, response):
    base_url = ("https://m.weibo.cn/api/container/getIndex?"
                "containerid=100103type%3D1%26q%3D" + str(self.message)
                + "&page_type=searchall&page=")
    results = json.loads(response.text)
    page = response.meta.get("page")
    keyword = response.meta.get("keyword")
    # Queue the next page.
    next_page = results.get("data").get("cardlistInfo").get("page")
    if page != next_page:
        yield scrapy.Request(url=base_url + str(next_page),
                             headers=self.Referer,
                             meta={"page": next_page, "keyword": keyword})
    result = results.get("data").get("cards")
    # Collect weibo posts.
    for i in result:
        card_type = i.get("card_type")
        show_type = i.get("show_type")
        # Keep only ordinary weibo cards.
        if show_type == 1 and card_type == 9:
            reposts_count = i.get("mblog").get("reposts_count")
            comments_count = i.get("mblog").get("comments_count")
            attitudes_count = i.get("mblog").get("attitudes_count")
            # Skip posts whose reposts, comments, and likes are all zero.
            if reposts_count and comments_count and attitudes_count:
                message_id = i.get("mblog").get("id")
                status_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=0"
                # Queue comment crawling for this post.
                yield scrapy.Request(url=status_url % (message_id, message_id),
                                     callback=self.commentparse,
                                     meta={"keyword": keyword,
                                           "message_id": message_id})
                title = keyword
                if i.get("mblog").get("page_info"):
                    content = i.get("mblog").get("page_info").get("page_title")
                    content1 = i.get("mblog").get("page_info").get("content1")
                    content2 = i.get("mblog").get("page_info").get("content2")
                else:
                    content = ""
                    content1 = ""
                    content2 = ""
                text = i.get("mblog").get("text").encode(encoding="utf-8")
                textLength = i.get("mblog").get("textLength")
                isLongText = i.get("mblog").get("isLongText")
                create_time = i.get("mblog").get("created_at")
                spider_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                user = i.get("mblog").get("user").get("screen_name")
                message_url = i.get("scheme")
                longText = (i.get("mblog").get("longText").get("longTextContent")
                            if isLongText else "")
                weiboitemloader = WeiBoItemLoader(item=WeibospiderItem())
                weiboitemloader.add_value("title", title)
                weiboitemloader.add_value("message_id", message_id)
                weiboitemloader.add_value("content", content)
                weiboitemloader.add_value("content1", content1)
                weiboitemloader.add_value("content2", content2)
                weiboitemloader.add_value("text", text)
                weiboitemloader.add_value("textLength", textLength)
                weiboitemloader.add_value("create_time", create_time)
                weiboitemloader.add_value("spider_time", spider_time)
                weiboitemloader.add_value("user1", user)
                weiboitemloader.add_value("message_url", message_url)
                weiboitemloader.add_value("longText1", longText)
                weiboitemloader.add_value("reposts_count", reposts_count)
                weiboitemloader.add_value("comments_count", comments_count)
                weiboitemloader.add_value("attitudes_count", attitudes_count)
                yield weiboitemloader.load_item()
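# The long .get(...).get(...) chains above raise AttributeError as soon as one
# level is missing (e.g. a card without "mblog"). A small hedged helper that
# keeps the same "None when absent" behaviour all the way down:
def dig(mapping, *keys, default=None):
    """Walk nested dicts; return default when any key is missing."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return default
        current = current.get(key, default)
    return current

# dig(i, "mblog", "page_info", "page_title") would replace the guarded
# page_info block above with a single call.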