def parse_user(self, response):
    """Parse a user-profile API response.

    Yields a ``UserItem`` filled from the ``data.userInfo`` object of the
    JSON payload, followed by requests for the first page of that user's
    follow list and fan list.  Yields nothing when the payload carries no
    ``userInfo``.

    :param response: response object whose ``text`` is the JSON payload
    """
    result = json.loads(response.text)
    # ``data`` can be missing or null; treat that the same as "no userInfo"
    # instead of failing on ``None.get``.
    user_info = (result.get('data') or {}).get('userInfo')
    if not user_info:
        return

    # item field -> key in the userInfo object
    field_map = {
        'id': 'id',
        'name': 'screen_name',
        'gender': 'gender',
        'description': 'description',
        'fans_count': 'followers_count',
        'follows_count': 'follow_count',
        'weibos_count': 'statuses_count',
        'verified': 'verified',
        'verified_reason': 'verified_reason',
    }
    user_item = UserItem()
    for field, attr in field_map.items():
        user_item[field] = user_info.get(attr)
    yield user_item

    uid = user_info.get('id')
    # Hand the actual user id to the callbacks; the literal string 'uid'
    # was being passed before.
    yield scrapy.Request(self.follow_url.format(uid=uid, page=1),
                         callback=self.parse_follows,
                         meta={'page': 1, 'uid': uid})
    yield scrapy.Request(self.fan_url.format(uid=uid, page=1),
                         callback=self.parse_fans,
                         meta={'page': 1, 'uid': uid})
def parse_user(self, response):
    """Parse a user-profile API response.

    Yields a ``UserItem`` filled from the ``data.userInfo`` object of the
    JSON payload, followed by requests for the first page of that user's
    follow list, fan list and weibo list.  Yields nothing when the payload
    carries no ``userInfo``.

    :param response: response object whose ``text`` is the JSON payload
    """
    self.logger.debug(response)
    result = json.loads(response.text)
    # ``data`` can be missing or null; treat that the same as "no userInfo"
    # instead of failing on ``None.get``.
    user_info = (result.get('data') or {}).get('userInfo')
    if not user_info:
        return

    # item field -> key in the userInfo object
    field_map = {
        'id': 'id',
        'name': 'screen_name',
        'avatar': 'profile_image_url',
        'cover': 'cover_image_phone',
        'gender': 'gender',  # was misspelled 'gener', so gender was always None
        'description': 'description',
        'fans_count': 'followers_count',
        'follows_count': 'follow_count',
        'weibos_count': 'statuses_count',
        'verified': 'verified',
        'verified_reason': 'verified_reason',
        'verified_type': 'verified_type',
    }
    user_item = UserItem()
    for field, attr in field_map.items():
        user_item[field] = user_info.get(attr)
    yield user_item

    uid = user_info.get('id')
    # Follows
    yield Request(self.follow_url.format(uid=uid, page=1),
                  callback=self.parse_follows,
                  meta={'page': 1, 'uid': uid})
    # Fans
    yield Request(self.fans_url.format(uid=uid, page=1),
                  callback=self.parse_fans,
                  meta={'page': 1, 'uid': uid})
    # Weibos
    yield Request(self.weibo_url.format(uid=uid, page=1),
                  callback=self.parse_weibos,
                  meta={'page': 1, 'uid': uid})
def parse_user(self, response):
    """Parse a user-profile API response.

    When the JSON payload has a non-empty ``data.userInfo`` object, yields a
    ``UserItem`` built from it and then three first-page requests (follow
    list, weibo list, fan list).  Every mapped key is read with ``[]``, so a
    ``userInfo`` object lacking one of them raises ``KeyError``.

    :param response: response object whose ``text`` is the JSON payload
    """
    payload = json.loads(response.text)
    data = payload.get('data')
    if not data:
        return
    user_info = data.get('userInfo')
    if not user_info:
        return

    # (item field, userInfo key) pairs, copied in this order
    profile_fields = (
        ('id', 'id'),
        ('name', 'screen_name'),
        ('gender', 'gender'),
        ('description', 'description'),
        ('followers_count', 'followers_count'),
        ('follow_count', 'follow_count'),
    )
    item = UserItem()
    for item_key, source_key in profile_fields:
        item[item_key] = user_info[source_key]
    yield item

    uid = user_info.get('id')

    # Follows
    follow_page = self.follower_url.format(uid=uid, page=1)
    yield scrapy.Request(url=follow_page,
                         callback=self.parse_follow,
                         meta={'uid': uid, 'page': 1})

    # Weibos
    weibo_page = self.weibo_url.format(uid=uid, page=1)
    yield scrapy.Request(url=weibo_page,
                         callback=self.parse_weibo,
                         meta={'page': 1, 'uid': uid})

    # Fans (paged by since_id rather than page)
    fans_page = self.fans_url.format(uid=uid, since_id=1)
    yield scrapy.Request(url=fans_page,
                         callback=self.parse_fans,
                         meta={'uid': uid, 'since_id': 1})
def parse(self, response):
    """Parse a listing page: one table row per user plus a next-page link.

    For every ``<tr>`` that carries a profile link, builds a partially
    filled ``UserItem`` (``name``, ``avatar``, ``page``, ``influence``) and
    requests the profile page, handing the item to ``parse_user`` through
    ``meta['key']``.  Afterwards follows the "下页" (next page) link when the
    page has one.

    :param response: response object for the listing page
    """
    for row in response.xpath('//tr'):
        user_item = UserItem()
        user_item['name'] = row.xpath(
            './/a[@class="nk"]/text()').extract_first()
        user_item['avatar'] = row.xpath(
            './/img[@class="por"]/@src').extract_first()
        user_item['page'] = row.xpath(
            './/a[@class="nk"]/@href').extract_first()
        user_item['influence'] = row.xpath('.//td[2]/text()').re_first(
            r'\d+')

        page_url = user_item['page']
        if not page_url:
            # '//tr' also matches rows without a profile link (presumably
            # header/layout rows); a Request cannot be built from url=None.
            continue
        yield scrapy.Request(url=page_url,
                             callback=self.parse_user,
                             meta={'key': user_item})

    next_href = response.xpath(
        '//*[@id="pagelist"]/form/div/a[text()="下页"]/@href').extract_first()
    if next_href:
        # Only paginate when a next-page link exists; urljoin(None) would
        # just hand back the current page URL.
        next_url = response.urljoin(next_href)
        self.logger.debug(next_url)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse_user_info(self, response):
    """Parse the user-info API response and schedule follow-up requests.

    Builds a ``UserItem`` from ``data.userInfo``.  The item itself is not
    yielded here: for users with more than 50 fans and more than 30 tweets
    it is handed to ``parse_further_user_info`` through ``meta['item']``,
    and the first tweet page is requested when the second tab is titled
    '微博'.  The first page of the follow list and of the fan list is
    requested for every successfully parsed user.

    NOTE(review): the nesting of the trailing requests was reconstructed
    from a copy whose indentation was lost -- confirm that the follow/fan
    requests are meant to be unconditional.

    :param response: response whose ``text`` is the JSON payload and whose
        ``meta['user_id']`` identifies the requested user
    """
    try:
        response_data = json.loads(response.text)
    except JSONDecodeError:
        self.log('获取用户信息失败 URL:{}'.format(response.url),
                 level=logging.INFO)
        return
    # A missing ``ok`` flag is handled like ``ok == 0`` instead of raising
    # KeyError.
    if not response_data.get('ok'):
        self.log('获取用户信息失败 URL:{}'.format(response.url),
                 level=logging.INFO)
        return

    user_id = response.meta['user_id']
    data = response_data['data']
    user_info = data['userInfo']

    user_item = UserItem()
    user_item['_id'] = user_info['id']
    user_item['nick_name'] = user_info['screen_name']
    user_item['brief_introduction'] = user_info['description']
    user_item['fans_num'] = user_info['followers_count']
    user_item['follows_num'] = user_info['follow_count']
    user_item['tweets_num'] = user_info['statuses_count']

    # 'm' / 'f' map to fixed labels; anything else is recorded as '其他'.
    if user_info['gender'] == 'm':
        user_item['gender'] = '男'
    elif user_info['gender'] == 'f':
        user_item['gender'] = '女'
    else:
        user_item['gender'] = '其他'

    user_item['verified'] = user_info['verified']
    if user_info['verified']:
        if user_info['verified_type'] == 1:
            user_item['verified_type'] = '个人认证'
        else:
            user_item['verified_type'] = '企业认证'
        if user_info['verified_type_ext'] == 0:
            user_item['verified_type_ext'] = '黄色V'
        else:
            user_item['verified_type_ext'] = '橙色V'
    user_item['user_level'] = user_info['urank']
    user_item['vip_level'] = user_info['mbrank']

    # ``tabs`` is handled both as a list and as a mapping keyed by the
    # stringified index ('0', '1', ...).
    tabs = data['tabsInfo']['tabs']
    tabs_is_list = isinstance(tabs, list)
    first_tab = tabs[0] if tabs_is_list else tabs['0']
    containerid = first_tab['containerid']

    if int(user_item['fans_num']) > 50 and int(user_item['tweets_num']) > 30:
        yield Request(
            url='https://m.weibo.cn/api/container/getIndex?containerid={}_-_INFO'
            .format(containerid),
            callback=self.parse_further_user_info,
            meta={'item': user_item, 'user_id': user_id})

        tweet_tab = tabs[1] if tabs_is_list else tabs['1']
        if tweet_tab['title'] == '微博':
            containerid = tweet_tab['containerid']
            headers = DEFAULT_REQUEST_HEADERS.copy()
            headers['Referer'] = "https://m.weibo.cn/u/{}".format(user_id)
            yield Request(url=self.tweet_url.format(user_id=user_id,
                                                    containerid=containerid,
                                                    page=1),
                          headers=headers,
                          callback=self.parse_tweet,
                          meta={'is_first': True, 'user_id': user_id})

    yield Request(url=self.follows_url.format(user_id=user_id, page=1),
                  callback=self.parse_follows,
                  meta={'user_id': user_id})
    # NOTE(review): the fan list is routed to parse_follows as well; this
    # looks like a copy-paste slip -- confirm whether a dedicated fans
    # callback should be used.
    yield Request(url=self.fans_url.format(user_id=user_id, since_id=1),
                  callback=self.parse_follows,
                  meta={'user_id': user_id})
def parse_user(self, response):
    """Parse a user-profile API response.

    Yields a ``UserItem`` filled from the ``data.userInfo`` object of the
    JSON payload, followed by requests for the first page of that user's
    follow list, fan list and weibo list.  Yields nothing when the payload
    carries no ``userInfo``.

    :param response: Response object
    """
    result = json.loads(response.text)
    # The key is spelled 'userInfo' (capital I); the lowercase spelling
    # never matched.  ``data`` may also be missing or null.
    user_info = (result.get('data') or {}).get('userInfo')
    if not user_info:
        return

    # item field -> key in the userInfo object
    field_map = {
        'id': 'id',
        'name': 'screen_name',
        'avatar': 'profile_image_url',
        'cover': 'cover_image_phone',
        'gender': 'gender',
        'description': 'description',
        'fans_count': 'followers_count',
        'follows_count': 'follow_count',
        'weibos_count': 'statuses_count',
        'verified': 'verified',
        'verified_reason': 'verified_reason',
        'verified_type': 'verified_type',
    }
    user_item = UserItem()
    for field, attr in field_map.items():
        user_item[field] = user_info.get(attr)
    yield user_item

    uid = user_info.get('id')
    # Follows
    yield Request(self.follow_url.format(uid=uid, page=1),
                  callback=self.parse_follows,
                  meta={'page': 1, 'uid': uid})
    # Fans
    yield Request(self.fan_url.format(uid=uid, page=1),
                  callback=self.parse_fans,
                  meta={'page': 1, 'uid': uid})
    # Weibos
    yield Request(self.weibo_url.format(uid=uid, page=1),
                  callback=self.parse_weibos,
                  meta={'page': 1, 'uid': uid})


def parse_follows(self, response):
    """Parse one page of a user's follow list.

    Reads the ``card_group`` of the last card in ``data.cards``.  Yields a
    profile request for every entry that has a ``user``, then a
    ``UserRelationItem`` listing those users as ``follows`` (with an empty
    ``fans`` list), then a request for the next page.  Yields nothing when
    the payload is not ``ok`` or has no such card group.

    :param response: Response object; ``meta`` carries ``uid`` and ``page``
    """
    result = json.loads(response.text)
    cards = (result.get('data') or {}).get('cards') if result.get('ok') else None
    card_group = cards[-1].get('card_group') if cards else None
    if not card_group:
        return

    # Entries without a 'user' are skipped everywhere, not only in the
    # request loop; the relation list used to fail on them.
    users = [entry.get('user') for entry in card_group if entry.get('user')]
    for user in users:
        yield Request(self.user_url.format(uid=user.get('id')),
                      callback=self.parse_user)

    uid = response.meta.get('uid')
    # Follow list
    user_relation_item = UserRelationItem()
    user_relation_item['id'] = uid
    user_relation_item['follows'] = [
        {'id': user.get('id'), 'name': user.get('screen_name')}
        for user in users
    ]
    user_relation_item['fans'] = []
    yield user_relation_item

    # Next page of follows
    page = response.meta.get('page') + 1
    yield Request(self.follow_url.format(uid=uid, page=page),
                  callback=self.parse_follows,
                  meta={'page': page, 'uid': uid})


def parse_fans(self, response):
    """Parse one page of a user's fan list.

    Reads the ``card_group`` of the last card in ``data.cards``.  Yields a
    profile request for every entry that has a ``user``, then a
    ``UserRelationItem`` listing those users as ``fans`` (with an empty
    ``follows`` list), then a request for the next page.  Yields nothing
    when the payload is not ``ok`` or has no such card group.

    :param response: Response object; ``meta`` carries ``uid`` and ``page``
    """
    result = json.loads(response.text)
    cards = (result.get('data') or {}).get('cards') if result.get('ok') else None
    card_group = cards[-1].get('card_group') if cards else None
    if not card_group:
        return

    # Entries without a 'user' are skipped everywhere, not only in the
    # request loop; the relation list used to fail on them.
    users = [entry.get('user') for entry in card_group if entry.get('user')]
    for user in users:
        yield Request(self.user_url.format(uid=user.get('id')),
                      callback=self.parse_user)

    uid = response.meta.get('uid')
    # Fan list
    user_relation_item = UserRelationItem()
    user_relation_item['id'] = uid
    user_relation_item['fans'] = [
        {'id': user.get('id'), 'name': user.get('screen_name')}
        for user in users
    ]
    user_relation_item['follows'] = []
    yield user_relation_item

    # Next page of fans
    page = response.meta.get('page') + 1
    yield Request(self.fan_url.format(uid=uid, page=page),
                  callback=self.parse_fans,
                  meta={'page': page, 'uid': uid})


def parse_weibos(self, response):
    """Parse one page of a user's weibo list.

    Yields a ``WeiboItem`` for every card in ``data.cards`` that has an
    ``mblog``, then a request for the next page.  Yields nothing when the
    payload is not ``ok`` or has no cards.

    :param response: Response object; ``meta`` carries ``uid`` and ``page``
    """
    result = json.loads(response.text)
    cards = (result.get('data') or {}).get('cards') if result.get('ok') else None
    if not cards:
        return

    # item field -> key in the mblog object
    field_map = {
        'id': 'id',
        'attitudes_count': 'attitudes_count',
        'comments_count': 'comments_count',
        'reposts_count': 'reposts_count',
        'picture': 'original_pic',
        'pictures': 'pics',
        'created_at': 'created_at',
        'source': 'source',
        'text': 'text',
        'raw_text': 'raw_text',
        'thumbnail': 'thumbnail_pic',
    }
    uid = response.meta.get('uid')
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            continue
        weibo_item = WeiboItem()
        for field, attr in field_map.items():
            weibo_item[field] = mblog.get(attr)
        weibo_item['user'] = uid
        yield weibo_item

    # Next page of weibos
    page = response.meta.get('page') + 1
    yield Request(self.weibo_url.format(uid=uid, page=page),
                  callback=self.parse_weibos,
                  meta={'uid': uid, 'page': page})
def parse_user(self, response):
    """Parse a user-profile API response while tracking per-user progress.

    Ids already in ``self.finish_ID`` are skipped.  For a user that is not
    yet in ``self.ongoing_ID`` and passes ``self.validify_user``, a
    ``UserItem`` is yielded and the id is recorded in ``self.ongoing_ID``
    and ``self.ongoing_collection``.  When ``followers_count`` is more than
    three times ``follow_count``, the first page of the follow list and of
    the fan list is requested.  Finally the id is removed from the ongoing
    records and added to ``self.finish_ID`` / ``self.finish_collection``.

    NOTE(review): the nesting of the progress logging, the fan request and
    the final bookkeeping was reconstructed from a copy whose indentation
    was lost -- confirm against the intended control flow.

    :param response: Response object whose ``text`` is the JSON payload
    """
    self.logger.debug(response)
    result = json.loads(response.text)
    # ``data`` can be missing or null; treat that the same as "no userInfo"
    # instead of failing on ``None.get``.
    user_info = (result.get('data') or {}).get('userInfo')
    if not user_info:
        return

    uid = user_info.get('id')
    uid_key = str(uid)  # ids are tracked as strings in the sets/collections

    # Ids already recorded as finished are not processed again.
    if uid_key in self.finish_ID:
        return

    self.logger.info("In uid {}: ongoing: {}".format(
        uid, uid_key in self.ongoing_ID))
    self.logger.info(response)

    if uid_key not in self.ongoing_ID and self.validify_user(user_info):
        self.logger.info("Create Item !!!\nIn uid {}".format(uid))
        # item field -> key in the userInfo object
        field_map = {
            'id': 'id',
            'name': 'screen_name',
            'avatar': 'profile_image_url',
            'cover': 'cover_image_phone',
            'gender': 'gender',
            'description': 'description',
            'fans_count': 'followers_count',
            'follows_count': 'follow_count',
            'weibos_count': 'statuses_count',
            'verified': 'verified',
            'verified_reason': 'verified_reason',
            'verified_type': 'verified_type',
        }
        user_item = UserItem()
        for field, attr in field_map.items():
            user_item[field] = user_info.get(attr)
        self.output_info = True
        # Start of processing: record the id as ongoing.
        self.ongoing_ID.add(uid_key)
        self.ongoing_collection.insert_one({'oid': uid_key})
        yield user_item

    # Progress report, emitted at most once per newly created item
    # (``output_info`` is reset right after logging).
    if (len(self.ongoing_ID) % 2 == 0
            or len(self.finish_ID) % 2 == 0) and self.output_info:
        self.output_info = False
        self.logger.info('ongoing: {}'.format(len(self.ongoing_ID)))
        self.logger.info('finish: {}'.format(len(self.finish_ID)))

    # The weibo-list request (parse_weibos) was commented out in this
    # callback and is intentionally not issued.

    fans_count = user_info.get('followers_count')
    follows_count = user_info.get('follow_count')
    if (fans_count is not None and follows_count is not None
            and fans_count > 3 * follows_count):
        # Follows
        self.logger.info("parsing followings of {}".format(uid))
        yield Request(self.follow_url.format(uid=uid, page=1),
                      callback=self.parse_follows,
                      meta={'page': 1, 'uid': uid})
        # Fans
        self.logger.info("parsing fans of {}".format(uid))
        yield Request(self.fan_url.format(uid=uid, page=1),
                      callback=self.parse_fans,
                      meta={'page': 1, 'uid': uid})

    # End of processing: drop the id from the ongoing records ...
    if uid_key in self.ongoing_ID:
        self.ongoing_ID.discard(uid_key)
        self.ongoing_collection.delete_many({'oid': uid_key})
    # ... and mark it finished so the early return above skips it next time.
    self.finish_ID.add(uid_key)
    self.finish_collection.insert_one({'fid': uid_key})