Пример #1
0
 def parse_user(self, response):
     """Parse a user-profile JSON response into a ``UserItem`` and crawl onward.

     Yields the populated ``UserItem``, then first-page requests for the
     user's follow list and fan list. Yields nothing when the payload has
     no ``data.userInfo`` object.

     :param response: response whose ``text`` is the JSON payload
     """
     result = json.loads(response.text)
     # ``data`` may be missing or null; treat that the same as "no user"
     # instead of failing on ``None.get``.
     user_info = (result.get('data') or {}).get('userInfo')
     if not user_info:
         return

     # item field -> key inside the ``userInfo`` object
     field_map = {
         'id': 'id',
         'name': 'screen_name',
         'gender': 'gender',
         'description': 'description',
         'fans_count': 'followers_count',
         'follows_count': 'follow_count',
         'weibos_count': 'statuses_count',
         'verified': 'verified',
         'verified_reason': 'verified_reason',
     }
     user_item = UserItem()
     for field, attr in field_map.items():
         user_item[field] = user_info.get(attr)
     yield user_item

     uid = user_info.get('id')
     # Pass the actual uid (not the literal string 'uid') so the callbacks
     # can read the right user id back from ``response.meta``.
     yield scrapy.Request(self.follow_url.format(uid=uid, page=1),
                          callback=self.parse_follows,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
     yield scrapy.Request(self.fan_url.format(uid=uid, page=1),
                          callback=self.parse_fans,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
 def parse_user(self, response):
     """Parse a user-profile JSON response into a ``UserItem`` and crawl onward.

     Yields the populated ``UserItem``, then first-page requests for the
     user's follows, fans and weibo posts. Yields nothing when the payload
     has no ``data.userInfo`` object.

     :param response: response whose ``text`` is the JSON payload
     """
     self.logger.debug(response)
     result = json.loads(response.text)
     # ``data`` may be missing or null; treat that the same as "no user"
     # instead of failing on ``None.get``.
     user_info = (result.get('data') or {}).get('userInfo')
     if not user_info:
         return

     # item field -> key inside the ``userInfo`` object
     field_map = {
         'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
         # was 'gener', which never matched and left gender as None
         'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
         'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
         'verified_reason': 'verified_reason', 'verified_type': 'verified_type',
     }
     user_item = UserItem()
     for field, attr in field_map.items():
         user_item[field] = user_info.get(attr)
     yield user_item

     uid = user_info.get('id')
     # follows
     yield Request(self.follow_url.format(uid=uid, page=1),
                   callback=self.parse_follows, meta={'page': 1, 'uid': uid})
     # fans
     yield Request(self.fans_url.format(uid=uid, page=1),
                   callback=self.parse_fans, meta={'page': 1, 'uid': uid})
     # weibo posts
     yield Request(self.weibo_url.format(uid=uid, page=1),
                   callback=self.parse_weibos, meta={'page': 1, 'uid': uid})
Пример #3
0
    def parse_user(self, response):
        """Turn a user-profile JSON response into a ``UserItem`` plus follow-up requests.

        Yields the item first, then first-page requests for the user's
        follows, weibo posts and fans. Yields nothing when the payload has
        no truthy ``data`` or no truthy ``data.userInfo``.

        :param response: response whose ``text`` is the JSON payload
        """
        payload = json.loads(response.text)
        data = payload.get('data')
        user_info = data.get('userInfo') if data else None
        if not user_info:
            return

        item = UserItem()
        # (item field, key inside userInfo); every key is required here.
        for item_field, source_key in (
                ('id', 'id'),
                ('name', 'screen_name'),
                ('gender', 'gender'),
                ('description', 'description'),
                ('followers_count', 'followers_count'),
                ('follow_count', 'follow_count'),
        ):
            item[item_field] = user_info[source_key]
        yield item

        uid = user_info.get('id')

        # follows
        follows_request = scrapy.Request(
            url=self.follower_url.format(uid=uid, page=1),
            callback=self.parse_follow,
            meta={'uid': uid, 'page': 1},
        )
        yield follows_request

        # weibo posts
        weibo_request = scrapy.Request(
            url=self.weibo_url.format(uid=uid, page=1),
            callback=self.parse_weibo,
            meta={'page': 1, 'uid': uid},
        )
        yield weibo_request

        # fans (paginated by since_id rather than page)
        fans_request = scrapy.Request(
            url=self.fans_url.format(uid=uid, since_id=1),
            callback=self.parse_fans,
            meta={'uid': uid, 'since_id': 1},
        )
        yield fans_request
Пример #4
0
    def parse(self, response):
        """Parse a listing page: one profile request per row, then the next page.

        For every ``<tr>`` that contains a profile link, a partially filled
        ``UserItem`` (name, avatar, page, influence) is attached to a request
        for that profile under ``meta['key']`` with ``parse_user`` as the
        callback. The "下页" (next page) link is followed when present.

        :param response: response for the listing page
        """
        for row in response.xpath('//tr'):
            page = row.xpath('.//a[@class="nk"]/@href').extract_first()
            if not page:
                # Rows without a profile link cannot be followed; building a
                # Request from a missing URL would abort the rest of the page.
                continue

            user_item = UserItem()
            user_item['name'] = row.xpath(
                './/a[@class="nk"]/text()').extract_first()
            user_item['avatar'] = row.xpath(
                './/img[@class="por"]/@src').extract_first()
            user_item['page'] = page
            # Raw string so the regex escape is not treated as a string escape.
            user_item['influence'] = row.xpath('.//td[2]/text()').re_first(
                r'\d+')
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(page),
                                 callback=self.parse_user,
                                 meta={'key': user_item})

        next_href = response.xpath(
            '//*[@id="pagelist"]/form/div/a[text()="下页"]/@href').extract_first()
        if next_href:
            # Only paginate when a next-page link exists; joining a missing
            # href would just produce the current page URL again.
            next_url = response.urljoin(next_href)
            self.logger.debug('next page: %s', next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
Пример #5
0
    def parse_user_info(self, response):
        """
        Parse a user-info API response and schedule the follow-up crawls.

        Builds a ``UserItem`` from ``data.userInfo``. For users with more
        than 50 fans and more than 30 tweets it requests the extended profile
        (the item travels in ``meta['item']``) and, when the second tab is
        titled '微博', the first page of tweets. The follows and fans lists
        are always requested.

        Logs and yields nothing when the body is not valid JSON or when the
        payload's ``ok`` flag is missing or falsy.

        :param response: response whose ``meta['user_id']`` identifies the user
        """
        try:
            response_data = json.loads(response.text)
        except JSONDecodeError:
            self.log('获取用户信息失败 URL:{}'.format(response.url),
                     level=logging.INFO)
            return
        # A missing ``ok`` flag is treated like ``ok == 0`` instead of
        # raising KeyError.
        if not response_data.get('ok'):
            self.log('获取用户信息失败 URL:{}'.format(response.url),
                     level=logging.INFO)
            return

        user_id = response.meta['user_id']
        user_info = response_data['data']['userInfo']

        user_item = UserItem()
        user_item['_id'] = user_info['id']
        user_item['nick_name'] = user_info['screen_name']
        user_item['brief_introduction'] = user_info['description']
        user_item['fans_num'] = user_info['followers_count']
        user_item['follows_num'] = user_info['follow_count']
        user_item['tweets_num'] = user_info['statuses_count']
        # 'm' / 'f' map to male / female; anything else becomes "other".
        user_item['gender'] = {'m': '男', 'f': '女'}.get(
            user_info['gender'], '其他')
        user_item['verified'] = user_info['verified']
        if user_info['verified']:
            if user_info['verified_type'] == 1:
                user_item['verified_type'] = '个人认证'
            else:
                user_item['verified_type'] = '企业认证'
            if user_info['verified_type_ext'] == 0:
                user_item['verified_type_ext'] = '黄色V'
            else:
                user_item['verified_type_ext'] = '橙色V'
        user_item['user_level'] = user_info['urank']
        user_item['vip_level'] = user_info['mbrank']

        # ``tabs`` is handled both as a list and as a mapping keyed by the
        # stringified index ('0', '1', ...).
        tabs = response_data['data']['tabsInfo']['tabs']
        tabs_is_list = isinstance(tabs, list)
        containerid = (tabs[0] if tabs_is_list else tabs['0'])['containerid']

        if int(user_item['fans_num']) > 50 and int(
                user_item['tweets_num']) > 30:
            yield Request(
                url=
                'https://m.weibo.cn/api/container/getIndex?containerid={}_-_INFO'
                .format(containerid),
                callback=self.parse_further_user_info,
                meta={
                    'item': user_item,
                    'user_id': user_id
                })

            # A missing second tab is skipped rather than raising, so the
            # follows/fans requests below are still issued.
            if tabs_is_list:
                tweet_tab = tabs[1] if len(tabs) > 1 else None
            else:
                tweet_tab = tabs.get('1')
            if tweet_tab and tweet_tab['title'] == '微博':
                containerid = tweet_tab['containerid']
                headers = DEFAULT_REQUEST_HEADERS.copy()
                headers['Referer'] = "https://m.weibo.cn/u/{}".format(
                    user_id)
                yield Request(url=self.tweet_url.format(
                    user_id=user_id, containerid=containerid, page=1),
                              headers=headers,
                              callback=self.parse_tweet,
                              meta={
                                  'is_first': True,
                                  'user_id': user_id
                              })

        yield Request(url=self.follows_url.format(user_id=user_id, page=1),
                      callback=self.parse_follows,
                      meta={'user_id': user_id})

        # NOTE(review): the fans list is handled by ``parse_follows`` as well;
        # confirm this is intentional and not a copy/paste slip.
        yield Request(url=self.fans_url.format(user_id=user_id, since_id=1),
                      callback=self.parse_follows,
                      meta={'user_id': user_id})
Пример #6
0
    def parse_user(self, response):
        """
        Parse user information.

        Yields a populated ``UserItem``, then first-page requests for the
        user's follows, fans and weibo posts. Yields nothing when the
        payload has no ``data.userInfo`` object.

        :param response: Response object
        """
        result = json.loads(response.text)
        # ``data`` may be missing or null; the profile lives under the
        # camel-cased ``userInfo`` key.
        user_info = (result.get('data') or {}).get('userInfo')
        if not user_info:
            return

        user_item = UserItem()
        # item field -> key inside the ``userInfo`` object
        field_map = {
            'id': 'id',
            'name': 'screen_name',
            'avatar': 'profile_image_url',
            'cover': 'cover_image_phone',
            'gender': 'gender',
            'description': 'description',
            'fans_count': 'followers_count',
            'follows_count': 'follow_count',
            'weibos_count': 'statuses_count',
            'verified': 'verified',
            'verified_reason': 'verified_reason',
            'verified_type': 'verified_type'
        }
        for field, attr in field_map.items():
            user_item[field] = user_info.get(attr)
        yield user_item

        uid = user_info.get('id')
        # follows
        yield Request(self.follow_url.format(uid=uid, page=1),
                      callback=self.parse_follows,
                      meta={
                          'page': 1,
                          'uid': uid
                      })
        # fans
        yield Request(self.fan_url.format(uid=uid, page=1),
                      callback=self.parse_fans,
                      meta={
                          'page': 1,
                          'uid': uid
                      })
        # weibo posts
        yield Request(self.weibo_url.format(uid=uid, page=1),
                      callback=self.parse_weibos,
                      meta={
                          'page': 1,
                          'uid': uid
                      })

    def parse_follows(self, response):
        """
        Parse one page of a user's follow list.

        Yields a profile request for every listed user, then a
        ``UserRelationItem`` holding this page's follows, then the request
        for the next page. Yields nothing when ``ok`` is falsy or the last
        card has no ``card_group``.

        :param response: Response object; ``meta`` carries ``uid`` and ``page``
        """
        result = json.loads(response.text)
        cards = (result.get('data') or {}).get('cards')
        if not (result.get('ok') and cards
                and cards[-1].get('card_group')):
            return

        # Entries without a 'user' object are skipped for both the profile
        # requests and the relation list.
        users = [
            entry.get('user') for entry in cards[-1].get('card_group')
            if entry.get('user')
        ]
        for user in users:
            yield Request(self.user_url.format(uid=user.get('id')),
                          callback=self.parse_user)

        uid = response.meta.get('uid')
        # follow list for this page
        user_relation_item = UserRelationItem()
        user_relation_item['id'] = uid
        user_relation_item['follows'] = [{
            'id': user.get('id'),
            'name': user.get('screen_name')
        } for user in users]
        user_relation_item['fans'] = []
        yield user_relation_item

        # next page of follows
        page = response.meta.get('page') + 1
        yield Request(self.follow_url.format(uid=uid, page=page),
                      callback=self.parse_follows,
                      meta={
                          'page': page,
                          'uid': uid
                      })

    def parse_fans(self, response):
        """
        Parse one page of a user's fan list.

        Yields a profile request for every listed user, then a
        ``UserRelationItem`` holding this page's fans, then the request for
        the next page. Yields nothing when ``ok`` is falsy or the last card
        has no ``card_group``.

        :param response: Response object; ``meta`` carries ``uid`` and ``page``
        """
        result = json.loads(response.text)
        cards = (result.get('data') or {}).get('cards')
        if not (result.get('ok') and cards
                and cards[-1].get('card_group')):
            return

        # Entries without a 'user' object are skipped for both the profile
        # requests and the relation list.
        users = [
            entry.get('user') for entry in cards[-1].get('card_group')
            if entry.get('user')
        ]
        for user in users:
            yield Request(self.user_url.format(uid=user.get('id')),
                          callback=self.parse_user)

        uid = response.meta.get('uid')
        # fan list for this page
        user_relation_item = UserRelationItem()
        user_relation_item['id'] = uid
        user_relation_item['fans'] = [{
            'id': user.get('id'),
            'name': user.get('screen_name')
        } for user in users]
        user_relation_item['follows'] = []
        yield user_relation_item

        # next page of fans
        page = response.meta.get('page') + 1
        yield Request(self.fan_url.format(uid=uid, page=page),
                      callback=self.parse_fans,
                      meta={
                          'page': page,
                          'uid': uid
                      })

    def parse_weibos(self, response):
        """
        Parse one page of a user's weibo posts.

        Yields a ``WeiboItem`` for every card that carries an ``mblog``,
        then the request for the next page. Yields nothing when ``ok`` is
        falsy or there are no cards.

        :param response: Response object; ``meta`` carries ``uid`` and ``page``
        """
        result = json.loads(response.text)
        cards = (result.get('data') or {}).get('cards')
        if not (result.get('ok') and cards):
            return

        uid = response.meta.get('uid')
        # item field -> key inside the ``mblog`` object
        field_map = {
            'id': 'id',
            'attitudes_count': 'attitudes_count',
            'comments_count': 'comments_count',
            'reposts_count': 'reposts_count',
            'picture': 'original_pic',
            'pictures': 'pics',
            'created_at': 'created_at',
            'source': 'source',
            'text': 'text',
            'raw_text': 'raw_text',
            'thumbnail': 'thumbnail_pic',
        }
        for card in cards:
            mblog = card.get('mblog')
            if not mblog:
                continue
            weibo_item = WeiboItem()
            for field, attr in field_map.items():
                weibo_item[field] = mblog.get(attr)
            weibo_item['user'] = uid
            yield weibo_item

        # next page of weibo posts
        page = response.meta.get('page') + 1
        yield Request(self.weibo_url.format(uid=uid, page=page),
                      callback=self.parse_weibos,
                      meta={
                          'uid': uid,
                          'page': page
                      })
Пример #7
0
    def parse_user(self, response):
        """
        Parse user information.

        Yields a ``UserItem`` for a user that is not yet in
        ``self.ongoing_ID`` and passes ``self.validify_user``. When the user
        has more than three times as many followers as follows, it also
        yields first-page requests for the follow and fan lists. Users whose
        id is already in ``self.finish_ID`` are skipped entirely, as are
        payloads without a ``data.userInfo`` object.

        The in-memory ``ongoing_ID`` / ``finish_ID`` sets and their backing
        collections are updated together.

        :param response: Response object
        """
        self.logger.debug(response)
        result = json.loads(response.text)
        # ``data`` may be missing or null; treat that the same as "no user"
        # instead of failing on ``None.get``.
        user_info = (result.get('data') or {}).get('userInfo')
        if not user_info:
            return

        uid = user_info.get('id')
        uid_key = str(uid)  # the tracking sets store ids as strings

        # Already fully processed: nothing more to do.
        if uid_key in self.finish_ID:
            return

        self.logger.info("In uid {}: ongoing: {}".format(
            uid, uid_key in self.ongoing_ID))
        self.logger.info(response)

        if uid_key not in self.ongoing_ID and self.validify_user(user_info):
            self.logger.info("Create Item !!!\nIn uid {}".format(uid))

            user_item = UserItem()
            # item field -> key inside the ``userInfo`` object
            field_map = {
                'id': 'id',
                'name': 'screen_name',
                'avatar': 'profile_image_url',
                'cover': 'cover_image_phone',
                'gender': 'gender',
                'description': 'description',
                'fans_count': 'followers_count',
                'follows_count': 'follow_count',
                'weibos_count': 'statuses_count',
                'verified': 'verified',
                'verified_reason': 'verified_reason',
                'verified_type': 'verified_type'
            }
            for field, attr in field_map.items():
                user_item[field] = user_info.get(attr)

            self.output_info = True

            # Start processing: record the uid as ongoing.
            self.ongoing_ID.add(uid_key)
            self.ongoing_collection.insert_one({'oid': uid_key})

            yield user_item

        # Progress log, emitted at most once per newly created item and only
        # when either counter is even.
        if (len(self.ongoing_ID) % 2 == 0
                or len(self.finish_ID) % 2 == 0) and self.output_info:
            self.output_info = False
            self.logger.info('ongoing: {}'.format(len(self.ongoing_ID)))
            self.logger.info('finish: {}'.format(len(self.finish_ID)))

        followers = user_info.get('followers_count')
        follows = user_info.get('follow_count')
        # Only expand the social graph for users with more than three times
        # as many followers as follows.
        if (followers is not None and follows is not None
                and followers > 3 * follows):
            # follows
            self.logger.info("parsing followings of {}".format(uid))
            yield Request(self.follow_url.format(uid=uid, page=1),
                          callback=self.parse_follows,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
            # fans
            self.logger.info("parsing fans of {}".format(uid))
            yield Request(self.fan_url.format(uid=uid, page=1),
                          callback=self.parse_fans,
                          meta={
                              'page': 1,
                              'uid': uid
                          })

        # Finish processing: move the uid from ongoing to finished.
        if uid_key in self.ongoing_ID:
            self.ongoing_ID.discard(uid_key)
            self.ongoing_collection.delete_many({'oid': uid_key})
            self.finish_ID.add(uid_key)
            self.finish_collection.insert_one({'fid': uid_key})