Пример #1
0
    def getPrivateUserInfo(self, uid):
        """Fetch and store the partial profile of a user whose page is private.

        The regular profile page is not used here; the fields are read from
        the ``GetUserFace.ashx`` JSONP service instead.

        Args:
            uid: User id as a string (it is concatenated into URLs and log
                messages).

        Returns:
            The saved ``ListenUser``, or ``None`` when the service request
            fails or its reply is not a JSONP-wrapped JSON object.

        Raises:
            Exception: If saving the user fails; the original error is
                attached as ``__cause__``.
        """
        user = ListenUser(Utils.userHost + '/u/' + uid + '/')
        # Private user: the information has to be obtained another way.
        user.isPrivate = 1

        # The JSONP service below exposes part of the profile.
        # URL-encoded: http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23%20%E4%B8%8B%E5%8D%888:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
        # URL-decoded: http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
        # Take one clock reading so the AM/PM marker always matches the
        # formatted hour.
        now = datetime.datetime.now()
        half = '上午' if now.hour < 12 else '下午'
        # Millisecond epoch timestamp (13 digits), as in the sample URL.
        timeStamp = str(int(time.time() * 1000))
        queryParams = {
            'ver': now.strftime('%Y/%m/%d {half}%I:%M:%S').format(half=half),
            'userId': uid,
            'callback': 'jQuery17202552129787287378_' + timeStamp,
            '_': timeStamp,
        }
        info_url = Utils.urlCreate(Utils.userHost + '/service/GetUserFace.ashx?', queryParams)

        try:
            infoRespond = requests.get(info_url, headers=Utils.headers)
        except Exception:
            self.logger.error('隐私用户信息获取失败: ' + uid)
            return None

        # The reply is JSONP, i.e. ``callback({...})``: keep what lies between
        # the first '(' and the last ')' so a trailing ';' or newline is
        # tolerated.
        body = infoRespond.text
        start, end = body.find('('), body.rfind(')')
        info = None
        if 0 <= start < end:
            try:
                info = json.loads(body[start + 1:end])
            except ValueError:
                info = None
        if not isinstance(info, dict):
            self.logger.error('隐私用户信息获取失败: ' + uid)
            return None

        user.name = info.get('UserName', '')
        # The first and last characters of NickName are dropped, as before.
        user.nickName = info['NickName'][1:-1] if 'NickName' in info else ''
        user.signature = info.get('UserSign', '')
        user.city = info.get('city', '')
        user.signinLast = info.get('PunchCount', '')
        # Look Gender up under its own key (it used to be guarded by
        # 'PunchCount', which could raise KeyError or drop the value).
        gender = info.get('Gender', '')
        if gender in ('1', '0'):
            user.gender = '男' if gender == '1' else '女'

        try:
            user.save(self.mysql_session)
        except Exception as exc:
            self.logger.error('存储隐私用户信息失败')
            raise Exception('存储隐私用户信息失败') from exc

        return user
Пример #2
0
    def getUserInfo(self, uid, frequentAdd=1, frequentReduce=2):
        """Fetch a user's profile page, extract the profile and store it.

        A non-200 answer is interpreted through the hint carried in the
        ``Location`` header's query string: a hint starting with '用户' means
        the page is private and the JSONP service is used instead; anything
        else is counted as a failed visit and the uid is queued for a retry
        or given up on.

        Args:
            uid: User id as a string (it is concatenated into URLs and log
                messages).
            frequentAdd: Amount added to ``self.tooFrequent`` on a failed
                visit once it is already non-zero.
            frequentReduce: Amount subtracted from ``self.tooFrequent`` after
                a successful visit (never going below 0).

        Returns:
            The saved ``ListenUser``, or ``None`` when the page or the private
            profile could not be fetched or parsed, or the visit was counted
            as failed.

        Raises:
            Exception: If saving the user fails; the original error is
                attached as ``__cause__``.
        """
        full_url = Utils.userHost + '/u/' + uid + '/'
        user = ListenUser(full_url)

        try:
            content = self.session.get(full_url,
                                       headers=Utils.headers,
                                       allow_redirects=False)
        except Exception:
            self.logger.error('获取用户页面失败: ' + full_url)
            return None

        # Some users make their page private; outsiders then get a 302
        # redirect to an error page.
        if content.status_code != 200:
            self.logger.warning(full_url + ': redirect ' +
                                str(content.status_code))
            # A missing Location header or a query without '=' yields an empty
            # hint, which is handled as a failed visit instead of crashing.
            location = content.headers.get('Location', '')
            responseText = urlparse(unquote(location)).query.partition('=')[2]
            self.logger.warning('提示: ' + responseText)

            if responseText.startswith('用户'):
                return self._storePrivateUser(user, uid, frequentReduce)

            self._recordFailedVisit(uid, frequentAdd)
            return None

        # This answer was not a "too frequent" one, so the back-off shrinks.
        self._easeThrottle(frequentReduce)

        try:
            soup = BeautifulSoup(content.text, "lxml")
        except Exception:
            self.logger.error('解析用户页面失败: ' + full_url)
            return None

        self._readCountStats(user, soup)
        self._readProfileList(user, soup)

        # Self introduction: the element right before the "report" span.
        selfIntroPre = soup.find(id='user_Profile_span_reportIt')
        selfIntro = None
        if selfIntroPre:
            selfIntro = selfIntroPre.find_previous_sibling()
        if selfIntro and selfIntro.name == 'div':
            user.selfIntroduction = selfIntro.get_text(strip=True)

        # City: this part of the page sits inside an HTML comment, so bs4
        # cannot find it and a regex over the raw text is used instead.
        cityMatch = re.search(
            r'<li id="user_Profile_span_city.*?<span>(.*?)</span></li>',
            content.text, re.S)
        if cityMatch:
            user.city = cityMatch.group(1)

        # User name: the heading text once its link and span are removed,
        # minus its last five characters.
        userNameHtml = soup.find(id='cont_h1')
        userNameHtml.a.replace_with('')
        userNameHtml.span.replace_with('')
        user.name = userNameHtml.get_text(strip=True)[0:-5].strip()

        try:
            user.save(self.mysql_session)
        except Exception as exc:
            self.logger.error('存储用户信息失败')
            raise Exception('存储用户信息失败') from exc

        self.userUids.add(uid)
        self.logger.debug(user)
        return user

    def _easeThrottle(self, frequentReduce):
        """Lower ``self.tooFrequent`` by *frequentReduce*, clamped at 0."""
        if self.tooFrequent > 0:
            self.tooFrequent -= frequentReduce
        self.tooFrequent = max(self.tooFrequent, 0)

    def _recordFailedVisit(self, uid, frequentAdd):
        """Count a failed visit of *uid*, queue a retry or give up, and back off."""
        # Consecutive failures are counted per uid.
        if self.lastUserVisitInfo[0] == uid:
            self.lastUserVisitInfo[1] += 1
        else:
            self.lastUserVisitInfo[0] = uid
            self.lastUserVisitInfo[1] = 0

        if self.lastUserVisitInfo[1] <= self.failedToVisitCountLimit:
            # Still within the limit: do not mark the uid as visited, put it
            # into the priority queue so it is retried.
            self.appendUidPriority(uid)
        else:
            # Record the uid as failed and mark it visited to avoid another
            # attempt.
            self.failedToVisit.append(uid)
            self.userUids.add(uid)

        # On the first failure the back-off jumps quickly; afterwards it
        # grows slowly.
        if self.tooFrequent == 0:
            self.tooFrequent = 4
        else:
            self.tooFrequent += frequentAdd

    def _storePrivateUser(self, user, uid, frequentReduce):
        """Fill *user* from the GetUserFace JSONP service, save and return it.

        Returns ``None`` when the service request fails or the reply is not a
        JSONP-wrapped JSON object. Raises ``Exception`` if saving fails.
        """
        # The JSONP service below exposes part of a private user's profile.
        # URL-encoded: http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23%20%E4%B8%8B%E5%8D%888:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
        # URL-decoded: http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
        # Take one clock reading so the AM/PM marker always matches the
        # formatted hour.
        now = datetime.datetime.now()
        half = '上午' if now.hour < 12 else '下午'
        # Millisecond epoch timestamp (13 digits), as in the sample URL.
        timeStamp = str(int(time.time() * 1000))
        queryParams = {
            'ver': now.strftime('%Y/%m/%d {half}%I:%M:%S').format(half=half),
            'userId': uid,
            'callback': 'jQuery17202552129787287378_' + timeStamp,
            '_': timeStamp,
        }
        info_url = Utils.urlCreate(
            Utils.userHost + '/service/GetUserFace.ashx?', queryParams)

        try:
            infoRespond = requests.get(info_url, headers=Utils.headers)
        except Exception:
            self.logger.error('隐私用户信息获取失败: ' + uid)
            return None

        # The reply is JSONP, i.e. ``callback({...})``: keep what lies between
        # the first '(' and the last ')' so a trailing ';' or newline is
        # tolerated.
        body = infoRespond.text
        start, end = body.find('('), body.rfind(')')
        info = None
        if 0 <= start < end:
            try:
                info = json.loads(body[start + 1:end])
            except ValueError:
                info = None
        if not isinstance(info, dict):
            self.logger.error('隐私用户信息获取失败: ' + uid)
            return None

        user.name = info.get('UserName', '')
        # The first and last characters of NickName are dropped, as before.
        user.nickName = info['NickName'][1:-1] if 'NickName' in info else ''
        user.signature = info.get('UserSign', '')
        user.city = info.get('city', '')
        user.signinLast = info.get('PunchCount', '')
        # Look Gender up under its own key (it used to be guarded by
        # 'PunchCount', which could raise KeyError or drop the value).
        gender = info.get('Gender', '')
        if gender in ('1', '0'):
            user.gender = '男' if gender == '1' else '女'

        try:
            user.save(self.mysql_session)
        except Exception as exc:
            self.logger.error('存储隐私用户信息失败')
            raise Exception('存储隐私用户信息失败') from exc

        # A private user still counts as visited.
        self.privateUids += 1
        self.userUids.add(uid)
        self._easeThrottle(frequentReduce)

        self.logger.debug(user)
        return user

    def _readCountStats(self, user, soup):
        """Copy the counters of the 'LeftCnt_divUserCount' box onto *user*."""
        countList = soup.find(attrs={'id': 'LeftCnt_divUserCount'})
        if countList is None:
            return

        # Visitor count: the text sits directly in the list item.
        viewCount = countList.find(attrs={'id': 'li_viewCount'})
        if viewCount is not None and len(viewCount) != 0:
            user.viewCount = viewCount.string

        # The remaining counters keep their text inside an <a> element:
        # messages, short posts, blog entries, dictations, speaking, gifts.
        linkedCounters = (
            ('li_msgCount', 'msgCount'),
            ('li_ingCount', 'ingCount'),
            ('li_blogCount', 'blogCount'),
            ('li_listenCount', 'listenCount'),
            ('li_talkCount', 'talkCount'),
            ('li_giftCount', 'giftCount'),
        )
        for elementId, attrName in linkedCounters:
            item = countList.find(attrs={'id': elementId})
            if item is None or len(item) == 0:
                continue
            link = item.find('a')
            # An item without a link is skipped instead of raising.
            if link is not None:
                setattr(user, attrName, link.string)

    def _readProfileList(self, user, soup):
        """Copy the fields of the '#u_profile' list onto *user*."""
        profileBox = soup.find(id='u_profile')
        # The container itself may be missing, not only its <ul>.
        profileList = profileBox.find('ul') if profileBox is not None else None
        if profileList is None:
            return

        for child in profileList.children:
            if child.name != 'li':
                continue

            # The label text is captured once, before any span is removed.
            text = child.get_text(strip=True)

            # Gender
            if '性别' in text:
                user.gender = child.find_all('span')[1].string

            # City
            if '城市' in text:
                user.city = child.find_all('span')[1].string

            # Nickname: the item text without its first span.
            if '昵称' in text:
                child.span.replace_with('')
                user.nickName = child.get_text(strip=True)

            # Signature: the item text without its first span.
            if '签名' in text:
                child.span.replace_with('')
                user.signature = child.get_text(strip=True)

            # Account age: the date is taken from the second span's title,
            # skipping its first five characters.
            if '沪龄' in text:
                user.registDate = child.find_all('span')[1]['title'][5:]

            # Check-ins: the number followed by one trailing character.
            if '打卡' in text:
                child.span.replace_with('')
                user.signinLast = int(child.get_text(strip=True)[0:-1])

            # Last login
            if '登录' in text:
                user.lastSignin = child.find_all('span')[1].string