Пример #1
0
def _saveEvaluationPage(url, html):
    """Store every user evaluation found on one parsed evaluation page.

    ``url`` is the address the page was fetched from; each stored row is keyed
    by ``url + '#' + <position of the entry on the page>``.  Errors (for
    example a missing field, where ``.strip()`` is called on a failed lookup)
    are not handled here and propagate to the caller.
    """
    evaluateBlock = html.findall(
        './/div[@class="User_eval lh180 btn-a f14 fwei mt10"]')
    for index, block in enumerate(evaluateBlock):
        uName = block.findtext('.//span[@class="mr10 fl"]').strip()
        evalAtti = block.findtext('.//span[@class="fl colbd mr10"]').strip()
        evalScore = block.findtext('.//span[@class="colClass01 fl"]').strip()
        evalText = block.findtext('.//div[@class="pt5"]').strip()
        evalTime = block.findtext(
            './/span[@class="colbd f12 db pt10"]').strip()
        dbInfo = (url + '#' + str(index), uName, evalAtti, evalScore,
                  evalText,
                  datetime.strptime(evalTime, '%Y-%m-%d %H:%M:%S'))
        MySQL().saveDoctorEvaluationText(dbInfo)


def getInfo3(url):
    """Scrape the detailed user evaluations of a doctor, starting at ``url``.

    ``url`` is expected to look like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The evaluations on that page are saved, then every following page up to
    the total page count shown on the first fetched page is fetched and saved
    as well.

    Any ``Exception`` raised along the way is reported through ``doExpt``
    together with the URL that was being processed at that moment.
    ``KeyboardInterrupt``/``SystemExit`` are deliberately not swallowed, so the
    crawler can still be stopped.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveEvaluationPage(url, html)
        # Total number of evaluation pages; a page without a pager counts as 1.
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = (1 if totalPageInfo is None
                      else int(totalPageInfo.text.strip()[1:-3]))
        # Index of the current page: the text between 'page=' and the trailing
        # six-character '#name2' anchor.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:-6])
        # Fetch the evaluation pages that come after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose: row ids and the error report
            # below must refer to the page actually being processed.
            url = url[:tmpIndex] + str(pageIndex) + '#name2'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveEvaluationPage(url, html)
    except Exception:
        doExpt('url3', url, 'url3')
Пример #2
0
def getQInfo(url, elem):
    """Extract one question's metadata and body from ``elem`` and save it.

    The tuple passed to ``MySQL().saveQInfo`` is
    ``(url, qTitle, qBody, qDatetime, keshi1, keshi2, uName, uSex, uAge)``.

    ``getPureText`` is a project helper; this code relies on it returning
    ``None`` for text it considers empty (assumption based on how its result
    is checked here -- confirm against the helper).

    Raises whatever the element lookups raise when the expected page structure
    is missing (e.g. ``IndexError`` if fewer than three department links are
    present); nothing is caught here.
    """
    sectionBlock = elem.findall('./p[@class="pt10 pb10 lh180 znblue normal-a"]/a')
    # First-level department.
    keshi1 = getPureText(sectionBlock[2].text)
    # Second-level department (only present when there is a fourth link).
    keshi2 = None
    if len(sectionBlock) >= 4:
        keshi2 = getPureText(sectionBlock[3].text)
    qInfoBlock = elem.find('./div/div[@class="User_askcon clearfix pr"]')
    # Question title.
    qTitle = getPureText(qInfoBlock.find('.//p[@class="fl dib fb"]').text)
    userInfoBlock = qInfoBlock.findall('./div[@class="f12 graydeep Userinfo clearfix pl29"]/span')
    # Asker's name, sex, age and the time the question was posted.
    uName = None
    uSex = None
    uAge = None
    qDatetime = None
    if len(userInfoBlock) >= 7:
        uName = getPureText(userInfoBlock[0].text)
        uSex = getPureText(userInfoBlock[2].text)
        uAge = getPureText(userInfoBlock[4].text)
        qDatetime = getPureText(userInfoBlock[6].text)
        if qDatetime is not None:
            qDatetime = datetime.strptime(qDatetime, '%Y-%m-%d %H:%M:%S')
        else:
            # Placeholder timestamp for questions without a posting time.
            # Built directly: the former strptime('2000-01-01 00:00', ...)
            # call could never succeed because the string lacks seconds.
            qDatetime = datetime(2000, 1, 1)
    # Question body: concatenate every non-empty text fragment.
    qBodyBlock = qInfoBlock.find('./div/div[@id="qdetailc"]')
    fragments = (getPureText(tmpText) for tmpText in qBodyBlock.itertext())
    # Skip None fragments instead of resetting the accumulated body, which is
    # what the old conditional expression did due to operator precedence.
    qBody = ''.join(subText for subText in fragments if subText is not None)
    MySQL().saveQInfo((url, qTitle, qBody, qDatetime, keshi1, keshi2, uName, uSex, uAge))
Пример #3
0
def getInfo2(url):
    """Scrape the overall evaluation statistics of a doctor and save them.

    ``url`` is expected to look like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The overall score plus eight counters (the number found between
    parentheses in each statistics ``span``, in page order) are passed to
    ``MySQL().saveDoctorEvaluation`` as
    ``(url, score, stat0, ..., stat7)``.

    Any ``Exception`` is reported through ``doExpt``;
    ``KeyboardInterrupt``/``SystemExit`` are not swallowed.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        evaluateScore = html.findtext(
            './/h4[@class="f30 colClass01 fWei tc"]').strip()
        # Counters default to 0 so that missing spans still yield a full row.
        evaluateStat = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}
        evaluateStatBlock = html.findall(
            './/div[@class="HomSptop_Ri fWei f14 mt20 fl"]/span')
        for index, item in enumerate(evaluateStatBlock):
            tmptext = item.text
            if not tmptext:
                # A span with no text (None or '') counts as zero; previously
                # a None text raised TypeError and the whole record was lost.
                evaluateStat[index] = 0
            else:
                evaluateStat[index] = int(
                    tmptext[tmptext.find('(') + 1:tmptext.find(')')])
        dbInfo = (url, evaluateScore, evaluateStat[0], evaluateStat[1],
                  evaluateStat[2], evaluateStat[3], evaluateStat[4],
                  evaluateStat[5], evaluateStat[6], evaluateStat[7])
        MySQL().saveDoctorEvaluation(dbInfo)
    except Exception:
        doExpt('url2', url, 'url2')
Пример #4
0
def getReply2Info(reply1Id, elems):
    """Save every second-level reply belonging to one first-level reply.

    ``elems`` is an iterable of parsed elements, one per second-level reply.
    Each reply is stored through ``MySQL().saveReply2Info`` as
    ``(reply2Id, reply2Body, whoReply, reply2Datetime)`` where ``reply2Id`` is
    ``reply1Id + '_' + <1-based position>``.

    Missing sub-elements are tolerated: the corresponding value is what
    ``getPureText(None)`` yields (the original code already called it that way
    for ``whoReply``), and a missing or empty timestamp falls back to
    2000-01-01 00:00:00.
    """
    for reply2Index, reply2InfoBlock in enumerate(elems, 1):
        # Which side wrote this reply: 0 = follow-up question by the asker,
        # 1 = answer by the doctor; stays None when the label is absent.
        whoReply = reply2InfoBlock.find('.//span[@class="fl dib fb Doc_bla"]')
        whoReply = getPureText(whoReply.text if whoReply is not None else None)
        if whoReply is not None:
            whoReply = 0 if '追问' in whoReply else 1
        # Text of the second-level reply.
        reply2Body = reply2InfoBlock.find('.//*[@class="fl w390"]')
        reply2Body = getPureText(
            reply2Body.text if reply2Body is not None else None)
        # Time of the second-level reply.
        reply2Datetime = reply2InfoBlock.find('./p[@class="tr col99 f12"]/span')
        reply2Datetime = getPureText(
            reply2Datetime.text if reply2Datetime is not None else None)
        if reply2Datetime is not None:
            reply2Datetime = datetime.strptime(reply2Datetime, '%Y-%m-%d %H:%M:%S')
        else:
            # Placeholder built directly; the former
            # strptime('2000-01-01 00:00', '%Y-%m-%d %H:%M:%S') always raised
            # ValueError because the string has no seconds field.
            reply2Datetime = datetime(2000, 1, 1)
        reply2Id = reply1Id + '_' + str(reply2Index)
        MySQL().saveReply2Info((reply2Id, reply2Body, whoReply, reply2Datetime))
Пример #5
0
def getReplyInfo(url, elem):
    """Save the first-level (doctor) replies of a question and their follow-ups.

    ``accepted`` is 1 when the block carries the '最佳答案' marker, otherwise 0,
    and it selects which CSS classes the reply blocks are looked up with.
    Each reply is stored through ``MySQL().saveReply1Info`` as
    ``(reply1Id, doctorUrl, reply1Body, reply1Datetime, puIndex, accepted)``
    with ``reply1Id = url + '#' + <1-based position>``; its second-level
    replies are then handed to ``getReply2Info``.

    A missing or empty reply time falls back to 2000-01-01 00:00:00.
    """
    # Whether the doctor's reply was accepted: 0 = not accepted, 1 = accepted.
    accepted = elem.find('./div[@class="t9999 questnew_icon Quest_askh2 pa"]')
    accepted = getPureText(accepted.text) if accepted is not None else None
    accepted = 1 if accepted == '最佳答案' else 0
    if accepted:
        replyPath = './div[@class="docall clearfix Bestbg"]'
        puIndexPath = './/div[@class="clearfix pb10  pl20 pr20 ballc"]//b[@class="gratenum"]'
    else:
        replyPath = './div[@class="docall clearfix "]'
        puIndexPath = './/div[@class="clearfix pb10  pl20 pr20 ballc pr"]//b[@class="gratenum"]'
    for reply1Index, block in enumerate(elem.findall(replyPath), 1):
        # Personal page URL of the replying doctor.
        doctorUrl = block.find('.//a[@class="f14 fb Doc_bla"]')
        doctorUrl = doctorUrl.get('href') if doctorUrl is not None else None
        # Full text of the doctor's reply.
        reply1Body = block.find('.//div[@class="pt15 f14 graydeep  pl20 pr20"]')
        reply1Body = getAllText(reply1Body) if reply1Body is not None else None
        # Time of the reply; guarded like the other lookups in this loop.
        reply1Datetime = block.find('.//p[@class="col99 tr clearfix pr20"]/span')
        reply1Datetime = (getPureText(reply1Datetime.text)
                          if reply1Datetime is not None else None)
        if reply1Datetime is not None:
            reply1Datetime = datetime.strptime(reply1Datetime, '%Y-%m-%d %H:%M:%S')
        else:
            # Placeholder built directly; the former
            # strptime('2000-01-01 00:00', '%Y-%m-%d %H:%M:%S') always raised
            # ValueError because the string has no seconds field.
            reply1Datetime = datetime(2000, 1, 1)
        # Text of the "gratenum" counter shown with the reply.
        puIndex = block.find(puIndexPath)
        puIndex = getPureText(puIndex.text) if puIndex is not None else None
        reply1Id = url + '#' + str(reply1Index)
        MySQL().saveReply1Info((reply1Id, doctorUrl, reply1Body, reply1Datetime, puIndex, accepted))

        # Collect the second-level replies attached to this reply.
        reply2InfoBlock = block.findall('.//div[@class="appdoc appxian ml20 mr20 mt15 pb10 clearfix f14"]'
                                        '/div[@class="usezw pt10 clearfix"]')
        getReply2Info(reply1Id, reply2InfoBlock)
Пример #6
0
        if rawText == '':
            rawText = None
    return rawText


def doExpt(password, tb, url, logIdentifier):
    """Record a URL whose processing failed and log the current traceback.

    The URL is stored under ``tb`` either through ``UrlClient.saveUrl`` (when
    a ``password`` is given) or through a local ``Redis()`` instance (when
    ``password`` is None, i.e. when running on the same host as redis).
    Afterwards ``traceback.format_exc()`` is handed to ``FileIO.handleExpt``
    together with ``url`` and ``logIdentifier``, so this is meant to be called
    while an exception is being handled.
    """
    if password is None:
        # No password: store the URL through the local Redis wrapper.
        Redis().saveUrl(tb, url)
    else:
        UrlClient.saveUrl(password, tb, url)
    traceText = traceback.format_exc()
    FileIO.handleExpt(traceText, url, logIdentifier)


if __name__ == '__main__':
    # Ask for the authentication password and the key name of the queue that
    # holds the data to process.
    authPassword = input('请输入用于认证的密码:').strip()
    queueKey = input('请输入数据所在队列键名:').strip()
    MySQL().createTables()
    print('数据库中相应数据表已准备完成...')
    # An empty password is handed to the workers as None.
    workerPassword = authPassword or None
    # Run five getQPageInfo workers as gevent greenlets and wait for all of
    # them (an earlier variant started five threading.Thread workers instead).
    jobs = [
        gevent.spawn(getQPageInfo, queueKey, workerPassword)
        for _ in range(5)
    ]
    gevent.joinall(jobs)
Пример #7
0
def _collectTextsByIndex(parent, path):
    """Concatenate the stripped text of each element matching ``path``.

    Returns ``{0: str, 1: str}`` for the first and second match under
    ``parent``.  A third match that contains text raises ``KeyError``; the
    caller's exception handler deals with it.
    """
    collected = {0: '', 1: ''}
    for index, item in enumerate(parent.findall(path)):
        for text in item.itertext():
            collected[index] += text.strip()
    return collected


def getInfo(url):
    """Scrape a doctor's profile page and save it.

    ``url`` is expected to look like
    ``http://club.xywy.com/familyDoctor/pay/43983196``.  The values passed to
    ``MySQL().saveDoctorInfo`` are, in order: url, name, title, hospital
    text, medals (comma-terminated list), message, two service descriptions,
    two original-price descriptions, two "helped" counters, and the
    specialty / introduction / honour texts.

    Any ``Exception`` (including those caused by missing page elements) is
    reported through ``doExpt``; ``KeyboardInterrupt``/``SystemExit`` are not
    swallowed.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        # Doctor's name: the element text minus its last six characters.
        doctorName = html.findtext('.//i[@class="fwei fl"]')
        doctorName = doctorName[:-6] if doctorName is not None and len(
            doctorName) > 6 else None
        # Doctor's title (text before the <br>) and hospital/department
        # (tail text after the <br>).
        doctorRankAndHosp = html.find('.//div[@class=" lh200 pt10 f14"]')
        doctorRank = doctorRankAndHosp.text
        doctorHosp = doctorRankAndHosp.find('./br')
        # Doctor's medals, each followed by a comma.
        medals = ''.join(
            medal.get('data-th') + ','
            for medal in html.findall('.//div[@class="HomePth"]/span'))
        # Doctor's message to patients.
        sendWord = html.find(
            './/div[@class="f12 graydeep club_home_icon HomePj"]/span'
        ).tail
        # Service types: two page layouts exist; the second one is only
        # consulted when the first is absent and additionally carries the
        # original price of each service.
        serviceTypes = {0: '', 1: ''}
        oldServiceTypes = {0: '', 1: ''}
        serviceBlock = html.find('.//div[@class="fl pr"]')
        if serviceBlock is not None:
            serviceTypes = _collectTextsByIndex(serviceBlock, './/a[@cate]')
        else:
            serviceBlock = html.find('.//div[@class="fl f14"]')
            if serviceBlock is not None:
                serviceTypes = _collectTextsByIndex(
                    serviceBlock, './/a[@cate]')
                oldServiceTypes = _collectTextsByIndex(
                    serviceBlock, './/span[@class="f14 col99 ml10"]')
        # The user rating is scraped from the evaluation page instead.
        # Contracted families and helped users.
        helpedInfo = {0: None, 1: None}
        helpedInfoBlock = html.findall('.//span[@class="fb f16 ml5"]')
        for index, item in enumerate(helpedInfoBlock):
            helpedInfo[index] = item.text
        # Specialty, introduction and honours, recognised by their heading.
        infos = {0: '', 1: '', 2: ''}
        for item in html.findall('.//div[@class="HomeJie f14 fwei pt20"]'):
            heading = item.findtext('./h4')
            sectionText = ''.join(
                text.strip() for text in item.find('./div').itertext())
            if '擅长' in heading:
                infos[0] = sectionText
            elif '简介' in heading:
                infos[1] = sectionText
            elif '荣誉' in heading:
                infos[2] = sectionText
        dbInfo = (url, doctorName, doctorRank, doctorHosp.tail, medals,
                  sendWord, serviceTypes[0], serviceTypes[1],
                  oldServiceTypes[0], oldServiceTypes[1], helpedInfo[0],
                  helpedInfo[1], infos[0], infos[1], infos[2])
        MySQL().saveDoctorInfo(dbInfo)
    except Exception:
        doExpt('url1', url, 'url1')
Пример #8
0
def _saveServicePage(url, html):
    """Store every service-purchase record found on one parsed page.

    ``url`` is the address the page was fetched from; each stored row is keyed
    by ``url + '#' + <position of the record on the page>``.  ``serviceType``
    is 1 when the type text contains '包月' and 0 otherwise.  Errors (for
    example a missing field) are not handled here and propagate to the caller.
    """
    serviceBuyBlock = html.findall('.//div[@class="HomBone fwei f14"]')
    for index, block in enumerate(serviceBuyBlock):
        uName = block.findtext('.//span[@class="w100"]').strip()
        serviceType = 1 if '包月' in block.findtext(
            './/span[@class="w200 tl"]').strip() else 0
        serviceCount = block.findtext('.//span[@class="w60 tc"]').strip()
        servicePrice = block.findtext(
            './/span[@class="colClass01 fb w80 tc"]').strip()
        serviceStatus = block.findtext(
            './/span[@class="club_home_icon HomBsuc"]').strip()
        serviceTime = block.findtext(
            './/span[@class="col99 ml20 tc"]').strip()
        dbInfo = (url + '#' + str(index), uName, serviceType,
                  serviceCount, servicePrice, serviceStatus,
                  serviceTime)
        MySQL().saveServiceInfo(dbInfo)


def getInfo4(url):
    """Scrape the service-purchase records of a doctor, starting at ``url``.

    ``url`` is expected to look like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=2&page=2#name3``.
    The records on that page are saved, then every following page up to the
    total page count shown on the first fetched page is fetched and saved as
    well.

    Any ``Exception`` raised along the way is reported through ``doExpt``
    together with the URL that was being processed at that moment.
    ``KeyboardInterrupt``/``SystemExit`` are deliberately not swallowed, so the
    crawler can still be stopped.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveServicePage(url, html)
        # Total number of pages; a page without a pager counts as 1.
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = (1 if totalPageInfo is None
                      else int(totalPageInfo.text.strip()[1:-3]))
        # Index of the current page: the text between 'page=' and the trailing
        # six-character '#name3' anchor.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:-6])
        # Fetch the pages that come after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose: row ids and the error report
            # below must refer to the page actually being processed.
            url = url[:tmpIndex] + str(pageIndex) + '#name3'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveServicePage(url, html)
    except Exception:
        doExpt('url4', url, 'url4')