示例#1
0
    def process_url_page(self, keshi_name):
        """Fetch the first listing page for ``keshi_name`` and return a page token.

        The page ``http://jbk.39.net/bw/<keshi_name>_p0#ps`` is downloaded
        with a User-Agent from ``ChoiceUAIP().choice_ua()`` through the opener
        returned by ``ChoiceUAIP().choice_proxy()``, decoded as gb2312
        (undecodable bytes ignored) and parsed with lxml.

        Args:
            keshi_name: Value substituted into the listing URL.

        Returns:
            The text after the last ``'p'`` in the second pagination href,
            once ``'#ps'`` has been removed (a string; presumably the last
            page number - TODO confirm against the site markup).

        Raises:
            IndexError: If fewer than two pagination hrefs are found.
        """
        listing_url = 'http://jbk.39.net/bw/{0}_p0#ps'.format(keshi_name)
        ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
        page_request = Request(listing_url, headers=ua_headers)
        raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()

        # Parse the decoded source into a tree that XPath can query.
        tree = etree.HTML(raw_bytes.decode('gb2312', 'ignore'))
        print(tree)

        pager_hrefs = tree.xpath(
            '//*[@id="res_tab_1"]/div[@class="site-pages"]/a[@class="sp-a"]/@href'
        )
        pager_href = pager_hrefs[1]
        print('pages:{}'.format(pager_href))

        # Strip the anchor, then keep whatever follows the final 'p'.
        return pager_href.replace('#ps', '').split('p')[-1]
 def process_disease_cause(self, disease):
     """Return the text between the cause and symptom headings of ``self.url``.

     The page is fetched with a User-Agent from ``ChoiceUAIP().choice_ua()``
     through the opener from ``ChoiceUAIP().choice_proxy()``, decoded as
     UTF-8 (undecodable bytes ignored), and all text nodes under the element
     with ``id="content"`` are joined with single spaces.

     Args:
         disease: Name interpolated into both heading markers.

     Returns:
         The part of the joined text that follows the first
         "病因详情 /<disease>  编辑" marker and precedes the next
         "症状 /<disease>  编辑" marker.

     Raises:
         IndexError: If the first marker does not occur in the page text.
     """
     ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
     page_request = Request(self.url, headers=ua_headers)
     raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()
     print(chardet.detect(raw_bytes))

     # Parse the decoded source into a tree that XPath can query.
     tree = etree.HTML(raw_bytes.decode('utf-8', 'ignore'))
     page_text = ' '.join(tree.xpath('//*[@id="content"]//text()'))

     # Everything after the cause heading ...
     start_marker = "病因详情 /{}  编辑".format(disease)
     after_start = page_text.split(start_marker)[1]

     # ... up to (not including) the symptom heading.
     end_marker = "症状 /{}  编辑".format(disease)
     return after_start.split(end_marker)[0]
 def process_disease_check_detail(self):
     """Fetch ``self.url`` and extract the check/examination fields into a dict.

     The page is requested with a User-Agent from
     ``ChoiceUAIP().choice_ua()`` through the opener returned by
     ``ChoiceUAIP().choice_proxy()`` and parsed with lxml.

     Returns:
         dict with the keys ``check_url``, ``common_check``, ``checks``,
         ``check_updatetime``, ``browse_count`` and ``collect_count``
         (in that order). ``check_url`` is ``self.url``; the other values
         are strings taken from the page.

     Raises:
         IndexError: If the update-time, browse-count or collect-count
             nodes are missing from the page.
     """
     header = {'User-Agent': ChoiceUAIP().choice_ua()}
     request = Request(self.url, headers=header)
     opener = ChoiceUAIP().choice_proxy()
     response = opener.open(request).read()

     # Decode leniently: a strict gb2312 decode raises UnicodeDecodeError on
     # any byte sequence outside GB2312, which would abort the whole scrape.
     # The sibling scrapers in this file already decode with 'ignore'.
     allcontent = response.decode('gb2312', 'ignore')

     # Parse the decoded source into a tree that XPath can query.
     selector = etree.HTML(allcontent)
     check_url = self.url

     # Common prefix of the three XPath queries scoped to the detail box.
     detail_root = '//div[@class="content clearfix"]//div[@class="chi-know chi-int"]'

     # Join with a sentinel, collapse whitespace runs to single spaces, then
     # drop the sentinel again.
     common_check = selector.xpath(detail_root + '/div[@class="checkbox"]/div//text()')
     common_check = '++'.join(common_check)
     common_check = ' '.join(common_check.split()).replace('++', '')
     print(common_check)

     checks = selector.xpath(detail_root + '/div[@class="art-box"]/p//text()')
     checks = ' '.join(checks)
     print(str(checks))

     check_updatetime = selector.xpath(
         detail_root + '/dl[@class="intro"]/dd[@class="i3"]/span/text()'
     )[0].replace('更新', '')
     print(check_updatetime)

     browse_count = selector.xpath('//dd[@class="i3"]/span[2]/span/text()')[0]
     print(browse_count)

     collect_count = selector.xpath('//dd[@class="i3"]/span[3]/span/text()')[0]
     print(collect_count)

     check_dict = {
         'check_url': check_url,
         'common_check': common_check,
         'checks': checks,
         'check_updatetime': check_updatetime,
         'browse_count': browse_count,
         'collect_count': collect_count,
     }
     print(check_dict)
     return check_dict
示例#4
0
def process_url_page(disease):
    """Fetch the first search page for ``disease`` and derive a page bound.

    The URL ``http://ypk.39.net/search/<quoted disease>-p1`` is downloaded
    with a User-Agent from ``ChoiceUAIP().choice_ua()`` through the opener
    returned by ``ChoiceUAIP().choice_proxy()``, decoded as gb2312
    (undecodable bytes ignored) and parsed with lxml.

    Args:
        disease: Search term; URL-quoted before being placed in the path.

    Returns:
        ``int(int(n) / 15 + 2)`` where ``n`` is the text of the first
        ``<i>`` inside the ``search_tips`` box (presumably the total hit
        count with 15 results per page - TODO confirm).

    Raises:
        IndexError: If the ``search_tips`` node is not found.
        ValueError: If its text is not an integer literal.
    """
    search_url = "http://ypk.39.net/search/{}-p1".format(parse.quote(disease))
    ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
    page_request = Request(search_url, headers=ua_headers)
    raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()

    # Parse the decoded source into a tree that XPath can query.
    tree = etree.HTML(raw_bytes.decode('gb2312', 'ignore'))
    tip_texts = tree.xpath(
        '//div[@class="page"]/div[@class="search_right"]/div[@class="search_tips"]/i/text()'
    )
    tip_text = tip_texts[0]
    print('pages:{}'.format(tip_text))

    return int(int(tip_text) / 15 + 2)
示例#5
0
    def process_qa_corpus_detail(self):
        """Fetch ``self.url`` and return the Q&A item text as one string.

        The page is requested with a User-Agent from
        ``ChoiceUAIP().choice_ua()`` through the opener returned by
        ``ChoiceUAIP().choice_proxy()``, decoded as gb2312 (undecodable
        bytes ignored) and parsed with lxml.

        Returns:
            All text nodes under the ``chi-exp-item `` blocks, joined with a
            ``'++'`` sentinel, whitespace-normalised to single spaces, with
            every ``'++'`` then removed.
        """
        ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
        page_request = Request(self.url, headers=ua_headers)
        raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()

        # Parse the decoded source into a tree that XPath can query.
        tree = etree.HTML(raw_bytes.decode('gb2312', 'ignore'))
        text_nodes = tree.xpath(
            '//div[@class="content clearfix"]//div[@class="chi-exp-item "]//text()'
        )

        # Sentinel-join, collapse whitespace runs, then strip the sentinel.
        sentinel_joined = '++'.join(text_nodes)
        collapsed = ' '.join(sentinel_joined.split())
        return collapsed.replace('++', '')
示例#6
0
    def process_symptom(self):
        """Fetch ``self.symptoms_url`` and extract the symptom fields into a dict.

        The page is requested with a User-Agent from
        ``ChoiceUAIP().choice_ua()`` through the opener returned by
        ``ChoiceUAIP().choice_proxy()`` and parsed with lxml.

        Returns:
            dict with the keys ``symptoms_url``, ``common_symptoms``,
            ``links_symptoms``, ``symptoms``, ``symptoms_updatetime``,
            ``browse_count`` and ``collect_count`` (in that order).
            ``links_symptoms`` is ``''`` when the page text contains no
            "相关症状:" marker.

        Raises:
            IndexError: If the update-time, browse-count or collect-count
                nodes are missing from the page.
        """
        header = {'User-Agent': ChoiceUAIP().choice_ua()}
        request = Request(self.symptoms_url, headers=header)
        opener = ChoiceUAIP().choice_proxy()
        response = opener.open(request).read()

        # Decode leniently: a strict gb2312 decode raises UnicodeDecodeError
        # on any byte sequence outside GB2312, which would abort the scrape.
        # The sibling scrapers in this file already decode with 'ignore'.
        allcontent = response.decode('gb2312', 'ignore')

        # Parse the decoded source into a tree that XPath can query.
        selector = etree.HTML(allcontent)
        symptoms_url = self.symptoms_url

        # Sentinel-join, collapse whitespace runs, strip the sentinel, then
        # split the text at the "related symptoms" marker.
        link_texts = selector.xpath(
            '//div[@class="content clearfix"]//dl[@class="links"]/dd//text()')
        link_text = ' '.join('++'.join(link_texts).split()).replace('++', '')
        common_symptoms_str = link_text.split('相关症状:')
        print(common_symptoms_str)
        common_symptoms = common_symptoms_str[0]
        # The marker may be absent; fall back to an empty string instead of
        # raising IndexError and losing the rest of the record.
        if len(common_symptoms_str) > 1:
            links_symptoms = common_symptoms_str[1]
        else:
            links_symptoms = ''
        print(links_symptoms)

        symptoms = selector.xpath(
            '//div[@class="content clearfix"]//div[@class="art-box"]/p//text()'
        )
        symptoms = ' '.join(symptoms)
        print(str(symptoms))

        symptoms_updatetime = selector.xpath(
            '//div[@class="content clearfix"]//dl[@class="intro"]/dd[@class="i3"]/span/text()'
        )[0].replace('更新', '')
        print(symptoms_updatetime)

        browse_count = selector.xpath(
            '//dd[@class="i3"]/span[2]/span/text()')[0]
        print(browse_count)

        collect_count = selector.xpath(
            '//dd[@class="i3"]/span[3]/span/text()')[0]
        print(collect_count)

        check_dict = {
            "symptoms_url": symptoms_url,
            "common_symptoms": common_symptoms,
            "links_symptoms": links_symptoms,
            "symptoms": symptoms,
            "symptoms_updatetime": symptoms_updatetime,
            "browse_count": browse_count,
            "collect_count": collect_count,
        }
        print(check_dict)
        return check_dict
示例#7
0
    def process_drugs_overview_detail(self, drugurl):
        """Fetch a drug's overview page and extract form, spec and diseases.

        Detailed drug overview information. The overview URL is ``drugurl``
        with every occurrence of ``'manual'`` removed. The page is requested
        with a User-Agent from ``ChoiceUAIP().choice_ua()`` through the
        opener returned by ``ChoiceUAIP().choice_proxy()``, decoded as gb2312
        (undecodable bytes ignored) and parsed with lxml.

        Args:
            drugurl: URL of the drug page (may contain ``'manual'``).

        Returns:
            A ``(key_list, val_list)`` tuple. ``key_list`` is
            ``['drug_form', 'drug_spec', 'therapeutic_diseases']`` and
            ``val_list`` holds the matching values: the first text node of
            the first / second ``showlis`` list item (or the string
            ``"null"`` when that node is missing) and the list of text nodes
            found under the ``whatsthis clearfix`` list.
        """
        print(drugurl)
        viewurl = drugurl.replace('manual', '')
        print(viewurl)
        header = {'User-Agent': ChoiceUAIP().choice_ua()}
        request = Request(viewurl, headers=header)
        opener = ChoiceUAIP().choice_proxy()
        response = opener.open(request).read()
        allcontent = response.decode('gb2312', 'ignore')

        # Parse the decoded source into a tree that XPath can query.
        selector = etree.HTML(allcontent)

        # xpath() returns a list (possibly empty), never None. The previous
        # `is None` test was therefore always false, so both fields were
        # unconditionally "null". Test for emptiness, and per field, so one
        # missing item does not discard the other.
        form_texts = selector.xpath(
            '//div[@class="gaisu"]//ul[@class="showlis"]/li[1]/text()')
        if form_texts:
            drug_form = form_texts[0]
            print(drug_form)
        else:
            drug_form = "null"

        spec_texts = selector.xpath(
            '//div[@class="gaisu"]//ul[@class="showlis"]/li[2]/text()')
        if spec_texts:
            drug_spec = spec_texts[0]
            print(drug_spec)
        else:
            drug_spec = "null"

        therapeutic_diseases = selector.xpath(
            '//div[@class="gs_right"]/ul[@class="whatsthis clearfix"]/li//text()'
        )
        print('治疗常用疾病:{}'.format(therapeutic_diseases))

        key_list = ['drug_form', 'drug_spec', 'therapeutic_diseases']
        val_list = [drug_form, drug_spec, therapeutic_diseases]
        return key_list, val_list
示例#8
0
 def process_drug_url(self):
     """Fetch ``self.url`` and build the manual-page URL for each drug link.

     The page is requested with a User-Agent from
     ``ChoiceUAIP().choice_ua()`` through the opener returned by
     ``ChoiceUAIP().choice_proxy()``, decoded as gb2312 (undecodable bytes
     ignored) and parsed with lxml.

     Returns:
         A list with one ``'http://ypk.39.net<href>manual'`` string per href
         found under ``div.msgs/strong/a``, in document order (empty when
         nothing matches).
     """
     ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
     page_request = Request(self.url, headers=ua_headers)
     raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()

     # Parse the decoded source into a tree that XPath can query.
     tree = etree.HTML(raw_bytes.decode('gb2312', 'ignore'))
     link_paths = tree.xpath('//div[@class="msgs"]/strong/a/@href')

     return [
         'http://ypk.39.net{}manual'.format(link_path)
         for link_path in link_paths
     ]
示例#9
0
 def process_disease_wiki(self, disease):
     """Fetch ``self.url`` and return all text of its ``#content`` element.

     The page is requested with a User-Agent from
     ``ChoiceUAIP().choice_ua()`` through the opener returned by
     ``ChoiceUAIP().choice_proxy()`` and decoded as UTF-8 (undecodable bytes
     ignored). The result of ``chardet.detect`` on the raw bytes is printed.

     Args:
         disease: Not used by this method.

     Returns:
         Every text node under the element with ``id="content"``, joined
         with single spaces.
     """
     ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
     page_request = Request(self.url, headers=ua_headers)
     raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()
     print(chardet.detect(raw_bytes))

     # Parse the decoded source into a tree that XPath can query.
     tree = etree.HTML(raw_bytes.decode('utf-8', 'ignore'))
     text_nodes = tree.xpath('//*[@id="content"]//text()')
     return ' '.join(text_nodes)
示例#10
0
    def process_drugs_manual_detail(self, durgurl):
        """Fetch ``durgurl + 'manual'`` and return the manual text as one string.

        The page is requested with a User-Agent from
        ``ChoiceUAIP().choice_ua()`` through the opener returned by
        ``ChoiceUAIP().choice_proxy()``, decoded as gb2312 (undecodable
        bytes ignored) and parsed with lxml.

        Args:
            durgurl: Base URL of the drug page; ``'manual'`` is appended.

        Returns:
            All text nodes under ``div.tab_box/div``, joined with a ``'++'``
            sentinel, whitespace-normalised to single spaces, with every
            ``'++'`` then removed.
        """
        manual_url = durgurl + 'manual'
        ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
        page_request = Request(manual_url, headers=ua_headers)
        raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()

        # Parse the decoded source into a tree that XPath can query.
        tree = etree.HTML(raw_bytes.decode('gb2312', 'ignore'))
        text_nodes = tree.xpath('//div[@class="tab_box"]/div//text()')

        # Sentinel-join, collapse whitespace runs, then strip the sentinel.
        sentinel_joined = '++'.join(text_nodes)
        collapsed = ' '.join(sentinel_joined.split())
        return collapsed.replace('++', '')
 def get_pages(self):
     """Fetch the first listing page and return a page value from its pager.

     The URL ``self.url_start_page + '?page=0&'`` is requested with a
     User-Agent from ``ChoiceUAIP().choice_ua()`` through the opener returned
     by ``ChoiceUAIP().choice_proxy()`` and decoded as UTF-8 (undecodable
     bytes ignored). The result of ``chardet.detect`` on the raw bytes is
     printed.

     Returns:
         The text between the first and second ``'='`` of the 13th ``<a>``
         href inside ``#anpSelectData_Settings``, after all ``'&'``
         characters are removed (a string; presumably the last page number
         - TODO confirm against the site markup).

     Raises:
         IndexError: If that anchor is missing or its href has no ``'='``.
     """
     first_page_url = self.url_start_page + '?page=0&'
     ua_headers = {'User-Agent': ChoiceUAIP().choice_ua()}
     page_request = Request(first_page_url, headers=ua_headers)
     raw_bytes = ChoiceUAIP().choice_proxy().open(page_request).read()
     print(chardet.detect(raw_bytes))

     # Parse the decoded source into a tree that XPath can query.
     tree = etree.HTML(raw_bytes.decode('utf-8', 'ignore'))
     pager_hrefs = tree.xpath(
         '//*[@id="anpSelectData_Settings"]/a[13]/@href')
     pager_href = pager_hrefs[0]

     # Drop '&' separators, then take the value after the first '='.
     return pager_href.replace('&', '').split('=')[1]