예제 #1
0
 def parse_doctor_info(self, response):
     """Yield a DoctorInfoItem for each doctor row of the two tables in div.text.

     Rows after the ninth of the first table and rows after the first of the
     second table are scanned. In the first table a three-cell row supplies
     the department name, which following two-cell rows reuse.

     The '名医堂' test is made against the extracted cell strings. It used to
     be made against the SelectorList itself, which holds Selector objects,
     so a plain string never matched and that branch could not run.

     :param response: response of the page that contains the doctor tables.
     :return: generator of loaded DoctorInfoItem objects.
     """
     self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
     tr_res1 = response.xpath('//div[@class="text"]/table[1]/tbody/tr[position()>9]')
     tr_res2 = response.xpath('//div[@class="text"]/table[2]/tbody/tr[position()>1]')
     # Department of the most recent three-cell row in the first table.
     dept_name = ''
     try:
         # First table: schedule markers and full-width-space cells are
         # filtered out by the XPath predicate.
         for each_res in tr_res1:
             loader = CommonLoader2(item=DoctorInfoItem(), response=response)
             loader.add_value('hospital_name', self.hospital_name)
             cells = each_res.xpath('td/span[not(contains(text(),"全天")) '
                                    'and not(contains(text(),"上午")) '
                                    'and not(contains(text(),"下午")) '
                                    'and not(contains(text(),"\u3000"))]/text()').extract()
             if '名医堂' in cells and len(cells) >= 2:
                 # Rows of this kind carry no doctor_level.
                 loader.add_value('dept_name', cells[0])
                 loader.add_value('doctor_name', cells[1])
                 loader.add_value('update_time', now_day())
                 yield loader.load_item()
             elif len(cells) == 3:
                 dept_name = cells[0].replace(' ', '')
                 loader.add_value('dept_name', dept_name)
                 loader.add_value('doctor_name', cells[1])
                 loader.add_value('doctor_level', cells[2])
                 loader.add_value('update_time', now_day())
                 yield loader.load_item()
             elif len(cells) == 2:
                 # No department cell in this row: reuse the remembered one.
                 loader.add_value('dept_name', dept_name)
                 loader.add_value('doctor_name', cells[0])
                 loader.add_value('doctor_level', cells[1])
                 loader.add_value('update_time', now_day())
                 yield loader.load_item()
         # Second table: only the first three cells of each row are read.
         for each_res in tr_res2:
             loader = CommonLoader2(item=DoctorInfoItem(), response=response)
             loader.add_value('hospital_name', self.hospital_name)
             cells = each_res.xpath('td[position()<4]/span/text()').extract()
             if '名医堂' in cells and len(cells) >= 2:
                 loader.add_value('dept_name', cells[1])
                 loader.add_value('doctor_name', cells[0])
                 loader.add_value('update_time', now_day())
                 yield loader.load_item()
             elif len(cells) == 3:
                 loader.add_value('dept_name', cells[2])
                 loader.add_value('doctor_name', cells[0])
                 loader.add_value('doctor_level', cells[1])
                 loader.add_value('update_time', now_day())
                 yield loader.load_item()
     except Exception as e:
         self.logger.error('>>>>>>抓取过程中出错了,原因是:{}>>>>>>'.format(repr(e)))
예제 #2
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem built from request meta and the detail page.

     Name, department, hospital and level are read from ``response.meta``.
     The speciality text is the tag-stripped ``div.intro_more`` content
     (with '[关闭]' removed) when that is non-empty, otherwise the first
     ``dd`` text node containing '擅长领域'.

     :param response: response of a doctor detail page.
     :return: generator yielding a single loaded item; any exception is
         logged instead of propagated.
     """
     meta = response.meta
     hospital_name = meta.get('hospital_name')
     self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
     try:
         intro_more_html = ''.join(
             response.xpath('//div[@class="intro_more"]').extract())
         expanded_text = remove_tags(intro_more_html)
         fallback_text = response.xpath(
             '//dd[contains(text(),"擅长领域")]/text()').extract_first('')
         if expanded_text:
             doctor_good_at = expanded_text.replace('[关闭]', '')
         else:
             doctor_good_at = fallback_text

         loader = CommonLoader2(item=DoctorInfoItem(), response=response)
         # Fields passed along from the listing request.
         meta_fields = (
             ('doctor_name', meta.get('doctor_name')),
             ('dept_name', meta.get('dept_name')),
             ('hospital_name', hospital_name),
             ('doctor_level', meta.get('doctor_level')),
         )
         for field_name, field_value in meta_fields:
             loader.add_value(field_name, field_value,
                              MapCompose(custom_remove_tags))
         loader.add_xpath('doctor_intro', '//div[@class="hos-guide-box1"]',
                          MapCompose(remove_tags, custom_remove_tags))
         loader.add_value(
             'doctor_goodAt', doctor_good_at,
             MapCompose(custom_remove_tags, match_special, clean_info2))
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('crawled_url', response.url)
         loader.add_value('update_time', now_day())
         yield loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #3
0
 def parse_doctor_info_detail(self, response):
     """Yield the doctor's DoctorInfoItem, then a request for the schedule.

     ``doctor_name``, ``dept_name`` and ``doctor_level`` must be present in
     ``response.meta`` (a missing key raises KeyError). The schedule URL is
     built from the query string of the current URL with '&id' replaced by
     '&kid'; no schedule request is made when the URL has no query string.

     :param response: response of a doctor detail page.
     :return: generator yielding the loaded item and, optionally, a Request
         handled by ``parse_doctor_reg_info``.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     doctor_name = response.meta['doctor_name']
     dept_name = response.meta['dept_name']
     doctor_level = response.meta['doctor_level']
     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_value('doctor_name', doctor_name)
     loader.add_value('dept_name', dept_name,
                      MapCompose(custom_remove_tags, match_special))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_value('doctor_level', doctor_level,
                      MapCompose(custom_remove_tags, match_special))
     # Introduction and speciality are both read from the same paragraphs.
     loader.add_xpath('doctor_intro', '//div[@id="about-right-b"]/p',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_xpath('doctor_goodAt', '//div[@id="about-right-b"]/p',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_value('update_time', now_day())
     doctor_item = loader.load_item()
     yield doctor_item
     # Doctor schedule information
     params = re.search(r'.*\?(.*?)$', response.url)
     reg_url = 'http://www.scpz120.com/ajax/Doctor.aspx?'
     if params:
         reg_link = '{0}{1}'.format(reg_url,
                                    params.group(1).replace('&id', '&kid'))
         # Set the Referer BEFORE building the Request: Request copies the
         # headers it is given, so a later change to self.headers would not
         # reach this request.
         self.headers['Referer'] = response.url
         reg_request = Request(reg_link,
                               headers=self.headers,
                               callback=self.parse_doctor_reg_info,
                               meta={
                                   'doctor_name': doctor_name,
                                   'dept_name': dept_name
                               })
         yield reg_request
예제 #4
0
    def parse_doctor_info(self, response):
        """Read the doctor list from a JSON response and request per-doctor data.

        Every entry under ``[0]['data']`` pre-fills a loader with the fields
        available in the JSON. For entries that have a ``doctorId`` two
        FormRequests are yielded: one for the detail page (the loader travels
        in ``meta``) and one for the schedule.

        :param response: response whose body is the JSON doctor list.
        :return: generator of FormRequest objects.
        """
        payload = json.loads(response.text)
        json_fields = (
            ('dept_name', 'deptName'),
            ('hospital_name', 'hospitalName'),
            ('doctor_level', 'degree'),
            ('doctor_goodAt', 'extexperts'),
        )
        for entry in payload[0]['data']:
            name = entry.get('doctorName', '')
            loader = CommonLoader(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', name)
            for item_field, json_key in json_fields:
                loader.add_value(item_field, entry.get(json_key, ''))
            doctor_id = entry.get('doctorId', '')
            if not doctor_id:
                # Without an id neither follow-up request can be built.
                continue
            self.headers['Referer'] = 'http://www.scgh114.com/web/hospital/doctorinfoP'
            # Doctor detail information
            yield FormRequest(self.doctor_detail_link,
                              headers=self.headers,
                              callback=self.parse_doctor_detail,
                              formdata={'doctorId': str(doctor_id)},
                              meta={'loader': loader},
                              dont_filter=True)
            # Doctor schedule information
            yield FormRequest(self.doctor_reg_info_lik,
                              headers=self.headers,
                              callback=self.parse_doctor_reg_info,
                              formdata={'doctorId': str(doctor_id)},
                              meta={'doctor_name': name},
                              dont_filter=True)
예제 #5
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem from request meta and the ``info-main`` block.

     Name, department, hospital and level come from ``response.meta``; the
     introduction and speciality come from the third and fourth ``div`` of
     ``div.info-main``.

     :param response: response of a doctor detail page.
     :return: generator yielding a single loaded item.
     """
     meta = response.meta
     hospital_name = meta.get('hospital_name')
     self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
     dept_name = meta.get('dept_name')
     doctor_level = meta.get('doctor_level')
     doctor_name = meta.get('doctor_name')

     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_value('doctor_name', doctor_name,
                      MapCompose(custom_remove_tags))
     loader.add_value('dept_name', dept_name)
     loader.add_value('hospital_name', hospital_name,
                      MapCompose(custom_remove_tags))
     loader.add_value('doctor_level', doctor_level)
     # Both page sections go through the same processor chain.
     page_sections = (
         ('doctor_intro', '//div[@class="info-main"]/div[3]/span'),
         ('doctor_goodAt', '//div[@class="info-main"]/div[4]/span'),
     )
     for field_name, section_xpath in page_sections:
         loader.add_xpath(
             field_name, section_xpath,
             MapCompose(remove_tags, custom_remove_tags, match_special))
     loader.add_value('dataSource_from', self.data_source_from)
     loader.add_value('update_time', now_day())
     yield loader.load_item()
예제 #6
0
    def parse_doctor_info_detail(self, response):
        """Yield one DoctorInfoItem, including the registration fee when found.

        The fee is the text after '挂号费:' in the ``title`` attribute of the
        first ``span.doc_yuyue_time`` link; it is None when there is no such
        link or the pattern does not match. The other fields come from
        ``response.meta`` and the ``div.zrys`` description list.

        :param response: response of a doctor detail page.
        :return: generator yielding a single loaded item; any exception is
            logged instead of propagated.
        """
        self.logger.info('>>>>>>正在抓取医生详细信息>>>>>>')
        try:
            meta = response.meta
            doctor_name = meta.get('doctor_name')
            dept_name = meta.get('dept_name')
            doctor_level = meta.get('doctor_level')
            hospital_name = meta.get('hospital_name')

            link_titles = response.xpath(
                '//td/span[@class="doc_yuyue_time"]/a/@title').extract()
            fee_match = None
            if link_titles:
                fee_match = re.search(r'.*挂号费:(.*?)$', link_titles[0], S)
            diagnosis_amt = fee_match.group(1) if fee_match else None

            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            plain_fields = (
                ('doctor_name', doctor_name),
                ('dept_name', dept_name),
                ('hospital_name', hospital_name),
            )
            for field_name, field_value in plain_fields:
                loader.add_value(field_name, field_value,
                                 MapCompose(custom_remove_tags))
            loader.add_value('doctor_level', doctor_level,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath(
                'doctor_intro', '//div[@class="zrys"]/dl/dd',
                MapCompose(remove_tags, custom_remove_tags, clean_info2))
            loader.add_value('diagnosis_amt', diagnosis_amt)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            yield loader.load_item()
            # Schedule items are not produced by this callback.
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #7
0
 def parse_doctor_info_detail(self, response):
     """Yield the doctor's DoctorInfoItem and one DoctorRegInfoItem per slot.

     The schedule is read from the table inside ``div.viewexpert_detail``:
     every row after the first is a time period (first cell) followed by
     day cells. A day cell that contains an ``img`` marks a consulting slot
     and produces a schedule item whose ``reg_info`` is weekday + period.

     Day cells are paired with the weekday labels by position. Cells beyond
     the seventh are ignored; previously they raised IndexError and stopped
     the callback.

     :param response: response of a doctor detail page.
     :return: generator of DoctorInfoItem / DoctorRegInfoItem objects.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     doctor_name_xpath = '//div[@class="viewexpert_demo"]/p[1]/text()'
     dept_name_xpath = '//div[@class="viewexpert_demo"]/p[3]/text()'
     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_xpath('doctor_name', doctor_name_xpath,
                      MapCompose(custom_remove_tags))
     loader.add_xpath('dept_name', dept_name_xpath,
                      MapCompose(custom_remove_tags, match_special))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_xpath(
         'doctor_level', '//div[@class="viewexpert_demo"]/p[2]/text()',
         MapCompose(custom_remove_tags, match_special, match_special2))
     loader.add_xpath('doctor_intro', '//div[@class="viewexpert_detail"]',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_xpath('doctor_goodAt',
                      '//div[@class="viewexpert_demo"]/p[4]/text()',
                      MapCompose(custom_remove_tags))
     loader.add_value('update_time', now_day())
     doctor_item = loader.load_item()
     yield doctor_item
     # Doctor schedule information
     reg_tr_list = response.xpath(
         '//div[@class="viewexpert_detail"]/table/tr[position()>1]')
     is_has_reg = response.xpath(
         '//div[@class="viewexpert_detail"]/table/tr[position()>1]/td/img')
     reg_date = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
     if not is_has_reg:
         # No marker image anywhere in the table: nothing to report.
         return
     for each_tr in reg_tr_list:
         reg_time = each_tr.xpath('td[1]/text()').extract_first('')
         day_cells = each_tr.xpath('td[position()>1]')
         # zip stops at the shorter sequence, so extra cells cannot
         # index past the weekday list.
         for reg_info_date, each_reg_info in zip(reg_date, day_cells):
             if not each_reg_info.xpath('img'):
                 continue
             reg_info = '{0}{1}'.format(reg_info_date, reg_time)
             reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                        response=response)
             reg_loader.add_xpath('doctor_name', doctor_name_xpath,
                                  MapCompose(custom_remove_tags))
             reg_loader.add_xpath(
                 'dept_name', dept_name_xpath,
                 MapCompose(custom_remove_tags, match_special))
             reg_loader.add_value('hospital_name', self.hospital_name)
             reg_loader.add_value('reg_info', reg_info)
             reg_loader.add_value('update_time', now_day())
             yield reg_loader.load_item()
예제 #8
0
 def parse_doctor_website(self, response):
     """Yield one DoctorInfoItem scraped from a doctor's personal home page.

     :param response: response of the doctor's home page.
     :return: generator yielding a single loaded item.
     """
     self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')
     # Item field -> XPath of the text that fills it.
     field_xpaths = (
         ('doctor_name', '//span[@class="c-f22 c-333"]/text()'),
         ('dept_name', '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()'),
         ('hospital_name', '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()'),
         ('doctor_level', '//div[@class="doctor-info"]/dl/dd[1]/text()'),
         ('doctor_intro', '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()'),
         ('doctor_goodAt', '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()'),
     )
     loader = YiHuLoader(item=DoctorInfoItem(), response=response)
     for field_name, field_xpath in field_xpaths:
         loader.add_xpath(field_name, field_xpath)
     loader.add_value('update_time', now_day())
     yield loader.load_item()
예제 #9
0
    def parse_doctor_info(self, response):
        """Yield a DoctorInfoItem per doctor in a JSON page, then the next page.

        The page number is taken from a trailing ``&curr=<n>`` in the request
        URL and the page count from ``data.pages``; while another page
        remains, a Request for it is yielded with this method as callback.

        :param response: JSON response; ``hospital_name`` and ``hospital_id``
            are read from its meta.
        :return: generator of items and at most one follow-up Request; any
            exception is logged instead of propagated.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院医生详细信息>>>>>>'.format(hospital_name))
        try:
            hospital_id = response.meta.get('hospital_id')
            data = json.loads(response.text).get('data')
            page_count = data.get('pages')
            doctors = data.get('doctorPageList')
            page_match = re.search(r'&curr=(\d+)$', response.url)

            for doctor in doctors:
                portrait = doctor.get('portrait')
                if portrait:
                    photo_url = urljoin(self.doctor_image_host, portrait)
                else:
                    photo_url = ''
                loader = CommonLoader2(item=DoctorInfoItem(), response=response)
                loader.add_value('doctor_name', doctor.get('name'), MapCompose(custom_remove_tags))
                loader.add_value('dept_name', doctor.get('departmentName'))
                loader.add_value('hospital_name', doctor.get('hospitalName'))
                loader.add_value('doctor_level', doctor.get('doctorTitleName'))
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_value('doctor_id', doctor.get('id'))
                loader.add_value('hospital_id', hospital_id)
                loader.add_value('doctor_photo_url', photo_url)
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                yield loader.load_item()

            # Pagination
            if page_count and page_match:
                next_page = int(page_match.group(1)) + 1
                if next_page <= int(page_count):
                    yield Request(
                        self.doctor_url.format(str(hospital_id), str(next_page)),
                        headers=self.headers,
                        callback=self.parse_doctor_info,
                        meta={
                            'hospital_name': hospital_name,
                            'hospital_id': hospital_id
                        })
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #10
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem from the ``info_title`` / ``info_txt`` blocks.

     ``dept_name`` must be present in ``response.meta``. Introduction and
     speciality are both derived from ``div#info_txt`` and differ only in
     the last processor applied (``filter_info2`` vs ``filter_info``).

     :param response: response of a doctor detail page.
     :return: generator yielding a single loaded item.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(self.hospital_name))
     department = response.meta['dept_name']
     info_text_xpath = '//div[@id="info_txt"]'

     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_xpath('doctor_name', '//div[@id="info_title"]/text()',
                      MapCompose(custom_remove_tags, match_special))
     loader.add_value('dept_name', department, MapCompose(custom_remove_tags))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_xpath('doctor_intro', info_text_xpath,
                      MapCompose(remove_tags, custom_remove_tags, filter_info2))
     loader.add_xpath('doctor_goodAt', info_text_xpath,
                      MapCompose(remove_tags, custom_remove_tags, filter_info))
     loader.add_value('update_time', now_day())
     yield loader.load_item()
예제 #11
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem from a doctor detail page.

     ``doctor_name`` must be present in ``response.meta``; the remaining
     fields are read from the page.

     :param response: response of a doctor detail page.
     :return: generator yielding a single loaded item.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     name_from_listing = response.meta['doctor_name']

     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_value('doctor_name', name_from_listing)
     loader.add_xpath('dept_name',
                      '//div[@class="doctor"]/h1/text()',
                      MapCompose(custom_remove_tags))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_xpath('doctor_level',
                      '//p[@class="profession"]/span/text()',
                      MapCompose(custom_remove_tags))
     # The abstract is taken as markup, so tags are stripped first.
     loader.add_xpath('doctor_intro',
                      '//div[@class="abstract"]',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_xpath('doctor_goodAt',
                      '//div[@class="specialty"]/p/text()',
                      MapCompose(custom_remove_tags))
     loader.add_value('update_time', now_day())
     yield loader.load_item()
예제 #12
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem from a detail page with a breadcrumb table.

     The introduction is cut out of the cleaned text of
     ``td.bk.titletxt11`` with ``get_hospital_info``: the span from
     '个人简介:' to '出诊时间:' is preferred, with '个人简介:' to '荣誉集锦:'
     as fallback. Doctor, department and hospital names are the last three
     links of the first cell of ``table#m_jkzs``.

     :param response: response of a doctor detail page; ``diagnosis_fee`` is
         read from its meta.
     :return: generator yielding a single loaded item; any exception is
         logged instead of propagated.
     """
     self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
     try:
         diagnosis_fee = response.meta.get('diagnosis_fee')
         raw_html = ''.join(
             response.xpath('//td[@class="bk titletxt11"]').extract())
         page_text = custom_remove_tags(remove_tags(raw_html))
         intro_until_honours = get_hospital_info(page_text, '个人简介:', '荣誉集锦:')
         intro_until_schedule = get_hospital_info(page_text, '个人简介:', '出诊时间:')
         doctor_intro = intro_until_schedule or intro_until_honours

         breadcrumb = '//table[@id="m_jkzs"]/tr/td[1]'
         loader = CommonLoader2(item=DoctorInfoItem(), response=response)
         breadcrumb_fields = (
             ('doctor_name', breadcrumb + '/a[last()]/text()'),
             ('dept_name', breadcrumb + '/a[last()-1]/text()'),
             ('hospital_name', breadcrumb + '/a[last()-2]/text()'),
             ('doctor_level',
              '//span[@class="selecttxt"][contains(text(),"医师") or contains(text(),"专家")]/text()'),
         )
         for field_name, field_xpath in breadcrumb_fields:
             loader.add_xpath(field_name, field_xpath,
                              MapCompose(custom_remove_tags))
         loader.add_value('doctor_intro', doctor_intro,
                          MapCompose(custom_remove_tags))
         loader.add_xpath(
             'doctor_goodAt',
             '//span[@class="titletxt11"]/b[contains(text(),"擅长")]/ancestor::span[1]/text()',
             MapCompose(remove_tags, custom_remove_tags))
         loader.add_value(
             'diagnosis_amt', diagnosis_fee,
             MapCompose(remove_tags, custom_remove_tags, get_number))
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('update_time', now_day())
         yield loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #13
0
 def parse_doctor_info(self, response):
     """Request the detail page of every doctor listed for a department.

     A loader bound to the doctor's ``li`` element is pre-filled with name,
     hospital, department and level, and handed to ``parse_doctor_detail``
     through the request meta. Entries without a link yield nothing.

     :param response: response of a department's doctor list page.
     :return: generator of Request objects.
     """
     self.logger.info('正在抓取{}:医生信息'.format(self.hospital_name))
     entries = response.xpath('//div[@class="contents2"]/ul/li')
     self.logger.info('该科室的医生个数为:{}'.format(str(len(entries))))
     for entry in entries:
         loader = MedicalMapLoader(item=DoctorInfoItem(), selector=entry)
         loader.add_xpath('doctor_name', 'h4[@class="name"]/text()')
         loader.add_value('hospital_name', self.hospital_name)
         loader.add_xpath('dept_name', 'p[@class="office"]/text()')
         loader.add_xpath('doctor_level', 'p[@class="post"]/text()')
         href = entry.xpath('a[1]/@href').extract_first('')
         if not href:
             continue
         detail_url = urljoin(self.host, href)
         self.headers['Referer'] = response.url
         yield Request(detail_url,
                       headers=self.headers,
                       callback=self.parse_doctor_detail,
                       meta={'loader': loader})
예제 #14
0
 def parse_doctor_detail(self, response):
     """Yield the doctor's DoctorInfoItem and one DoctorRegInfoItem per slot.

     All fields are read from paragraphs of the same info container. The
     fourth paragraph holds the schedule text, which ``get_reg_info``
     turns into the individual entries.

     :param response: response of a doctor detail page.
     :return: generator of DoctorInfoItem / DoctorRegInfoItem objects.
     """
     info_box = '//div[@class="fleft wd740"]/div[1]/div[2]'
     name_xpath = info_box + '/p[2]/text()'
     dept_xpath = info_box + '/p[1]/text()'

     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_xpath('doctor_name', name_xpath,
                      MapCompose(custom_remove_tags, match_special))
     loader.add_xpath('dept_name', dept_xpath,
                      MapCompose(custom_remove_tags, match_special))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_xpath('doctor_level', info_box + '/p[3]/text()',
                      MapCompose(custom_remove_tags, match_special))
     loader.add_xpath('doctor_intro', info_box + '/div/p[1]',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_value('update_time', now_day())
     yield loader.load_item()

     schedule_text = response.xpath(info_box + '/p[4]/text()').extract_first('')
     if not schedule_text:
         return
     for schedule_entry in get_reg_info(schedule_text):
         reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                    response=response)
         reg_loader.add_xpath('doctor_name', name_xpath,
                              MapCompose(custom_remove_tags, match_special))
         reg_loader.add_xpath('dept_name', dept_xpath,
                              MapCompose(custom_remove_tags, match_special))
         reg_loader.add_value('hospital_name', self.hospital_name)
         reg_loader.add_value('reg_info', schedule_entry)
         reg_loader.add_value('update_time', now_day())
         yield reg_loader.load_item()
예제 #15
0
 def parse_doctor_info_detail(self, response):
     """Yield the doctor's DoctorInfoItem and, if found, one schedule item.

     ``dept_name`` and ``doctor_name`` must be present in ``response.meta``.
     The schedule text is the first text node matched by a union of XPaths
     that look for '坐诊时间', '上午' or '下午' inside the
     ``right-about clearfix`` container.

     :param response: response of a doctor detail page.
     :return: generator of DoctorInfoItem / DoctorRegInfoItem objects.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     meta = response.meta
     dept_name = meta['dept_name']
     doctor_name = meta['doctor_name']
     about_box = '//div[@class="right-about clearfix"]'

     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_value('doctor_name', doctor_name)
     loader.add_value('dept_name', dept_name)
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_xpath('doctor_intro', about_box,
                      MapCompose(remove_tags, get_doctor_intro2))
     loader.add_xpath('doctor_goodAt', about_box,
                      MapCompose(remove_tags, get_doctor_good_at))
     loader.add_value('update_time', now_day())
     yield loader.load_item()

     # Schedule information
     self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
         self.hospital_name))
     # The page marks the schedule line in several different ways; each
     # variant is one branch of the XPath union.
     schedule_variants = (
         '/p[contains(text(),"坐诊时间")]/text()',
         '/p/strong[contains(text(),"坐诊时间")]/text()',
         '/p/span/strong[contains(text(),"坐诊时间")]/text()',
         '/p/strong[contains(text(),"上午")]/text()',
         '/p/strong[contains(text(),"下午")]/text()',
         '/p/strong/span[contains(text(),"坐诊时间")]/text()',
     )
     schedule_xpath = '|'.join(about_box + variant
                               for variant in schedule_variants)
     reg_info = response.xpath(schedule_xpath).extract_first('')
     if not reg_info:
         return
     reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                response=response)
     reg_loader.add_value('doctor_name', doctor_name)
     reg_loader.add_value('dept_name', dept_name)
     reg_loader.add_value('hospital_name', self.hospital_name)
     reg_loader.add_value('reg_info', reg_info,
                          MapCompose(match_special, clean_info))
     reg_loader.add_value('update_time', now_day())
     yield reg_loader.load_item()
예제 #16
0
 def parse_doctor_info_detail(self, response):
     """Yield one DoctorInfoItem from request meta plus ``p#docSpeciality``.

     Name, department, hospital and level come from ``response.meta``; the
     introduction is the text of the ``docSpeciality`` paragraph.

     :param response: response of a doctor detail page.
     :return: generator yielding a single loaded item; any exception is
         logged instead of propagated.
     """
     self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
     try:
         meta = response.meta
         loader = CommonLoader2(item=DoctorInfoItem(), response=response)
         # These fields are copied from the meta keys of the same name.
         for field_name in ('doctor_name', 'dept_name',
                            'hospital_name', 'doctor_level'):
             loader.add_value(field_name, meta.get(field_name))
         loader.add_xpath('doctor_intro', '//p[@id="docSpeciality"]/text()',
                          MapCompose(custom_remove_tags, match_special))
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('crawled_url', response.url)
         loader.add_value('update_time', now_day())
         yield loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #17
0
 def parse_doctor_info(self, response):
     """Request the detail page of every doctor shown on a department page.

     The department name is read from the fourth breadcrumb link. For each
     ``div.pic`` entry a loader pre-filled with department and hospital is
     passed to ``parse_doctor_detail`` through the request meta. Entries
     without a link yield nothing.

     :param response: response of a department page.
     :return: generator of Request objects.
     """
     self.logger.info('正在抓取{}:医生信息'.format(self.hospital_name))
     doctor_cards = response.xpath('//div[@class="pic"]')
     dept_name = response.xpath(
         '//div[@id="FrontPublic_breadCrumb01-1468402139239"]/div/'
         'a[4]/text()').extract_first('')
     # An empty selection simply produces no iterations.
     for card in doctor_cards:
         href = card.xpath('a/@href').extract_first('')
         loader = PxfybjyLoader(item=DoctorInfoItem(),
                                response=response)
         loader.add_value('dept_name', dept_name)
         loader.add_value('hospital_name', self.hospital_name)
         if not href:
             continue
         # NOTE(review): the referring URL is stored under the meta key
         # 'Referer', not sent as a request header -- confirm this is what
         # the detail callback expects.
         yield Request(urljoin(self.host, href),
                       headers=self.headers,
                       callback=self.parse_doctor_detail,
                       meta={'loader': loader, 'Referer': response.url})
예제 #18
0
 def parse_doctor_reg_info(self, response):
     """Start collecting hospital and department names from a schedule page.

     NOTE(review): the visible body ends after the two ``add_xpath`` calls
     without loading or yielding an item -- this excerpt looks truncated;
     confirm against the full spider.

     :param response: response of the page being parsed.
     """
     self.logger.info('>>>>>>正在抓取医生排班信息……')
     # NOTE(review): the loader wraps a DoctorInfoItem although the method
     # name refers to schedule info -- verify the intended item class.
     loader = YiHuLoader(item=DoctorInfoItem(), response=response)
     loader.add_xpath('hospital_name', '//div[@class="link-555"]/a/text()')
     loader.add_xpath('dept_name', '//div[@class="hos-info"]/h1/text()')
예제 #19
0
    def parse_doctor_info(self, response):
        """Parse one page of a department's doctor list and follow pagination.

        ``dept_name`` and ``hospital_name`` are read from ``response.meta``.
        For each doctor card on the page:

        * when both an introduction and a profile link are present, a request
          handled by ``parse_doctor_info_detail`` is yielded;
        * otherwise a ``DoctorInfoItem`` is built from the list page alone.
          This also covers a card that has an introduction but no profile
          link, so such a doctor is still emitted instead of being skipped.

        When the page has a ``pb_next`` control and the hidden form fields
        and pager values are all present, the next list page is requested
        with this same callback. Any exception is logged and ends the
        generator.
        """
        self.logger.info('>>>>>>正在抓取:医生信息>>>>>>')
        try:
            dept_name = response.meta.get('dept_name')
            hospital_name = response.meta.get('hospital_name')
            all_doctors = response.xpath('//div[@class="docInfo docInfo-w-h"]')
            for each_doctor in all_doctors:
                doctor_link = each_doctor.xpath(
                    'p[1]/a/@href').extract_first('')
                doctor_name = each_doctor.xpath(
                    'p[1]/a/text()').extract_first('')
                doctor_level = each_doctor.xpath(
                    'p[2]/text()').extract_first('')
                doctor_intro = each_doctor.xpath(
                    'p[contains(text(),"简介")]/text()').extract_first('')
                doctor_intro = match_special(doctor_intro)

                if doctor_intro and doctor_link:
                    # The profile page carries the full details; hand the
                    # list-page values over through ``meta``.
                    doctor_link = urljoin(self.host, doctor_link)
                    self.headers['Referer'] = response.url
                    yield Request(doctor_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info_detail,
                                  dont_filter=True,
                                  meta={
                                      'doctor_name': doctor_name,
                                      'doctor_level': doctor_level,
                                      'dept_name': dept_name,
                                      'hospital_name': hospital_name
                                  })
                    continue

                # No profile page to follow (or nothing more to fetch):
                # emit what the list page provides.
                loader = CommonLoader2(item=DoctorInfoItem(),
                                       response=response)
                loader.add_value('doctor_name', doctor_name)
                loader.add_value('dept_name', dept_name)
                loader.add_value('hospital_name', hospital_name)
                loader.add_value('doctor_level', doctor_level)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                yield loader.load_item()

            # Pagination
            if not response.xpath('//a[@class="pb_next"]'):
                return
            hos_id = response.xpath(
                '//input[@name="HIS_CD"]/@value').extract_first('')
            dept_id = response.xpath(
                '//input[@name="DEP_ID"]/@value').extract_first('')
            if not (hos_id and dept_id):
                return
            now_page = response.xpath(
                '//a[@class="pb_on"]/text()').extract_first('')
            total_page = response.xpath(
                '//a[contains(text(),"尾页")]/@pagval').extract_first('')
            total_doctor_num = response.xpath(
                '//input[@name="TOT_REC_NUM"]/@value').extract_first('')
            if not (now_page and total_page and total_doctor_num):
                return
            next_page_num = int(now_page) + 1
            if next_page_num > int(total_page):
                return
            next_page_link = self.next_doctor_url.format(
                str(next_page_num), total_page,
                total_doctor_num, hos_id, dept_id, timestamp())
            self.headers['Referer'] = response.url
            yield Request(next_page_link,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          dont_filter=True,
                          meta={
                              'dept_name': dept_name,
                              'hospital_name': hospital_name
                          })
        except Exception as e:
            self.logger.error('在抓取医生信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #20
0
    def parse_doctor_info_detail(self, response):
        """Emit the doctor's profile item, then one item per scheduled slot.

        ``hospital_name`` and ``dept_name`` come from ``response.meta``; the
        hospital name stored on the items is re-read from the page
        breadcrumb. When the page has a ``doctor-work`` block, every body
        cell of the ``workTable`` that contains a ``div`` produces a
        ``DoctorRegInfoItem`` whose ``reg_info`` is the first three
        characters of the matching header cell followed by the row's first
        cell text. Any exception is logged and ends the generator.
        """
        meta = response.meta
        hospital_name = meta.get('hospital_name')
        dept_name = meta.get('dept_name')
        self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
        name_xpath = '//td/b[contains(text(),"姓名")]/ancestor::td[1]'
        hospital_xpath = '//div[@class="page_position"]/a[last()-1]/text()'
        try:
            # Doctor profile
            info = CommonLoader2(item=DoctorInfoItem(), response=response)
            info.add_xpath(
                'doctor_name', name_xpath,
                MapCompose(remove_tags, custom_remove_tags, match_special))
            info.add_value('dept_name', dept_name,
                           MapCompose(custom_remove_tags))
            info.add_xpath('hospital_name', hospital_xpath,
                           MapCompose(custom_remove_tags))
            info.add_xpath(
                'sex', '//td/b[contains(text(),"性别")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special,
                           clean_info2))
            info.add_xpath(
                'doctor_level',
                '//td/b[contains(text(),"职称")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special,
                           clean_info2))
            info.add_xpath(
                'doctor_intro',
                '//td/b[contains(text(),"医生简介")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, clean_info2))
            info.add_value('dataSource_from', self.data_source_from)
            info.add_value('crawled_url', response.url)
            info.add_value('update_time', now_day())
            yield info.load_item()

            # Doctor schedule
            self.logger.info(
                '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
            if not response.xpath('//td/div[@class="doctor-work"]'):
                return
            rows = response.xpath('//table[@class="workTable"]/tbody/tr')
            header_cells = response.xpath(
                '//table[@class="workTable"]/thead/tr/td[position()>1]'
            ).extract()
            # Join, clean once, then split again so each label lines up
            # with its column index.
            date_labels = custom_remove_tags(
                remove_tags(','.join(header_cells))).split(',')
            for row in rows:
                period = row.xpath('td[1]/text()').extract_first('')
                for col, cell in enumerate(row.xpath('td[position()>1]')):
                    if not cell.xpath('div'):
                        # No ``div`` in the cell: nothing scheduled here.
                        continue
                    reg_info = '{0}{1}'.format(date_labels[col][:3], period)
                    reg = CommonLoader2(item=DoctorRegInfoItem(),
                                        response=response)
                    reg.add_xpath(
                        'doctor_name', name_xpath,
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special))
                    reg.add_value('dept_name', dept_name,
                                  MapCompose(custom_remove_tags))
                    reg.add_xpath('hospital_name', hospital_xpath,
                                  MapCompose(custom_remove_tags))
                    reg.add_value('reg_info', reg_info)
                    reg.add_value('dataSource_from', self.data_source_from)
                    reg.add_value('crawled_url', response.url)
                    reg.add_value('update_time', now_day())
                    yield reg.load_item()
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #21
0
    def parse_doctor_info_detail(self, response):
        """Emit the doctor's profile item, then one item per bookable slot.

        ``hospital_name``, ``dept_name`` and ``doctor_name`` come from
        ``response.meta``. The ids are derived by running ``match_special2``
        over the page URL and the last two breadcrumb links. When a booking
        link is present, each time-of-day row is scanned across the date
        columns, and every cell containing ``span/a`` yields a
        ``DoctorRegInfoItem`` whose ``reg_info`` is
        ``<now_year()>-<date label><time label>``. Any exception is logged
        and ends the generator.
        """
        meta = response.meta
        hospital_name = meta.get('hospital_name')
        dept_name = meta.get('dept_name')
        doctor_name = meta.get('doctor_name')
        self.logger.info('>>>>>>正在抓取[{}]医院-[{}]医生详细信息>>>>>>'.format(
            hospital_name, doctor_name))
        dept_href_xpath = '//div[@class="position_one"]/span/a[last()]/@href'
        hospital_href_xpath = (
            '//div[@class="position_one"]/span/a[last()-1]/@href')
        try:
            # Doctor profile
            photo_src = response.xpath(
                '//div[@class="doctor_Img"]/img/@src').extract_first('')
            info = CommonLoader2(item=DoctorInfoItem(), response=response)
            info.add_value('doctor_name', doctor_name,
                           MapCompose(custom_remove_tags))
            info.add_value('dept_name', dept_name,
                           MapCompose(custom_remove_tags))
            info.add_value('hospital_name', hospital_name,
                           MapCompose(custom_remove_tags))
            info.add_xpath('sex', '//span[@class="doctor_grade"]/text()',
                           MapCompose(custom_remove_tags))
            info.add_xpath('doctor_level',
                           '//span[@class="object_grade"]/text()',
                           MapCompose(custom_remove_tags))
            info.add_xpath(
                'doctor_intro', '//div[@class="doctor_Text_Major"]',
                MapCompose(remove_tags, custom_remove_tags, match_special2))
            info.add_value('dataSource_from', self.data_source_from)
            info.add_value('crawled_url', response.url)
            info.add_value('update_time', now_day())
            info.add_value('doctor_id', response.url,
                           MapCompose(match_special2))
            info.add_xpath('dept_id', dept_href_xpath,
                           MapCompose(match_special2))
            info.add_xpath('hospital_id', hospital_href_xpath,
                           MapCompose(match_special2))
            info.add_value('doctor_photo_url', urljoin(self.host, photo_src))
            info.add_value('gmt_created', now_time())
            info.add_value('gmt_modified', now_time())
            yield info.load_item()

            # Doctor schedule
            self.logger.info(
                '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
            if not response.xpath(
                    '//span[@class="yuyue"]/a[contains(text(),"预约")]'):
                return
            period_labels = response.xpath(
                '//div[@class="whliesubscribe"]/ul/li[1]/div/'
                'span/text()').extract()
            date_cells = response.xpath(
                '//div[@class="datetable"]/ul/li[position()>1]/'
                'span[1]/text()').extract()
            # Join, clean once, then split again so each label lines up
            # with its column index.
            date_labels = custom_remove_tags(
                remove_tags(','.join(date_cells))).split(',')
            # XPath positions are 1-based, hence ``start=1``.
            for position, period in enumerate(period_labels, start=1):
                day_cells = response.xpath(
                    '//div[@class="whliesubscribe"]/ul/li[position()>1]'
                    '/div[{}]'.format(position))
                for col, cell in enumerate(day_cells):
                    if not cell.xpath('span/a'):
                        # No link in the cell: nothing bookable here.
                        continue
                    slot_date = date_labels[col]
                    reg_info = '{0}-{1}{2}'.format(now_year(), slot_date,
                                                   period)
                    reg = CommonLoader2(item=DoctorRegInfoItem(),
                                        response=response)
                    reg.add_value('doctor_name', doctor_name,
                                  MapCompose(custom_remove_tags))
                    reg.add_value('dept_name', dept_name,
                                  MapCompose(custom_remove_tags))
                    reg.add_value('hospital_name', hospital_name,
                                  MapCompose(custom_remove_tags))
                    reg.add_value('reg_info', reg_info)
                    reg.add_value('dataSource_from', self.data_source_from)
                    reg.add_value('crawled_url', response.url)
                    reg.add_value('update_time', now_day())
                    reg.add_value('doctor_id', response.url,
                                  MapCompose(match_special2))
                    reg.add_xpath('dept_id', dept_href_xpath,
                                  MapCompose(match_special2))
                    reg.add_xpath('hospital_id', hospital_href_xpath,
                                  MapCompose(match_special2))
                    reg.add_value('gmt_created', now_time())
                    reg.add_value('gmt_modified', now_time())
                    yield reg.load_item()
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
예제 #22
0
    def parse_doctor_info(self, response):
        """Parse a doctor list page, then follow pagination and other departments.

        Three kinds of requests are yielded:

        1. one request per doctor entry that has a link, handled by
           ``parse_doctor_info_detail`` and carrying a partially filled
           loader plus ``dept_name`` / ``doctor_level`` in ``meta``;
        2. the next list page of the same department, when a "下一页" link
           with an ``onclick`` handler is present;
        3. on the first invocation only (tracked by
           ``self.doctor_crawled_cnt``), one request per other department
           link found in the side menu.

        The ``Referer`` header is written to ``self.headers`` *before* each
        request is constructed: Scrapy copies the headers mapping when the
        ``Request`` is built, so assigning it afterwards would not reach the
        request being yielded.
        """
        self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
        doctor_links = response.xpath('//li[@class="content column-num3"]')
        for each_doctor_link in doctor_links:
            # NOTE(review): level and department are read from the same node
            # and passed on as the same list; confirm the callback splits it.
            doctor_level = dept_name = each_doctor_link.xpath(
                'div[2]/ul/li[2]/text()').extract()
            doctor_link = each_doctor_link.xpath(
                'div[1]/div/a/@href').extract_first('')
            loader = CommonLoader2(item=DoctorInfoItem(),
                                   selector=each_doctor_link)
            loader.add_xpath('doctor_name', 'div[1]/div/a/@title',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', self.hospital_name)
            if doctor_link:
                self.headers['Referer'] = response.url
                yield Request(
                    urljoin(self.host, doctor_link),
                    headers=self.headers,
                    callback=self.parse_doctor_info_detail,
                    dont_filter=True,
                    meta={
                        'loader': loader,
                        'dept_name': dept_name,
                        'doctor_level': doctor_level
                    })

        # Next page of the doctor list.
        # Department id: the paginated URL form takes precedence over the
        # plain ``...pmcId=<id>.html`` form.
        dept_id_match = (
            re.search(r'.*pmcId=(.*?)&pageNo_FrontProducts.*', response.url)
            or re.search(r'.*pmcId=(.*?).html$', response.url))
        dept_id = dept_id_match.group(1) if dept_id_match else ''

        # Page number: first argument of the "next page" onclick call.
        page_no = response.xpath(
            '//a[contains(text(),"下一页")]/@onclick').extract_first('')
        if page_no:
            page_no = re.search(r'\((.*?)\)', page_no)
            if page_no:
                page_no = page_no.group(1).split(',')[0]
                next_page = 'http://www.slbjy.cn/expert_list/pmcId={}&pageNo_FrontProducts_list01-1482202374862={}' \
                            '&pageSize_FrontProducts_list01-1482202374862=12.html'
                next_page_link = next_page.format(dept_id, page_no)
                self.headers['Referer'] = response.url
                yield Request(next_page_link,
                              headers=self.headers,
                              callback=self.parse_doctor_info)

        # Doctors of the other departments.
        self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
        doctor_links = response.xpath(
            '//div[@class="menu-first"]/ul/li[position()>1]/a/@href').extract(
            )
        # Only the first call fans out to the other departments; later calls
        # would otherwise re-queue the whole menu from every page.
        self.doctor_crawled_cnt += 1
        if doctor_links and self.doctor_crawled_cnt == 1:
            for each_doctor_link in doctor_links:
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, each_doctor_link),
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              dont_filter=True)
예제 #23
0
 def parse_doctor_website(self, response):
     """Scrape a doctor's personal page and fan out to hospital/department data.

     Yields, in order: the doctor item; a request for the hospital detail
     page, handled by ``parse_hospital_detail`` (only when the hospital id is
     not yet in ``self.crawled_ids``); and a department item (only when the
     department id is not yet in ``self.crawled_dept``).
     """
     self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')
     hos_name_xpath = '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()'
     dept_name_xpath = '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()'

     # Doctor details
     doctor_loader = YiHuLoader(item=DoctorInfoItem(), response=response)
     doctor_fields = (
         ('doctor_name', '//span[@class="c-f22 c-333"]/text()'),
         ('dept_name', dept_name_xpath),
         ('hospital_name', hos_name_xpath),
         ('doctor_level', '//div[@class="doctor-info"]/dl/dd[1]/text()'),
         ('doctor_intro',
          '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()'),
         ('doctor_goodAt',
          '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()'),
     )
     for field, xpath in doctor_fields:
         doctor_loader.add_xpath(field, xpath)
     doctor_loader.add_value('update_time', now_day())
     yield doctor_loader.load_item()

     # Links to the hospital and department pages
     hos_link = response.xpath(
         '//div[@class="doctor-info"]/dl/dd[2]/a[1]/@href').extract_first('')
     dept_link = response.xpath(
         '//div[@class="doctor-info"]/dl/dd[2]/a[2]/@href').extract_first('')

     # Hospital details: requested once per hospital id
     hos_match = re.search(r'/sc/(.*?).shtml', hos_link) if hos_link else None
     if hos_match and hos_match.group(1) not in self.crawled_ids:
         self.crawled_ids.add(hos_match.group(1))
         # The intro and contact pages share the link's path apart from the
         # '/sc/' segment.
         hos_intro_link = re.sub(r'/sc/', '/detail/', hos_link)
         hos_con_link = re.sub(r'/sc/', '/contact/', hos_link)
         hos_loader = YiHuLoader(item=HospitalInfoItem(), response=response)
         hos_loader.add_xpath('hospital_name', hos_name_xpath)
         # The current URL travels in ``meta`` under the key 'Referer'; it is
         # not added to the HTTP headers here.
         yield Request(hos_intro_link,
                       headers=self.headers,
                       callback=self.parse_hospital_detail,
                       meta={
                           'loader': hos_loader,
                           'contact_hos_link': hos_con_link,
                           'Referer': response.url
                       })

     # Department record: stored once per department id
     dept_match = (re.search(r'/arrange/(.*?).shtml', dept_link)
                   if dept_link else None)
     if dept_match and dept_match.group(1) not in self.crawled_dept:
         self.crawled_dept.add(dept_match.group(1))
         dept_loader = YiHuLoader(item=HospitalDepItem(), response=response)
         dept_loader.add_xpath('dept_name', dept_name_xpath)
         dept_loader.add_xpath('hospital_name', hos_name_xpath)
         dept_loader.add_value('update_time', now_day())
         yield dept_loader.load_item()