Exemplo n.º 1
0
    def parse_doctor_info_detail(self, response):
        """Yield the doctor's profile item, then one schedule item per visit slot.

        Doctor name, department, title and hospital name are read from
        ``response.meta``. The registration fee and the visiting dates are
        parsed out of the ``doc_yuyue_time`` spans on the page.

        Slots whose visiting date cannot be parsed are skipped; formatting
        them would emit a meaningless ``"<year>-None"`` value in ``reg_info``.

        Any exception raised while parsing is logged and swallowed.
        """
        self.logger.info('>>>>>>正在抓取医生详细信息>>>>>>')
        try:
            doctor_name = response.meta.get('doctor_name')
            dept_name = response.meta.get('dept_name')
            doctor_level = response.meta.get('doctor_level')
            hospital_name = response.meta.get('hospital_name')

            # The fee is embedded in the title attribute of the first booking link.
            fee_titles = response.xpath('//td/span[@class="doc_yuyue_time"]/a/@title').extract()
            diagnosis_amt = None
            if fee_titles:
                fee_match = re.search(r'.*挂号费:(.*?)$', fee_titles[0], S)
                if fee_match:
                    diagnosis_amt = fee_match.group(1)

            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
            loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
            loader.add_value('doctor_level', doctor_level, MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('doctor_intro',
                             '//div[@class="zrys"]/dl/dd',
                             MapCompose(remove_tags, custom_remove_tags, clean_info2))
            loader.add_value('diagnosis_amt', diagnosis_amt)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            yield loader.load_item()

            # Doctor schedule: one item per booking span with a parseable date.
            for slot_html in response.xpath('//td/span[@class="doc_yuyue_time"]').extract():
                date_match = re.search(r'.*出诊时间:(.*?)\n', slot_html, S)
                if not date_match:
                    # No visiting date in this span; nothing meaningful to emit.
                    continue
                # Prefix the current year and rewrite the 月/日 markers as dashes.
                reg_info = '{0}-{1}'.format(now_year(), date_match.group(1)).replace('月', '-').replace('日', '')
                reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
                reg_loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
                reg_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                reg_loader.add_xpath('hospital_name',
                                     '//div[@class="yy_til"]/h2/text()',
                                     MapCompose(custom_remove_tags))
                reg_loader.add_value('reg_info', reg_info, MapCompose(custom_remove_tags))
                reg_loader.add_value('dataSource_from', self.data_source_from)
                reg_loader.add_value('crawled_url', response.url)
                reg_loader.add_value('update_time', now_day())
                yield reg_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
Exemplo n.º 2
0
 def parse_doctor_info_detail(self, response):
     """Yield the doctor's profile item, then one schedule item per marked cell.

     The profile fields come from the ``viewexpert_demo`` paragraphs. The
     schedule comes from the table inside ``viewexpert_detail``: the header
     row is skipped, the first cell of each row supplies the time label, and
     each remaining cell is matched to a weekday by its column position. A
     cell produces an item only when it contains an ``<img>``.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     name_xpath = '//div[@class="viewexpert_demo"]/p[1]/text()'
     dept_xpath = '//div[@class="viewexpert_demo"]/p[3]/text()'

     doctor_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     doctor_loader.add_xpath('doctor_name', name_xpath,
                             MapCompose(custom_remove_tags))
     doctor_loader.add_xpath('dept_name', dept_xpath,
                             MapCompose(custom_remove_tags, match_special))
     doctor_loader.add_value('hospital_name', self.hospital_name)
     doctor_loader.add_xpath(
         'doctor_level', '//div[@class="viewexpert_demo"]/p[2]/text()',
         MapCompose(custom_remove_tags, match_special, match_special2))
     doctor_loader.add_xpath('doctor_intro',
                             '//div[@class="viewexpert_detail"]',
                             MapCompose(remove_tags, custom_remove_tags))
     doctor_loader.add_xpath('doctor_goodAt',
                             '//div[@class="viewexpert_demo"]/p[4]/text()',
                             MapCompose(custom_remove_tags))
     doctor_loader.add_value('update_time', now_day())
     yield doctor_loader.load_item()

     # Doctor schedule information.
     rows_xpath = '//div[@class="viewexpert_detail"]/table/tr[position()>1]'
     schedule_rows = response.xpath(rows_xpath)
     any_marked_cell = response.xpath(rows_xpath + '/td/img')
     weekdays = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
     if not any_marked_cell:
         return

     for row in schedule_rows:
         time_label = row.xpath('td[1]/text()').extract_first('')
         for column, cell in enumerate(row.xpath('td[position()>1]')):
             weekday = weekdays[column]
             if not cell.xpath('img'):
                 continue
             slot_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                         response=response)
             slot_loader.add_xpath('doctor_name', name_xpath,
                                   MapCompose(custom_remove_tags))
             slot_loader.add_xpath(
                 'dept_name', dept_xpath,
                 MapCompose(custom_remove_tags, match_special))
             slot_loader.add_value('hospital_name', self.hospital_name)
             slot_loader.add_value('reg_info',
                                   '{0}{1}'.format(weekday, time_label))
             slot_loader.add_value('update_time', now_day())
             yield slot_loader.load_item()
Exemplo n.º 3
0
 def parse_doctor_info(self, response):
     """Yield doctor items parsed from the two tables inside ``div.text``.

     Table 1 (rows after the ninth): span texts that contain a time-of-day
     label or an ideographic space are excluded by the XPath. A row with
     three remaining texts is read as (department, doctor, title) and its
     department is remembered; a row with two texts is read as
     (doctor, title) and reuses the remembered department.

     Table 2 (rows after the first): the first three cells are read as
     (doctor, title, department).

     Rows whose texts include '名医堂' carry no title and are handled
     separately. The check runs against the extracted strings: testing a
     string against the ``SelectorList`` itself never matches, which left
     this branch unreachable.

     Any exception raised while parsing is logged and swallowed.
     """
     self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
     tr_res1 = response.xpath('//div[@class="text"]/table[1]/tbody/tr[position()>9]')
     tr_res2 = response.xpath('//div[@class="text"]/table[2]/tbody/tr[position()>1]')
     dept_name = ''
     try:
         # Doctor information from the first table.
         for each_res in tr_res1:
             # Extract once per row instead of on every field access.
             texts = each_res.xpath('td/span[not(contains(text(),"全天")) '
                                    'and not(contains(text(),"上午")) '
                                    'and not(contains(text(),"下午")) '
                                    'and not(contains(text(),"\u3000"))]/text()').extract()
             loader = CommonLoader2(item=DoctorInfoItem(), response=response)
             loader.add_value('hospital_name', self.hospital_name)
             if '名医堂' in texts and len(texts) >= 2:
                 loader.add_value('dept_name', texts[0])
                 loader.add_value('doctor_name', texts[1])
             elif len(texts) == 3:
                 # Remember the department for following rows that omit it.
                 dept_name = texts[0].replace(' ', '')
                 loader.add_value('dept_name', dept_name)
                 loader.add_value('doctor_name', texts[1])
                 loader.add_value('doctor_level', texts[2])
             elif len(texts) == 2:
                 loader.add_value('dept_name', dept_name)
                 loader.add_value('doctor_name', texts[0])
                 loader.add_value('doctor_level', texts[1])
             else:
                 continue
             loader.add_value('update_time', now_day())
             yield loader.load_item()

         # Doctor information from the second table.
         for each_res in tr_res2:
             texts = each_res.xpath('td[position()<4]/span/text()').extract()
             loader = CommonLoader2(item=DoctorInfoItem(), response=response)
             loader.add_value('hospital_name', self.hospital_name)
             if '名医堂' in texts and len(texts) >= 2:
                 loader.add_value('dept_name', texts[1])
                 loader.add_value('doctor_name', texts[0])
             elif len(texts) == 3:
                 loader.add_value('dept_name', texts[2])
                 loader.add_value('doctor_name', texts[0])
                 loader.add_value('doctor_level', texts[1])
             else:
                 continue
             loader.add_value('update_time', now_day())
             yield loader.load_item()
     except Exception as e:
         self.logger.error('>>>>>>抓取过程中出错了,原因是:{}>>>>>>'.format(repr(e)))
Exemplo n.º 4
0
 def parse_area_detail(self, response):
     """Yield one regional-ranking item per entry in the JSON ``rows`` list.

     The response body is parsed as JSON; a missing ``rows`` key yields
     nothing. ``subject_name`` from ``response.meta`` is stored in the
     ``hospital_pro`` field of every item.
     """
     self.logger.info('>>>>>>正在抓取地区排行详细信息……>>>>>>')
     subject_name = response.meta.get('subject_name')
     payload = json.loads(response.text)
     for row in payload.get('rows', []):
         ranking_loader = CommonLoader2(item=AreaRankingItem(), response=response)
         field_values = (
             ('subject', row.get('GB_NAME')),
             ('hospital_pro', subject_name),
             ('ranking', row.get('SHOW_RANK')),
             ('hospital_name', row.get('HOSPNAME')),
             ('create_time', now_day()),
             ('update_time', now_day()),
         )
         for field, value in field_values:
             ranking_loader.add_value(field, value)
         yield ranking_loader.load_item()
Exemplo n.º 5
0
 def parse_doctor_info_detail(self, response):
     """Yield a doctor item built from request meta and two page sections.

     Name, department, title and hospital come from ``response.meta``; the
     introduction and speciality texts are taken from the third and fourth
     ``div`` under ``info-main``.
     """
     meta = response.meta
     hospital_name = meta.get('hospital_name')
     self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
     dept_name = meta.get('dept_name')
     doctor_level = meta.get('doctor_level')
     doctor_name = meta.get('doctor_name')

     item_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     item_loader.add_value('doctor_name', doctor_name,
                           MapCompose(custom_remove_tags))
     item_loader.add_value('dept_name', dept_name)
     item_loader.add_value('hospital_name', hospital_name,
                           MapCompose(custom_remove_tags))
     item_loader.add_value('doctor_level', doctor_level)
     page_fields = (
         ('doctor_intro', '//div[@class="info-main"]/div[3]/span'),
         ('doctor_goodAt', '//div[@class="info-main"]/div[4]/span'),
     )
     for field, xpath in page_fields:
         item_loader.add_xpath(
             field, xpath,
             MapCompose(remove_tags, custom_remove_tags, match_special))
     item_loader.add_value('dataSource_from', self.data_source_from)
     item_loader.add_value('update_time', now_day())
     yield item_loader.load_item()
Exemplo n.º 6
0
 def parse(self, response):
     """Yield the hospital item, then a request for the department page.

     Most fields are fixed values; only ``hospital_intro`` is scraped from
     the page. The follow-up request targets ``self.dep_link`` and stores
     the current URL under the ``Referer`` key of its meta.
     """
     self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
     info_loader = MedicalMapLoader(item=HospitalInfoItem(), response=response)

     # Fixed values added before the scraped introduction.
     for field, value in (
             ('hospital_name', self.hospital_name),
             ('consulting_hour', '普通门诊上午_8:00-12:00;普通门诊下午13:00-16:30'),
             ('hospital_level', '三级乙等'),
             ('hospital_type', '公立'),
             ('hospital_category', '综合医院'),
             ('hospital_pro', '四川省'),
             ('hospital_city', '成都市'),
             ('hospital_county', '金堂县'),
             ('hospital_phone',
              '医院服务电话_028-84902884;服务质量监督投诉电话_028-84932532;'
              '急诊急救电话_18181938532;产科急救电话_18181938532;'
              '医保结算电话_028-84932721;'
              '预约挂号电话_028-84931443;预约挂号电话_028-84902884;预约挂号电话_028-61568616'),
     ):
         info_loader.add_value(field, value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="baseRight-intro"]/p[position()<7]/span/text()')

     # Fields with no known value are still passed through as empty strings.
     for blank_field in ('is_medicare', 'medicare_type', 'vaccine_name',
                         'is_cpc', 'is_bdc', 'cooperative_business',
                         'hospital_district'):
         info_loader.add_value(blank_field, '')

     info_loader.add_value('registered_channel', '微信公众号_' + self.hospital_name)
     info_loader.add_value('dataSource_from', '官网:http://www.jintangyy.com/index.aspx')
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()

     yield Request(self.dep_link,
                   callback=self.parse_hospital_dep,
                   meta={'Referer': response.url})
Exemplo n.º 7
0
 def parse_doctor_reg_info(self, response):
     """Yield one schedule item per work slot in the JSON schedule payload.

     Reads ``data.selWork`` (a list of days, each with ``dutydate`` and
     ``selWorks``) and the first entry of ``data.doctor`` for the doctor,
     hospital and department names. ``reg_info`` is the duty date followed
     by a time-of-day label derived from the numeric ``dutytime`` code.

     Raises:
         KeyError / IndexError: if the payload lacks the expected keys or
             the doctor list is empty.
         ValueError: if ``dutytime`` is not convertible to ``int``.
     """
     self.logger.info('>>>>>>正在抓取医生排班信息……')
     doctor_reg_info = json.loads(response.text)
     data = doctor_reg_info['data']
     reg_info_list = data['selWork']
     doctor = data['doctor'][0]
     doctor_name = doctor.get('doctorName', '')
     hospital_name = doctor.get('hospitalName', '')
     dept_name = doctor.get('deptName', '')

     # Codes 1 and 3 were both labelled as morning, so afternoon slots were
     # reported as morning. NOTE(review): 3 is taken to mean afternoon;
     # confirm against the API. Any other code keeps the evening fallback
     # (code 4 was observed as evening for doctorId 3329).
     duty_labels = {1: '上午', 3: '下午'}

     for each_reg_info in reg_info_list:
         duty_date = each_reg_info['dutydate']
         for each_work_info in each_reg_info['selWorks']:
             duty_time = duty_labels.get(int(each_work_info['dutytime']), '晚上')
             reg_info = '{0}{1}'.format(duty_date, duty_time)
             loader = CommonLoader(item=DoctorRegInfoItem(), response=response)
             loader.add_value('doctor_name', doctor_name)
             loader.add_value('hospital_name', hospital_name)
             loader.add_value('dept_name', dept_name)
             loader.add_value('reg_info', reg_info)
             loader.add_value('update_time', now_day())
             yield loader.load_item()
Exemplo n.º 8
0
 def parse_doctor_info_detail(self, response):
     """Complete and yield the doctor item whose loader arrives in meta.

     Department and title prefer the values from ``response.meta``. When
     those are empty, the department falls back to the last breadcrumb link
     and the title to the bold text of the first detail paragraph. The
     introduction is the second detail paragraph.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     item_loader = response.meta['loader']
     meta_dept = custom_remove_tags(''.join(response.meta['dept_name']))

     detail_root = '//div[@class="FrontProducts_detail02-1482202997396_htmlbreak"]'
     page_level = response.xpath(
         detail_root + '/p[1]/strong/text()').extract_first('')
     meta_level = response.meta['doctor_level']

     crumb_text = response.xpath(
         '//div[@id="FrontPublic_breadCrumb01-1482202386120"]/div/'
         'a[last()]/text()').extract_first('')
     page_dept = crumb_text.replace('专家', '').replace('类', '科')

     if meta_dept:
         dept_name = re.sub(r'中医医师|中西医医师', '中医科', meta_dept)
     else:
         dept_name = page_dept

     if meta_level:
         doctor_level = custom_remove_tags(''.join(meta_level))
     else:
         doctor_level = page_level

     intro_html = response.xpath(detail_root + '/p[2]').extract_first('')

     item_loader.add_value('dept_name', dept_name,
                           MapCompose(custom_remove_tags, filter_info3))
     item_loader.add_value('doctor_level', doctor_level,
                           MapCompose(filter_info4, custom_remove_tags))
     item_loader.add_value('doctor_intro', intro_html,
                           MapCompose(remove_tags, custom_remove_tags))
     item_loader.add_value('update_time', now_day())
     yield item_loader.load_item()
Exemplo n.º 9
0
 def parse(self, response):
     """Yield the hospital information item.

     All fields except ``hospital_intro`` are fixed values; the introduction
     is scraped from the ``div`` children of ``article.content``.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     for field, value in (
             ('hospital_name', self.hospital_name),
             ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
             ('hospital_level', '二级甲等'),
             ('hospital_type', '公立'),
             ('hospital_category', '综合医院'),
             ('hospital_addr', '成都市东三环龙泉驿区十陵街道江华路8号'),
             ('hospital_pro', '四川省'),
             ('hospital_city', '成都市'),
             ('hospital_county', '龙泉驿区'),
             ('hospital_phone',
              '急救电话_028-84615120;电话咨询_028-84604546转科室;'
              '24小时医护热线_028-84615789'),
     ):
         info_loader.add_value(field, value)

     info_loader.add_xpath('hospital_intro', '//article[@class="content"]/div',
                           MapCompose(remove_tags, custom_remove_tags))

     for field, value in (
             ('is_medicare', '是'),
             ('medicare_type', '成都市医保、工伤保险定点医院'),
             ('registered_channel', '官网或官方微信公众号(工作日),法定节假日电话预约'),
             ('dataSource_from', '医院官网'),
             ('update_time', now_day()),
     ):
         info_loader.add_value(field, value)

     yield info_loader.load_item()
Exemplo n.º 10
0
 def parse_doctor_reg_info(self, response):
     """Yield one schedule item per table cell that contains an ``<img>``.

     The header row is skipped. In each remaining row the first cell gives
     the row label, and every following cell is matched by position to one
     of three shift names. Doctor and department names come from
     ``response.meta``.
     """
     self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
         self.hospital_name))
     doctor_name = response.meta['doctor_name']
     dept_name = response.meta['dept_name']
     schedule_rows = response.xpath('//table/tr[position()>1]')
     any_marked_cell = response.xpath('//table/tr[position()>1]/td/img')
     shift_names = ['上午', '下午', '晚班']
     if not any_marked_cell:
         return

     for row in schedule_rows:
         row_label = row.xpath('td[1]/text()').extract_first('')
         for column, cell in enumerate(row.xpath('td[position()>1]')):
             shift = shift_names[column]
             if not cell.xpath('img'):
                 continue
             slot_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                         response=response)
             slot_loader.add_value('doctor_name', doctor_name)
             slot_loader.add_value(
                 'dept_name', dept_name,
                 MapCompose(custom_remove_tags, match_special))
             slot_loader.add_value('hospital_name', self.hospital_name)
             slot_loader.add_value('reg_info',
                                   '{0}{1}'.format(row_label, shift))
             slot_loader.add_value('update_time', now_day())
             yield slot_loader.load_item()
Exemplo n.º 11
0
 def parse_doctor_info_detail(self, response):
     """Yield the doctor item, then a request for the doctor's schedule.

     Name, department and title come from ``response.meta``; introduction
     and speciality are both scraped from the ``about-right-b`` paragraphs.
     The schedule URL reuses this page's query string with ``&id`` renamed
     to ``&kid``; no request is issued if the URL has no query string.

     Raises:
         KeyError: if ``doctor_name``, ``dept_name`` or ``doctor_level`` is
             missing from ``response.meta``.
     """
     self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
         self.hospital_name))
     doctor_name = response.meta['doctor_name']
     dept_name = response.meta['dept_name']
     doctor_level = response.meta['doctor_level']
     loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     loader.add_value('doctor_name', doctor_name)
     loader.add_value('dept_name', dept_name,
                      MapCompose(custom_remove_tags, match_special))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_value('doctor_level', doctor_level,
                      MapCompose(custom_remove_tags, match_special))
     # NOTE(review): intro and speciality use the same XPath and therefore
     # receive the same text; confirm this is intended.
     loader.add_xpath('doctor_intro', '//div[@id="about-right-b"]/p',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_xpath('doctor_goodAt', '//div[@id="about-right-b"]/p',
                      MapCompose(remove_tags, custom_remove_tags))
     loader.add_value('update_time', now_day())
     doctor_item = loader.load_item()
     yield doctor_item

     # Doctor schedule information.
     params = re.search(r'.*\?(.*?)$', response.url)
     reg_url = 'http://www.scpz120.com/ajax/Doctor.aspx?'
     if params:
         reg_link = '{0}{1}'.format(reg_url,
                                    params.group(1).replace('&id', '&kid'))
         # Request copies the headers mapping when it is constructed, so the
         # Referer has to be set first for this request to carry it.
         self.headers['Referer'] = response.url
         yield Request(reg_link,
                       headers=self.headers,
                       callback=self.parse_doctor_reg_info,
                       meta={
                           'doctor_name': doctor_name,
                           'dept_name': dept_name
                       })
Exemplo n.º 12
0
 def parse(self, response):
     """Yield the hospital information item.

     All fields except ``hospital_intro`` are fixed values; the introduction
     is scraped from the element with id ``info_txt``.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     for field, value in (
             ('hospital_name', self.hospital_name),
             ('consulting_hour', ''),
             ('hospital_level', '三级乙等'),
             ('hospital_type', '公立'),
             ('hospital_category', '妇幼保健院'),
             ('hospital_addr', '成都市温江区万春路140'),
             ('hospital_pro', '四川省'),
             ('hospital_city', '成都市'),
             ('hospital_county', ''),
             ('hospital_phone',
              '24小时急救电话_028-82723131;咨询电话_围产期保健_028-82715727;'
              '咨询电话_妇科门诊_028-82711383;咨询电话_儿童保健_028-82711527;'
              '咨询电话_婚检科_028-82720337;'
              '投诉电话_028-82724901(上班时间);投诉电话_13688488598(下班时间)'),
     ):
         info_loader.add_value(field, value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@id="info_txt"]',
                           MapCompose(remove_tags, custom_remove_tags))

     for field, value in (
             ('registered_channel', '电话预约;挂号窗口;医院微信公众号'),
             ('dataSource_from', '医院官网'),
             ('update_time', now_day()),
     ):
         info_loader.add_value(field, value)

     yield info_loader.load_item()
Exemplo n.º 13
0
    def parse_hospital_dep_detail(self, response):
        """Yield this department's item and, on the first call only, requests
        for the other departments listed on the page.

        ``dept_type`` and ``dept_name`` are both filled from the page title.
        ``self.dept_crawled_cnt`` is incremented on every call; the links in
        ``ul.list2`` (first entry skipped) are followed only when the counter
        equals 1 after the increment, and each of those requests is handled
        by this same callback.
        """
        self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
            self.hospital_name))
        title_xpath = '//div[@class="title"]/h3/text()'
        detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
        for field in ('dept_type', 'dept_name'):
            detail_loader.add_xpath(field, title_xpath,
                                    MapCompose(custom_remove_tags))
        detail_loader.add_value('hospital_name', self.hospital_name)
        detail_loader.add_xpath('dept_info', '//div[@class="content"]',
                                MapCompose(remove_tags, custom_remove_tags))
        detail_loader.add_value('update_time', now_day())
        yield detail_loader.load_item()

        # Other departments.
        self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
        other_links = response.xpath(
            '//ul[@class="list2"]/li[position()>1]/a/@href').extract()
        self.dept_crawled_cnt += 1

        if not other_links or self.dept_crawled_cnt != 1:
            return
        for href in other_links:
            yield Request(urljoin(self.host, href),
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail,
                          dont_filter=True,
                          meta={'Referer': response.url})
0
 def parse_doctor_info_detail(self, response):
     """Yield a doctor item built from request meta and the detail page.

     The speciality text prefers the ``intro_more`` block, with tags removed
     and the '[关闭]' marker stripped. When that block is empty it falls back
     to the first ``dd`` text containing '擅长领域'. Any exception raised
     while building the item is logged and swallowed.
     """
     hospital_name = response.meta.get('hospital_name')
     self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
     try:
         doctor_name = response.meta.get('doctor_name')
         dept_name = response.meta.get('dept_name')
         doctor_level = response.meta.get('doctor_level')

         more_blocks = response.xpath('//div[@class="intro_more"]').extract()
         more_text = remove_tags(''.join(more_blocks))
         dd_text = response.xpath(
             '//dd[contains(text(),"擅长领域")]/text()').extract_first('')
         if more_text:
             doctor_good_at = more_text.replace('[关闭]', '')
         else:
             doctor_good_at = dd_text

         item_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
         for field, raw in (('doctor_name', doctor_name),
                            ('dept_name', dept_name),
                            ('hospital_name', hospital_name),
                            ('doctor_level', doctor_level)):
             item_loader.add_value(field, raw, MapCompose(custom_remove_tags))
         item_loader.add_xpath('doctor_intro',
                               '//div[@class="hos-guide-box1"]',
                               MapCompose(remove_tags, custom_remove_tags))
         item_loader.add_value(
             'doctor_goodAt', doctor_good_at,
             MapCompose(custom_remove_tags, match_special, clean_info2))
         item_loader.add_value('dataSource_from', self.data_source_from)
         item_loader.add_value('crawled_url', response.url)
         item_loader.add_value('update_time', now_day())
         yield item_loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
Exemplo n.º 15
0
 def parse_hospital_dep(self, response):
     """Yield one department item per entry of ``data.subDepList`` in the JSON body.

     Hospital name and department type come from ``response.meta``. Any
     exception, including a missing ``data`` or ``subDepList`` key, is
     logged and swallowed.
     """
     hospital_name = response.meta.get('hospital_name')
     dept_type = response.meta.get('dept_type')
     self.logger.info('>>>>>>正在抓取:[{}]医院-[{}]科室信息>>>>>>'.format(hospital_name, dept_type))
     try:
         payload = json.loads(response.text)
         sub_departments = payload.get('data').get('subDepList')
         for sub_dept in sub_departments:
             sub_name = sub_dept.get('name')
             sub_id = sub_dept.get('id')
             item_loader = CommonLoader2(item=HospitalDepItem(), response=response)
             # Text fields pass through the tag-stripping processor.
             for field, raw in (('dept_name', sub_name),
                                ('dept_type', dept_type),
                                ('hospital_name', hospital_name)):
                 item_loader.add_value(field, raw, MapCompose(custom_remove_tags))
             for field, value in (('dataSource_from', self.data_source_from),
                                  ('crawled_url', response.url),
                                  ('update_time', now_day()),
                                  ('dept_id', sub_id),
                                  ('dept_url', response.url),
                                  ('gmt_created', now_time()),
                                  ('gmt_modified', now_time())):
                 item_loader.add_value(field, value)
             yield item_loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
Exemplo n.º 16
0
 def parse_hospital_dep(self, response):
     """Yield a department item and a follow-up request for each listed department.

     The hospital name is read from the page header. Each ``li`` under
     ``dd.ks-2`` supplies the department text, used for both ``dept_type``
     and ``dept_name``, and a link. Entries without a link yield only the
     item; the others also yield a request handled by
     ``self.parse_dept_link``.
     """
     self.logger.info('>>>>>>正在抓取科室信息……')
     hospital_name = response.xpath('//div[@class="hos-info"]/h1/text()').extract_first('')
     all_dept_links = response.xpath('//dd[@class="ks-2"]/ul/li')
     self.logger.info('{}:共有{}个科室'.format(hospital_name, len(all_dept_links)))
     for each_dept_link in all_dept_links:
         # Department information.
         dep_loader = YiHuLoader(item=HospitalDepItem(), selector=each_dept_link)
         dep_loader.add_xpath('dept_type', 'a/text()')
         dep_loader.add_xpath('dept_name', 'a/text()')
         dep_loader.add_value('hospital_name', hospital_name)
         dep_loader.add_value('update_time', now_day())
         yield dep_loader.load_item()

         # Doctors of this department.
         dept_link = each_dept_link.xpath('a/@href').extract_first('')
         if dept_link:
             # Request copies the headers mapping at construction time, so
             # the Referer is set immediately before building each request.
             # Setting it afterwards left the first request without it.
             self.headers['Referer'] = response.url
             yield Request(urljoin(self.host, dept_link),
                           headers=self.headers,
                           callback=self.parse_dept_link)
Exemplo n.º 17
0
 def parse_dept_info(self, response):
     """Follow sub-department links, or yield the department itself if none exist.

     When the page has ``div.pic`` blocks, each block's link (if present)
     is requested with ``self.parse_dept_detail`` as callback, carrying
     ``dep_type`` and the current URL in meta. When there are no such
     blocks, a single department item is yielded with only the type,
     hospital name and update time set.
     """
     dep_type = response.meta['dep_type']
     self.logger.info('正在抓取[{}]科室信息'.format(custom_remove_tags(dep_type)))
     sub_dept_blocks = response.xpath('//div[@class="pic"]')

     if not sub_dept_blocks:
         # First-level department without second-level departments.
         dept_loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
         dept_loader.add_value('dept_type', dep_type)
         dept_loader.add_value('hospital_name', self.hospital_name)
         dept_loader.add_value('update_time', now_day())
         yield dept_loader.load_item()
         return

     # First-level department with second-level departments.
     for block in sub_dept_blocks:
         href = block.xpath('a/@href').extract_first('')
         if not href:
             continue
         yield Request(urljoin(self.host, href),
                       headers=self.headers,
                       callback=self.parse_dept_detail,
                       meta={'dep_type': dep_type,
                             'Referer': response.url})
Exemplo n.º 18
0
 def parse(self, response):
     """Yield the hospital information item.

     All fields except ``hospital_intro`` are fixed values; the introduction
     is scraped from the element with id ``about-right-b``.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     for field, value in (
             ('hospital_name', self.hospital_name),
             ('consulting_hour',
              '急诊和临床住院科室_24小时值班;'
              '行政及其它_上午8:00~12:00,下午14:00~17:00'),
             ('hospital_level', '三级乙等'),
             ('hospital_type', '公立'),
             ('hospital_category', '中医医院'),
             ('hospital_addr', '四川省彭州市天彭镇南大街396号'),
             ('hospital_pro', '四川省'),
             ('hospital_city', '彭州市'),
             ('hospital_county', ''),
             ('hospital_phone', '028-83701908'),
     ):
         info_loader.add_value(field, value)

     info_loader.add_xpath(
         'hospital_intro', '//div[@id="about-right-b"]',
         MapCompose(remove_tags, custom_remove_tags, clean_info))

     for field, value in (
             ('registered_channel', '官网'),
             ('dataSource_from', '医院官网'),
             ('update_time', now_day()),
     ):
         info_loader.add_value(field, value)

     yield info_loader.load_item()
Exemplo n.º 19
0
    def parse_hospital_dep(self, response):
        """Request the detail page of every department link in the ``one_2`` table.

        A loader is pre-filled for each link with the department name,
        hospital name, data source and update time, and is handed to
        ``self.parse_hospital_dep_detail`` through the request meta. Links
        with an empty name or href are skipped. Any exception is logged and
        swallowed.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
        try:
            anchors = response.xpath(
                '//div[@id="one_2"]/div/div/table/tbody/tr/td[@class="contentTd"]/a'
            )
            for anchor in anchors:
                dept_name = anchor.xpath('text()').extract_first('')
                href = anchor.xpath('@href').extract_first('')

                partial_loader = CommonLoader2(item=HospitalDepItem(),
                                               response=response)
                for field, raw in (('dept_name', dept_name),
                                   ('hospital_name', hospital_name)):
                    partial_loader.add_value(field, raw,
                                             MapCompose(custom_remove_tags))
                partial_loader.add_value('dataSource_from',
                                         self.data_source_from)
                partial_loader.add_value('update_time', now_day())

                # Department detail page.
                if not (dept_name and href):
                    continue
                self.headers['Referer'] = response.url
                request_meta = {
                    'dept_name': dept_name,
                    'dept_loader': partial_loader,
                    'hospital_name': hospital_name
                }
                yield Request(urljoin(self.host, href),
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail,
                              meta=request_meta,
                              dont_filter=True)
        except Exception as e:
            self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
Exemplo n.º 20
0
 def parse_dept_info(self, response):
     """Yield department items and one doctor-list request per department.

     ``response`` is parsed as JSON.  The departments are read from
     ``responseData.data.data.depart`` and the hospital name from
     ``responseData.data.data.hospital.hospitalName``.  For every
     department with a non-empty ``deptid`` a form request to
     ``self.doctor_link`` is yielded, handled by ``parse_doctor_info``.
     """
     self.logger.info('>>>>>>正在抓取医院科室相关信息……')
     dept_info = json.loads(response.text)
     payload = dept_info['responseData']['data']['data']
     for each_dept in payload['depart']:
         loader = CommonLoader(item=HospitalDepItem(), response=response)
         loader.add_value('dept_name', each_dept['deptname'])
         loader.add_value('hospital_name', payload['hospital']['hospitalName'])
         loader.add_value('update_time', now_day())
         yield loader.load_item()

         dept_id = each_dept.get('deptid', '')
         if not dept_id:
             continue

         # A request copies its headers when it is constructed, so the
         # Referer has to be in place BEFORE the request is built; setting
         # it afterwards (as the code used to) never reached this request.
         self.headers['Referer'] = 'http://www.scgh114.com/web/register/doctor'
         form_fields = {
             'key': '',
             'deptId': str(dept_id),
             'pageIndex': '1',
             'pageSize': '100'
         }
         yield FormRequest(self.doctor_link,
                           headers=self.headers,
                           callback=self.parse_doctor_info,
                           formdata=form_fields,
                           meta={'dept_id': dept_id},
                           dont_filter=True)
Exemplo n.º 21
0
 def parse(self, response):
     """Yield hospital items and a department-list request per hospital.

     ``response`` is parsed as a JSON list of hospital records.  For each
     selected record a ``HospitalInfoItem`` is yielded; when the record
     has a ``hospitalid`` a form request to ``self.dept_link`` follows,
     handled by ``parse_dept_info``.
     """
     self.logger.info('>>>>>>正在抓取医院相关信息……')
     hospital_info = json.loads(response.text)
     # NOTE(review): the [3:4] slice restricts the crawl to the fourth
     # record only; kept as-is -- confirm whether this is intentional.
     for each_hospital in hospital_info[3:4]:
         # The API flags medicare support with the value 1.
         has_card = str(each_hospital.get('Ismedicalcard', '')) == '1'
         is_medicare = '是' if has_card else '否'

         loader = CommonLoader(item=HospitalInfoItem(), response=response)
         loader.add_value('hospital_name', each_hospital.get('hospitalname', ''))
         loader.add_value('hospital_level', each_hospital.get('levelName', ''))
         loader.add_value('hospital_addr', each_hospital.get('address', ''))
         loader.add_value('hospital_pro', '四川')
         loader.add_value('hospital_city', each_hospital.get('areaName', ''))
         loader.add_value('is_medicare', is_medicare)
         loader.add_value('dataSource_from', self.source_from)
         loader.add_value('update_time', now_day())
         yield loader.load_item()

         hospital_id = each_hospital.get('hospitalid')
         if not hospital_id:
             continue

         # A request copies its headers when it is constructed, so the
         # Referer has to be in place BEFORE the request is built; setting
         # it afterwards (as the code used to) never reached this request.
         self.headers['Referer'] = 'http://www.scgh114.com/web/register/gh'
         yield FormRequest(self.dept_link,
                           headers=self.headers,
                           callback=self.parse_dept_info,
                           formdata={'hospitalId': str(hospital_id)},
                           dont_filter=True)
Exemplo n.º 22
0
 def parse(self, response):
     """Build and yield the hospital-level item for this spider's hospital.

     Every field except ``hospital_intro`` is a fixed value; the
     introduction is taken from the ``describe htmledit`` div of
     ``response``.
     """
     start_msg = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
     self.logger.info(start_msg)
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed facts added before the scraped introduction.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '上午8:00—12:00;下午2:00—5:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '妇幼保健院'),
         ('hospital_addr', '四川省成都市双流区东升街道涧槽中街396号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', ''),
         ('hospital_phone', '母婴咨询热线_028-85884888(工作日);'
                            '总值班电话_028-85808438;'
                            '预约挂号电话_028-85801029(7:30-19:30)'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     # The introduction is the only field scraped from the page itself.
     intro_cleaner = MapCompose(remove_tags, custom_remove_tags)
     info_loader.add_xpath('hospital_intro',
                           '//div[@class="describe htmledit"]',
                           intro_cleaner)

     # medicare_type is not populated here (the call is disabled).
     info_loader.add_value('is_medicare', '是')
     info_loader.add_value('registered_channel',
                           '电话预约;自助挂号机;诊室预约;'
                           '医院微信公众号;健康双流;现场')
     info_loader.add_value('dataSource_from', '医院官网')
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()
Exemplo n.º 23
0
 def parse(self, response):
     """Yield one ``ComprehensiveRankingItem`` per entry of the ``rows`` list.

     ``response`` is parsed as JSON; a missing ``rows`` key yields nothing.
     """
     self.logger.info('>>>>>>正在抓取综合排行信息……>>>>>>')
     payload = json.loads(response.text)

     # (item field, key in each JSON row), in the order they are added.
     column_map = (
         ('hospital_pro', 'PROVINCE'),
         ('ranking', 'RANK'),
         ('hospital_name', 'HOSPNAME'),
         ('tech_investment', 'INPUT'),
         ('tech_output', 'OUTPUT'),
         ('academic_influence', 'INFLUENCE'),
         ('total_score', 'SUM'),
     )
     for row in payload.get('rows', []):
         row_loader = CommonLoader2(item=ComprehensiveRankingItem(),
                                    response=response)
         for field_name, row_key in column_map:
             row_loader.add_value(field_name, row.get(row_key))
         row_loader.add_value('create_time', now_day())
         row_loader.add_value('update_time', now_day())
         yield row_loader.load_item()
Exemplo n.º 24
0
 def parse_dep_detail(self, response):
     """Finish a department item with the introduction from its detail page.

     The loader started by an earlier callback arrives in
     ``response.meta['loader']``; the ``<p>`` elements of the
     ``baseRight-intro`` div are added to it as ``dept_info``.
     """
     progress_msg = '正在抓取{}:科室详细信息'.format(self.hospital_name)
     self.logger.info(progress_msg)

     dep_loader = response.meta['loader']
     intro_paragraphs = response.xpath(
         '//div[@class="baseRight-intro"]/p').extract()
     dep_loader.add_value('dept_info', intro_paragraphs)
     dep_loader.add_value('update_time', now_day())
     yield dep_loader.load_item()
Exemplo n.º 25
0
 def parse_doctor_detail(self, response):
     """Finish a doctor item with details from a JSON detail response.

     The loader started by an earlier callback arrives in
     ``response.meta['loader']``.  ``extDetails`` and ``extExperts`` are
     read from the ``data`` object of the JSON body, defaulting to ``''``.
     """
     self.logger.info('>>>>>>正在抓取医生详细信息……')
     doctor_loader = response.meta['loader']
     detail_data = json.loads(response.text)['data']

     intro_text = detail_data.get('extDetails', '')
     doctor_loader.add_value('doctor_intro', intro_text)
     expertise_text = detail_data.get('extExperts', '')
     doctor_loader.add_value('doctor_goodAt', expertise_text)
     doctor_loader.add_value('update_time', now_day())
     yield doctor_loader.load_item()
Exemplo n.º 26
0
 def parse_hospital_dep_detail(self, response):
     """Yield a department item built from a department detail page.

     The department name comes from ``response.meta['dept_name']``; the
     description is whatever matches the ``fleft wd740`` div.
     """
     name_from_meta = response.meta['dept_name']
     dep_loader = CommonLoader2(item=HospitalDepItem(), response=response)

     dep_loader.add_value('dept_name', name_from_meta)
     dep_loader.add_value('hospital_name', self.hospital_name)
     dep_loader.add_xpath('dept_info', '//div[@class="fleft wd740"]')
     dep_loader.add_value('update_time', now_day())
     yield dep_loader.load_item()
Exemplo n.º 27
0
    def parse_hospital_info(self, response):
        """Yield the hospital item, then one request per department link.

        Name, address, phone and introduction are scraped from
        ``response``; the hospital level comes from ``response.meta``.  The
        county is derived from the address text via ``get_county2`` when an
        address is present.  Each department link is followed with
        ``parse_hospital_dep_detail`` as callback.  Any exception is logged
        and swallowed.
        """
        self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')

        addr_xpath = '//div[@class="yy_js clearfix"]/div/dl/dd[1]/text()'
        phone_xpath = '//div[@class="yy_js clearfix"]/div/dl/dd[2]/text()'
        try:
            # District / county, derived from the address when there is one.
            address_text = response.xpath(addr_xpath).extract_first('')
            county = (get_county2('中国|江苏省|江苏|南京市|南京', address_text)
                      if address_text else None)

            # Hospital item.
            info_loader = CommonLoader2(item=HospitalInfoItem(),
                                        response=response)
            info_loader.add_xpath('hospital_name',
                                  '//div[@class="yy_til"]/h2/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_level',
                                  response.meta.get('hospital_level'),
                                  MapCompose(custom_remove_tags, clean_info))
            info_loader.add_xpath('hospital_addr', addr_xpath,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_pro', '江苏省')
            info_loader.add_value('hospital_city', '南京市')
            info_loader.add_value('hospital_county', county)
            info_loader.add_xpath('hospital_phone', phone_xpath,
                                  MapCompose(custom_remove_tags))
            info_loader.add_xpath('hospital_intro', '//dd[@id="wrap"]',
                                  MapCompose(remove_tags, custom_remove_tags))
            info_loader.add_value('registered_channel', self.data_source_from)
            info_loader.add_value('dataSource_from', self.data_source_from)
            info_loader.add_value('hospital_url', response.url)
            info_loader.add_value('update_time', now_day())
            yield info_loader.load_item()

            # Department pages.
            raw_links = response.xpath(
                '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
            for raw_link in raw_links:
                # Drop the ";jsessionid=...?" segment, keeping the "?".
                cleaned_link = re.sub(r';jsessionid=(.*?)\?', '?', raw_link)
                target_url = urljoin(self.host, cleaned_link)
                self.headers['Referer'] = response.url
                yield Request(target_url,
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail)
        except Exception as err:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(err)))
Exemplo n.º 28
0
 def parse_doctor_detail(self, response):
     """Finish a doctor item with the text of the doctor's detail page.

     The loader started by an earlier callback arrives in
     ``response.meta['loader']``.  The first text node of the ``article``
     div is added to both ``doctor_intro`` and ``doctor_goodAt``.
     """
     progress_msg = '正在抓取{}:医生详细信息'.format(self.hospital_name)
     self.logger.info(progress_msg)

     doctor_loader = response.meta['loader']
     article_text = response.xpath(
         '//div[@class="article"]/text()').extract_first('')
     # The same text feeds both fields.
     for field_name in ('doctor_intro', 'doctor_goodAt'):
         doctor_loader.add_value(field_name, article_text)
     doctor_loader.add_value('update_time', now_day())
     yield doctor_loader.load_item()
Exemplo n.º 29
0
 def parse_doctor_detail(self, response):
     """Yield a doctor item and one registration item per schedule entry.

     Name, department, level and introduction are scraped from the
     ``fleft wd740`` block of ``response``.  If the fourth paragraph of
     that block has text, it is split by ``get_reg_info`` and every
     resulting entry is yielded as a ``DoctorRegInfoItem``.
     """
     dept_xpath = '//div[@class="fleft wd740"]/div[1]/div[2]/p[1]/text()'
     name_xpath = '//div[@class="fleft wd740"]/div[1]/div[2]/p[2]/text()'
     level_xpath = '//div[@class="fleft wd740"]/div[1]/div[2]/p[3]/text()'
     reg_xpath = '//div[@class="fleft wd740"]/div[1]/div[2]/p[4]/text()'

     # Doctor item.
     doctor_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
     doctor_loader.add_xpath('doctor_name', name_xpath,
                             MapCompose(custom_remove_tags, match_special))
     doctor_loader.add_xpath('dept_name', dept_xpath,
                             MapCompose(custom_remove_tags, match_special))
     doctor_loader.add_value('hospital_name', self.hospital_name)
     doctor_loader.add_xpath('doctor_level', level_xpath,
                             MapCompose(custom_remove_tags, match_special))
     doctor_loader.add_xpath(
         'doctor_intro',
         '//div[@class="fleft wd740"]/div[1]/div[2]/div/p[1]',
         MapCompose(remove_tags, custom_remove_tags))
     doctor_loader.add_value('update_time', now_day())
     yield doctor_loader.load_item()

     # Registration items; nothing to do without schedule text.
     schedule_text = response.xpath(reg_xpath).extract_first('')
     if not schedule_text:
         return
     for schedule_entry in get_reg_info(schedule_text):
         reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                    response=response)
         reg_loader.add_xpath('doctor_name', name_xpath,
                              MapCompose(custom_remove_tags, match_special))
         reg_loader.add_xpath('dept_name', dept_xpath,
                              MapCompose(custom_remove_tags, match_special))
         reg_loader.add_value('hospital_name', self.hospital_name)
         reg_loader.add_value('reg_info', schedule_entry)
         reg_loader.add_value('update_time', now_day())
         yield reg_loader.load_item()
Exemplo n.º 30
0
    def parse_hospital_dep(self, response):
        """Yield department items and follow each department to its doctors.

        Every ``deptList-block`` div contributes a department type (its
        ``<b>`` text) and a list of anchors.  For each anchor a
        ``HospitalDepItem`` is yielded, then the doctor-list URL is
        resolved:

        * from the ``onclick`` attribute, when it matches
          ``goNext(<hospital id>,'<dept id>');`` -- the ids are formatted
          into ``self.doctor_list_url``;
        * otherwise from the ``href`` attribute, joined onto ``self.host``.

        Anchors that provide neither are not followed.  Any exception is
        logged and swallowed.
        """
        self.logger.info('>>>>>>正在抓取科室信息>>>>>>')
        try:
            hospital_name = response.meta.get('hospital_name')
            all_dept_links = response.xpath(
                '//div[@class="deptList-block mb20 clearfix"]')
            for each_dept_link in all_dept_links:
                dept_type = each_dept_link.xpath('b/text()').extract_first('')
                for each_dept_info in each_dept_link.xpath('ul/li/a'):
                    # Department item.
                    dept_name = each_dept_info.xpath('@title').extract_first('')
                    dept_link = each_dept_info.xpath('@onclick').extract_first('')
                    dept_link2 = each_dept_info.xpath('@href').extract_first('')
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name)
                    dept_loader.add_value('dept_type', dept_type)
                    dept_loader.add_value('hospital_name', hospital_name)
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    yield dept_loader.load_item()

                    # Doctor-list URL for this department.
                    if dept_link:
                        res = re.search(r'goNext\((.*?),\'(.*)\'\);',
                                        dept_link)
                        doctor_list_url = (
                            self.doctor_list_url.format(res.group(1),
                                                        res.group(2))
                            if res else None)
                    elif dept_link2:
                        doctor_list_url = urljoin(self.host, dept_link2)
                    else:
                        # urljoin(base, '') returns the base unchanged, so
                        # an anchor with no href used to trigger a request
                        # for the site root; skip it instead.
                        doctor_list_url = None

                    if doctor_list_url:
                        self.headers['Referer'] = response.url
                        yield Request(doctor_list_url,
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      meta={
                                          'dept_name': dept_name,
                                          'hospital_name': hospital_name
                                      })

        except Exception as e:
            self.logger.error('在抓取科室信息过程中出错了,原因是:{}'.format(repr(e)))