def parse_hospital_dep(self, response):
    hospital_name = response.meta.get('hospital_name')
    dept_type = response.meta.get('dept_type')
    self.logger.info('>>>>>>Crawling department info: hospital [{}], department type [{}]>>>>>>'.format(
        hospital_name, dept_type))
    try:
        # The department-list endpoint returns JSON; see the sample payload after this method.
        dept_info = json.loads(response.text)
        sub_dept_list = dept_info.get('data').get('subDepList')
        for each_dept_info in sub_dept_list:
            dept_name = each_dept_info.get('name')
            dept_id = each_dept_info.get('id')
            dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
            dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
            dept_loader.add_value('dept_type', dept_type, MapCompose(custom_remove_tags))
            dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
            dept_loader.add_value('dataSource_from', self.data_source_from)
            dept_loader.add_value('crawled_url', response.url)
            dept_loader.add_value('update_time', now_day())
            dept_loader.add_value('dept_id', dept_id)
            dept_loader.add_value('dept_url', response.url)
            dept_loader.add_value('gmt_created', now_time())
            dept_loader.add_value('gmt_modified', now_time())
            dept_item = dept_loader.load_item()
            yield dept_item
    except Exception as e:
        self.logger.error('Error while crawling hospital department info, reason: {}'.format(repr(e)))
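# A minimal illustration (hypothetical payload, not real site data) of the JSON
# shape parse_hospital_dep above expects: a top-level "data" object whose
# "subDepList" array holds sub-departments with "id" and "name" fields.
EXAMPLE_SUB_DEPT_RESPONSE = {
    'data': {
        'subDepList': [
            {'id': '1001', 'name': '心血管内科'},
            {'id': '1002', 'name': '呼吸内科'},
        ]
    }
}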
def parse_doctor_info(self, response):
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>Crawling doctor details for hospital [{}]>>>>>>'.format(hospital_name))
    try:
        # Doctor info
        hospital_id = response.meta.get('hospital_id')
        doctor_info = json.loads(response.text)
        doctor_info_pages = doctor_info.get('data').get('pages')
        doctor_info_list = doctor_info.get('data').get('doctorPageList')
        current_page_num = re.search(r'&curr=(\d+)$', response.url)
        for each_doctor_info in doctor_info_list:
            portrait = each_doctor_info.get('portrait')
            doctor_photo_url = urljoin(self.doctor_image_host, portrait) if portrait else ''
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', each_doctor_info.get('name'), MapCompose(custom_remove_tags))
            loader.add_value('dept_name', each_doctor_info.get('departmentName'))
            loader.add_value('hospital_name', each_doctor_info.get('hospitalName'))
            loader.add_value('doctor_level', each_doctor_info.get('doctorTitleName'))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_value('doctor_id', each_doctor_info.get('id'))
            loader.add_value('hospital_id', hospital_id)
            loader.add_value('doctor_photo_url', doctor_photo_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            doctor_item = loader.load_item()
            yield doctor_item

        # Doctor list pagination (see the standalone sketch after this method)
        if doctor_info_pages and current_page_num:
            current_page_num = int(current_page_num.group(1))
            total_pages = int(doctor_info_pages)
            next_page = current_page_num + 1
            if next_page <= total_pages:
                next_doctor_url = self.doctor_url.format(str(hospital_id), str(next_page))
                yield Request(next_doctor_url,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id
                              })
    except Exception as e:
        self.logger.error('Error while crawling doctor details, reason: {}'.format(repr(e)))
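# A minimal, standalone sketch of the pagination rule used in parse_doctor_info
# above, assuming doctor-list URLs end with an "&curr=<n>" page parameter as in
# self.doctor_url: read the current page from the URL and advance it only while
# the next page does not exceed the total reported by the API.
import re  # already imported by the spider; repeated so the sketch stands alone


def next_page_number(url, total_pages):
    """Return the next page number, or None when there is no further page."""
    match = re.search(r'&curr=(\d+)$', url)
    if not match:
        return None
    next_page = int(match.group(1)) + 1
    return next_page if next_page <= int(total_pages) else None

# Example: next_page_number('.../doctors?hosId=12&curr=3', 5) -> 4
# Example: next_page_number('.../doctors?hosId=12&curr=5', 5) -> None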
def parse_hospital_info(self, response):
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>Crawling hospital details>>>>>>')
    try:
        hospital_id = response.meta.get('hospital_id')
        hospital_img_url = response.xpath('//div[@class="divLeft_Img"]/img/@src').extract_first('')
        hospital_img_url = urljoin(self.host, hospital_img_url) if hospital_img_url else ''
        hospital_address = response.xpath('//li[contains(text(),"地址")]/text()').extract_first('')
        hospital_county = get_county2('中国|福建省|福建|厦门市|厦门', match_special2(hospital_address))
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name', '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_pro', '福建省')
        loader.add_value('hospital_city', '厦门市')
        loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_phone', '//li[contains(text(),"电话")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_xpath('hospital_official_website', '//li[contains(text(),"官网")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_route', '//li[contains(text(),"公交线路")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_img_url', hospital_img_url)
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        loader.add_value('hospital_id', hospital_id)
        hospital_item = loader.load_item()
        yield hospital_item

        # Department info
        all_dept_info = response.xpath('//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
        for each_dept_info in all_dept_info:
            dept_type = each_dept_info.xpath('div[1]/span/text()').extract_first('')
            dept_names = each_dept_info.xpath('div[2]/div[1]')
            for each_dept_name in dept_names:
                dept_name = each_dept_name.xpath('a/text()').extract_first('')
                dept_link = each_dept_name.xpath('a/@href').extract_first('')
                doctor_num_of_dept = each_dept_name.xpath('span/text()').extract_first('')
                # Number of doctors in the department (see the parsing sketch after this method)
                if doctor_num_of_dept:
                    dept_person_num = re.search(r'(\d+)', doctor_num_of_dept)
                    dept_person_num = int(dept_person_num.group(1)) if dept_person_num else None
                else:
                    dept_person_num = None
                # Department details
                dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
                dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type, MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from', self.data_source_from)
                dept_info = ''.join(
                    response.xpath('//p[contains(text(),"科室简介")]/ancestor::tr[1]').extract())
                dept_loader.add_value('dept_info', dept_info,
                                      MapCompose(remove_tags, custom_remove_tags, match_special2))
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id', dept_link, MapCompose(match_special2))
                dept_loader.add_value('hospital_id', hospital_id)
                dept_loader.add_value('dept_person_num', dept_person_num)
                dept_loader.add_value('dept_url', urljoin(self.host, dept_link))
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                dept_item = dept_loader.load_item()
                yield dept_item

                # Doctor info: only follow departments that actually list doctors
                if dept_link and dept_person_num:
                    self.headers['Referer'] = response.url
                    yield Request(urljoin(self.host, dept_link),
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  dont_filter=True,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'dept_name': dept_name,
                                  })
    except Exception as e:
        self.logger.error('Error while crawling hospital details, reason: {}'.format(repr(e)))
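# A minimal illustration (hypothetical input format) of the department
# doctor-count parsing in parse_hospital_info above: the <span> text is assumed
# to look something like "(12人)", and the first run of digits is taken as the
# count; text without digits, or empty text, yields None.
import re  # already imported by the spider; repeated so the sketch stands alone


def parse_dept_person_num(doctor_num_text):
    """Return the doctor count as an int, or None when it cannot be parsed."""
    if not doctor_num_text:
        return None
    match = re.search(r'(\d+)', doctor_num_text)
    return int(match.group(1)) if match else None

# Example: parse_dept_person_num('(12人)') -> 12
# Example: parse_dept_person_num('') -> None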
def parse_hospital_info(self, response):
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>Crawling details for hospital [{}]>>>>>>'.format(hospital_name))
    try:
        hospital_id = response.meta.get('hospital_id')
        data_type = response.meta.get('data_type')
        hospital_pro = response.meta.get('province_name')
        if data_type == '1':
            hospital_address = response.xpath('//div[@class="search-result-hospital-text"]/'
                                              'p[4]/text()').extract_first('')
            hospital_phone = response.xpath('//div[@class="search-result-hospital-text"]/'
                                            'p[3]/text()').extract_first('')
            check_phone = re.search(r'(\d{6,})', hospital_phone)
            # If the "phone" line holds no digits and no address was found,
            # the page put the address in that slot instead.
            if not check_phone and not hospital_address:
                hospital_address = hospital_phone
                hospital_phone = ''
            # hospital_city = get_city('', hospital_address)
            # hospital_county = get_county2('', match_special2(hospital_address))
            df = transform([hospital_address])
            # hospital_pro = df.head()['省'][0]
            hospital_city = df.head()['市'][0]
            hospital_county = df.head()['区'][0]
            # Municipalities double as their own city (see the split_region sketch after this method)
            if hospital_pro in MUNICIPALITY2:
                hospital_city = '{0}{1}'.format(hospital_pro, '市')
                hospital_pro = ''
            else:
                hospital_pro = '{0}{1}'.format(hospital_pro, '省')
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name', '//span[@class="search-result-hospital-name"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_level', '//div[@class="search-result-hospital-text"]/p[2]/text()',
                             MapCompose(custom_remove_tags, clean_info2))
            loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
            loader.add_value('hospital_phone', hospital_phone, MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_route', '//div[@class="search-result-hospital-text"]/p[5]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_img_url', '//div[@class="search-result-hospital-img"]/img/@src')
            loader.add_value('hospital_tags', '1')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Department info: fetch second-level departments from each first-level department
            all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
            for each_dept in all_dept:
                each_dept_id = each_dept.xpath('@id').extract_first('')
                each_dept_type = each_dept.xpath('div/span/text()').extract_first('')
                self.headers['Referer'] = response.url
                dept_link = self.dept_url.format(hospital_id, each_dept_id)
                yield Request(dept_link,
                              headers=self.headers,
                              callback=self.parse_hospital_dep,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id,
                                  'dept_type': each_dept_type
                              })
            # Doctor info
            self.headers['Referer'] = response.url
            doctor_info_link = self.doctor_url.format(hospital_id, '1')
            yield Request(doctor_info_link,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={
                              'hospital_name': hospital_name,
                              'hospital_id': hospital_id
                          })
        elif data_type == '2':
            hospital_address = response.xpath('//p[@class="hospital-private-address-line fc-6"]'
                                              '[contains(text(),"地址")]/text()').extract_first('')
            # hospital_city = get_city('', hospital_address)
            # hospital_county = get_county2('', match_special2(hospital_address))
            df = transform([hospital_address])
            # hospital_pro = df.head()['省'][0]
            hospital_city = df.head()['市'][0]
            hospital_county = df.head()['区'][0]
            if hospital_pro in MUNICIPALITY2:
                hospital_city = '{0}{1}'.format(hospital_pro, '市')
                hospital_pro = ''
            else:
                hospital_pro = '{0}{1}'.format(hospital_pro, '省')
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name', '//p[@class="hospital-private-content-tit"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone', '//div[@class="search-result-hospital-text"]/p[3]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_route', '//li[@id="address"]/p[3]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            # loader.add_xpath('hospital_img_url', 'div[@class="search-result-hospital-img"]/img/@src')
            loader.add_value('hospital_tags', '2')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Department info: second-level departments are listed directly on the page
            dept_name = ''  # guard against an empty department list below
            all_dept = response.xpath('//ul[@id="parent-list"]/li[position()>1]')
            for each_dept in all_dept:
                dept_id = each_dept.xpath('div/@id').extract_first('')
                dept_name = each_dept.xpath('div/span/text()').extract_first('')
                dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
                dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from', self.data_source_from)
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id', dept_id.replace('subDepLi-', ''))
                dept_loader.add_value('dept_url', response.url)
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                dept_item = dept_loader.load_item()
                yield dept_item
            # Doctor info
            self.headers['Referer'] = response.url
            doctor_info_link = self.doctor_url.format(hospital_id, '1')
            yield Request(doctor_info_link,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={
                              'hospital_name': hospital_name,
                              'hospital_id': hospital_id,
                              'dept_name': dept_name
                          })
    except Exception as e:
        self.logger.error('Error while crawling hospital details, reason: {}'.format(repr(e)))
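# A minimal sketch of the province/city/county derivation used in both branches
# of parse_hospital_info above. `transform` and MUNICIPALITY2 are assumed to
# come from the spider's imports: `transform` appears to map raw address
# strings to a DataFrame with '省'/'市'/'区' columns, and MUNICIPALITY2 to list
# municipality names whose "city" is the province name itself.
def split_region(hospital_address, province_name):
    """Return (hospital_pro, hospital_city, hospital_county) the way the spider derives them."""
    df = transform([hospital_address])       # assumed address-parsing helper from the spider's imports
    hospital_city = df.head()['市'][0]
    hospital_county = df.head()['区'][0]
    if province_name in MUNICIPALITY2:        # municipalities: keep only the city form
        return '', '{0}{1}'.format(province_name, '市'), hospital_county
    return '{0}{1}'.format(province_name, '省'), hospital_city, hospital_county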
def parse_doctor_info_detail(self, response):
    hospital_name = response.meta.get('hospital_name')
    dept_name = response.meta.get('dept_name')
    doctor_name = response.meta.get('doctor_name')
    self.logger.info('>>>>>>Crawling doctor details: hospital [{}], doctor [{}]>>>>>>'.format(
        hospital_name, doctor_name))
    try:
        # Doctor info
        doctor_photo_url = response.xpath('//div[@class="doctor_Img"]/img/@src').extract_first('')
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
        loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
        loader.add_xpath('sex', '//span[@class="doctor_grade"]/text()', MapCompose(custom_remove_tags))
        loader.add_xpath('doctor_level', '//span[@class="object_grade"]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('doctor_intro', '//div[@class="doctor_Text_Major"]',
                         MapCompose(remove_tags, custom_remove_tags, match_special2))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_value('doctor_id', response.url, MapCompose(match_special2))
        loader.add_xpath('dept_id', '//div[@class="position_one"]/span/a[last()]/@href',
                         MapCompose(match_special2))
        loader.add_xpath('hospital_id', '//div[@class="position_one"]/span/a[last()-1]/@href',
                         MapCompose(match_special2))
        loader.add_value('doctor_photo_url', urljoin(self.host, doctor_photo_url))
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        doctor_item = loader.load_item()
        yield doctor_item

        # Doctor scheduling info
        self.logger.info('>>>>>>Crawling doctor scheduling info for hospital [{}]>>>>>>'.format(hospital_name))
        has_doctor_scheduling = response.xpath('//span[@class="yuyue"]/a[contains(text(),"预约")]')
        if has_doctor_scheduling:
            doctor_scheduling_list = response.xpath('//div[@class="whliesubscribe"]/ul/li[1]/div/'
                                                    'span/text()').extract()
            doctor_scheduling_length = len(doctor_scheduling_list)
            all_scheduling_date = response.xpath('//div[@class="datetable"]/ul/li[position()>1]/'
                                                 'span[1]/text()').extract()
            scheduling_date_list = custom_remove_tags(
                remove_tags(','.join(all_scheduling_date))).split(',')
            for i in range(1, doctor_scheduling_length + 1):
                scheduling_info = response.xpath('//div[@class="whliesubscribe"]/ul/li[position()>1]'
                                                 '/div[{}]'.format(str(i)))
                scheduling_time = doctor_scheduling_list[i - 1]
                for index, each_s_i in enumerate(scheduling_info):
                    has_scheduling = each_s_i.xpath('span/a')
                    if has_scheduling:
                        each_scheduling_date = scheduling_date_list[index]
                        # Registration slot label: year + scheduling date + time slot
                        # (see the build_reg_info sketch after this method)
                        reg_info = '{0}-{1}{2}'.format(now_year(), each_scheduling_date, scheduling_time)
                        reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
                        reg_loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
                        reg_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                        reg_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
                        reg_loader.add_value('reg_info', reg_info)
                        reg_loader.add_value('dataSource_from', self.data_source_from)
                        reg_loader.add_value('crawled_url', response.url)
                        reg_loader.add_value('update_time', now_day())
                        reg_loader.add_value('doctor_id', response.url, MapCompose(match_special2))
                        reg_loader.add_xpath('dept_id',
                                             '//div[@class="position_one"]/span/a[last()]/@href',
                                             MapCompose(match_special2))
                        reg_loader.add_xpath('hospital_id',
                                             '//div[@class="position_one"]/span/a[last()-1]/@href',
                                             MapCompose(match_special2))
                        reg_loader.add_value('gmt_created', now_time())
                        reg_loader.add_value('gmt_modified', now_time())
                        reg_item = reg_loader.load_item()
                        yield reg_item
    except Exception as e:
        self.logger.error('Error while crawling doctor details, reason: {}'.format(repr(e)))
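# A small worked example (hypothetical values) of the reg_info string assembled
# in parse_doctor_info_detail above: now_year() is assumed to return a string
# such as '2024', each_scheduling_date a month-day label like '06-18', and
# scheduling_time a slot label such as '上午', giving '2024-06-18上午'.
def build_reg_info(year, scheduling_date, scheduling_time):
    """Mirror of the spider's '{0}-{1}{2}' formatting for one schedule slot."""
    return '{0}-{1}{2}'.format(year, scheduling_date, scheduling_time)

# Example: build_reg_info('2024', '06-18', '上午') -> '2024-06-18上午'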