def parse_doctor_info(self, response):
    """Parse doctor rows out of the two tables on the hospital page.

    Table 1 (rows after the 9th) and table 2 (rows after the 1st) hold
    dept/doctor/level cells in different column orders; yields one
    ``DoctorInfoItem`` per usable row.
    """
    self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
    tr_res1 = response.xpath('//div[@class="text"]/table[1]/tbody/tr[position()>9]')
    tr_res2 = response.xpath('//div[@class="text"]/table[2]/tbody/tr[position()>1]')
    # Department carried over from the last 3-cell row for 2-cell rows.
    dept_name = ''
    try:
        # Doctor info from table 1.
        for each_res in tr_res1:
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('hospital_name', self.hospital_name)
            # Drop schedule cells ("全天"/"上午"/"下午") and full-width blanks.
            td_cnt = each_res.xpath('td/span[not(contains(text(),"全天")) '
                                    'and not(contains(text(),"上午")) '
                                    'and not(contains(text(),"下午")) '
                                    'and not(contains(text(),"\u3000"))]/text()')
            # BUG FIX: the original tested `'名医堂' in td_cnt`, i.e. membership
            # in a SelectorList of Selector objects — a str is never "in" that
            # list, so the branch was unreachable.  Extract once and test the
            # string list instead (also avoids repeated .extract() calls).
            td_texts = td_cnt.extract()
            length_of_list = len(td_texts)
            if '名医堂' in td_texts:
                loader.add_value('dept_name', td_texts[0])
                loader.add_value('doctor_name', td_texts[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 3:
                dept_name = td_texts[0].replace(' ', '')
                loader.add_value('dept_name', dept_name)
                loader.add_value('doctor_name', td_texts[1])
                loader.add_value('doctor_level', td_texts[2])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 2:
                # Row without a dept cell: reuse the previous department.
                loader.add_value('dept_name', dept_name)
                loader.add_value('doctor_name', td_texts[0])
                loader.add_value('doctor_level', td_texts[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
        # Doctor info from table 2 (doctor/level/dept column order).
        for each_res in tr_res2:
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('hospital_name', self.hospital_name)
            td_cnt = each_res.xpath('td[position()<4]/span/text()')
            td_texts = td_cnt.extract()  # same SelectorList-membership fix
            length_of_list = len(td_texts)
            if '名医堂' in td_texts:
                loader.add_value('dept_name', td_texts[1])
                loader.add_value('doctor_name', td_texts[0])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 3:
                loader.add_value('dept_name', td_texts[2])
                loader.add_value('doctor_name', td_texts[0])
                loader.add_value('doctor_level', td_texts[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
    except Exception as e:
        self.logger.error('>>>>>>抓取过程中出错了,原因是:{}>>>>>>'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Turn one doctor's detail page into a DoctorInfoItem."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    try:
        meta = response.meta
        doctor_name = meta.get('doctor_name')
        dept_name = meta.get('dept_name')
        doctor_level = meta.get('doctor_level')
        # Prefer the expandable intro block; fall back to the "擅长领域" <dd>.
        intro_html = ''.join(response.xpath('//div[@class="intro_more"]').extract())
        primary = remove_tags(intro_html)
        fallback = response.xpath(
            '//dd[contains(text(),"擅长领域")]/text()').extract_first('')
        if primary:
            doctor_good_at = primary.replace('[关闭]', '')
        else:
            doctor_good_at = fallback
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        strip = MapCompose(custom_remove_tags)
        loader.add_value('doctor_name', doctor_name, strip)
        loader.add_value('dept_name', dept_name, strip)
        loader.add_value('hospital_name', hospital_name, strip)
        loader.add_value('doctor_level', doctor_level, strip)
        loader.add_xpath('doctor_intro', '//div[@class="hos-guide-box1"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value(
            'doctor_goodAt', doctor_good_at,
            MapCompose(custom_remove_tags, match_special, clean_info2))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Yield the doctor's DoctorInfoItem, then request the scheduling
    Ajax endpoint built from this page's query string."""
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    meta = response.meta
    doctor_name = meta['doctor_name']
    dept_name = meta['dept_name']
    doctor_level = meta['doctor_level']
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_value('doctor_name', doctor_name)
    loader.add_value('dept_name', dept_name,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_value('doctor_level', doctor_level,
                     MapCompose(custom_remove_tags, match_special))
    # Intro and specialty are scraped from the same <p> nodes on this site.
    for field_name in ('doctor_intro', 'doctor_goodAt'):
        loader.add_xpath(field_name, '//div[@id="about-right-b"]/p',
                         MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
    # Doctor scheduling info: reuse this page's query string with &id -> &kid.
    params = re.search(r'.*\?(.*?)$', response.url)
    if params:
        query = params.group(1).replace('&id', '&kid')
        reg_link = 'http://www.scpz120.com/ajax/Doctor.aspx?' + query
        reg_request = Request(reg_link,
                              headers=self.headers,
                              callback=self.parse_doctor_reg_info,
                              meta={'doctor_name': doctor_name,
                                    'dept_name': dept_name})
        # NOTE: mutated after the Request is built, so it only affects
        # later requests — kept as in the original implementation.
        self.headers['Referer'] = response.url
        yield reg_request
def parse_doctor_info(self, response):
    """Walk the JSON doctor list and fan out detail + scheduling requests."""
    doctor_info = json.loads(response.text)
    for each_doctor in doctor_info[0]['data']:
        doctor_name = each_doctor.get('doctorName', '')
        loader = CommonLoader(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name)
        loader.add_value('dept_name', each_doctor.get('deptName', ''))
        loader.add_value('hospital_name', each_doctor.get('hospitalName', ''))
        loader.add_value('doctor_level', each_doctor.get('degree', ''))
        loader.add_value('doctor_goodAt', each_doctor.get('extexperts', ''))
        doctor_id = each_doctor.get('doctorId', '')
        if not doctor_id:
            continue
        # Scheduling and detail pages both require the doctor id.
        self.headers['Referer'] = 'http://www.scgh114.com/web/hospital/doctorinfoP'
        form = {'doctorId': str(doctor_id)}
        # Doctor detail info (loader travels in meta for final assembly).
        yield FormRequest(self.doctor_detail_link,
                          headers=self.headers,
                          callback=self.parse_doctor_detail,
                          formdata=form,
                          meta={'loader': loader},
                          dont_filter=True)
        # Doctor scheduling info.
        yield FormRequest(self.doctor_reg_info_lik,
                          headers=self.headers,
                          callback=self.parse_doctor_reg_info,
                          formdata=form,
                          meta={'doctor_name': doctor_name},
                          dont_filter=True)
def parse_doctor_info_detail(self, response):
    """Build a DoctorInfoItem from the doctor detail page."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    meta = response.meta
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_value('doctor_name', meta.get('doctor_name'),
                     MapCompose(custom_remove_tags))
    loader.add_value('dept_name', meta.get('dept_name'))
    loader.add_value('hospital_name', hospital_name,
                     MapCompose(custom_remove_tags))
    loader.add_value('doctor_level', meta.get('doctor_level'))
    # Intro and specialty live in fixed positions inside "info-main".
    clean = MapCompose(remove_tags, custom_remove_tags, match_special)
    loader.add_xpath('doctor_intro',
                     '//div[@class="info-main"]/div[3]/span', clean)
    loader.add_xpath('doctor_goodAt',
                     '//div[@class="info-main"]/div[4]/span', clean)
    loader.add_value('dataSource_from', self.data_source_from)
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_doctor_info_detail(self, response):
    """Parse a doctor detail page into a DoctorInfoItem.

    Doctor identity fields arrive via ``response.meta``; the registration
    fee is parsed out of the booking link's ``title`` attribute.
    """
    self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
    try:
        doctor_name = response.meta.get('doctor_name')
        dept_name = response.meta.get('dept_name')
        doctor_level = response.meta.get('doctor_level')
        hospital_name = response.meta.get('hospital_name')
        # hospital_name2 = response.xpath('//div[@class="yy_til"]/h2/text()').extract_first('')
        # hospital_name = hospital_name2 if hospital_name2 else hospital_name1
        # Fee is embedded in the booking tooltip, e.g. "...挂号费:N元".
        diagnosis_amt = response.xpath(
            '//td/span[@class="doc_yuyue_time"]/a/@title').extract()
        if diagnosis_amt:
            # S (re.DOTALL) lets '.' span newlines inside the title text.
            res = re.search(r'.*挂号费:(.*?)$', diagnosis_amt[0], S)
            if res:
                diagnosis_amt = res.group(1)
            else:
                diagnosis_amt = None
        else:
            diagnosis_amt = None
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
        loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
        loader.add_value('doctor_level', doctor_level,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath(
            'doctor_intro', '//div[@class="zrys"]/dl/dd',
            MapCompose(remove_tags, custom_remove_tags, clean_info2))
        loader.add_value('diagnosis_amt', diagnosis_amt)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        doctor_item = loader.load_item()
        yield doctor_item
        # Doctor scheduling info (disabled — kept for reference).
        # has_reg_info = response.xpath('//td/span[@class="doc_yuyue_time"]').extract()
        # if has_reg_info:
        #     for each_reg_info in has_reg_info:
        #         reg_info_date = re.search(r'.*出诊时间:(.*?)\n', each_reg_info, S)
        #         reg_info_date = reg_info_date.group(1) if reg_info_date else None
        #         reg_info = '{0}-{1}'.format(now_year(), reg_info_date).replace('月', '-').replace('日', '')
        #         reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
        #         reg_loader.add_value('doctor_name', doctor_name, MapCompose(custom_remove_tags))
        #         reg_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
        #         reg_loader.add_xpath('hospital_name',
        #                              '//div[@class="yy_til"]/h2/text()',
        #                              MapCompose(custom_remove_tags))
        #         reg_loader.add_value('reg_info', reg_info, MapCompose(custom_remove_tags))
        #         reg_loader.add_value('update_time', now_day())
        #         reg_item = reg_loader.load_item()
        #         yield reg_item
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Parse the doctor detail page, then walk the weekly schedule table.

    The schedule table marks availability with an <img> cell; column
    position maps to the weekday in ``reg_date``.
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name',
                     '//div[@class="viewexpert_demo"]/p[1]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('dept_name',
                     '//div[@class="viewexpert_demo"]/p[3]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath(
        'doctor_level', '//div[@class="viewexpert_demo"]/p[2]/text()',
        MapCompose(custom_remove_tags, match_special, match_special2))
    loader.add_xpath('doctor_intro', '//div[@class="viewexpert_detail"]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath('doctor_goodAt',
                     '//div[@class="viewexpert_demo"]/p[4]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('update_time', now_day())
    doctor_item = loader.load_item()
    yield doctor_item
    # Doctor scheduling info: rows after the header are time slots,
    # columns 2..8 are Monday..Sunday.
    reg_tr_list = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]')
    is_has_reg = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]/td/img')
    reg_date = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
    if is_has_reg:
        for each_td in reg_tr_list:
            i = 0  # weekday index into reg_date, reset per row
            reg_time = each_td.xpath('td[1]/text()').extract_first('')
            all_reg_info = each_td.xpath('td[position()>1]')
            for each_reg_info in all_reg_info:
                reg_info_date = reg_date[i]
                i += 1
                # An <img> inside the cell marks an open slot.
                has_reg = each_reg_info.xpath('img')
                if has_reg:
                    reg_info = '{0}{1}'.format(reg_info_date, reg_time)
                    reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                               response=response)
                    reg_loader.add_xpath(
                        'doctor_name',
                        '//div[@class="viewexpert_demo"]/p[1]/text()',
                        MapCompose(custom_remove_tags))
                    reg_loader.add_xpath(
                        'dept_name',
                        '//div[@class="viewexpert_demo"]/p[3]/text()',
                        MapCompose(custom_remove_tags, match_special))
                    reg_loader.add_value('hospital_name', self.hospital_name)
                    reg_loader.add_value('reg_info', reg_info)
                    reg_loader.add_value('update_time', now_day())
                    reg_item = reg_loader.load_item()
                    yield reg_item
def parse_doctor_website(self, response):
    """Scrape the doctor's personal homepage into a DoctorInfoItem."""
    self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')
    # Doctor-related fields, all taken straight from the page markup.
    loader = YiHuLoader(item=DoctorInfoItem(), response=response)
    field_xpaths = [
        ('doctor_name', '//span[@class="c-f22 c-333"]/text()'),
        ('dept_name', '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()'),
        ('hospital_name', '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()'),
        ('doctor_level', '//div[@class="doctor-info"]/dl/dd[1]/text()'),
        ('doctor_intro',
         '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()'),
        ('doctor_goodAt', '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()'),
    ]
    for field_name, xpath_expr in field_xpaths:
        loader.add_xpath(field_name, xpath_expr)
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_doctor_info(self, response):
    """Parse a JSON page of doctors for one hospital and paginate onward."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医院医生详细信息>>>>>>'.format(hospital_name))
    try:
        hospital_id = response.meta.get('hospital_id')
        payload = json.loads(response.text)
        data = payload.get('data')
        doctor_info_pages = data.get('pages')
        doctor_info_list = data.get('doctorPageList')
        # Current page number lives in the "&curr=" query parameter.
        current_page_num = re.search(r'&curr=(\d+)$', response.url)
        for each_doctor_info in doctor_info_list:
            portrait = each_doctor_info.get('portrait')
            if portrait:
                doctor_photo_url = urljoin(self.doctor_image_host, portrait)
            else:
                doctor_photo_url = ''
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', each_doctor_info.get('name'),
                             MapCompose(custom_remove_tags))
            loader.add_value('dept_name', each_doctor_info.get('departmentName'))
            loader.add_value('hospital_name', each_doctor_info.get('hospitalName'))
            loader.add_value('doctor_level', each_doctor_info.get('doctorTitleName'))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_value('doctor_id', each_doctor_info.get('id'))
            loader.add_value('hospital_id', hospital_id)
            loader.add_value('doctor_photo_url', doctor_photo_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            yield loader.load_item()
        # Doctor pagination: request the next page while pages remain.
        if doctor_info_pages and current_page_num:
            next_page = int(current_page_num.group(1)) + 1
            if next_page <= int(doctor_info_pages):
                next_doctor_url = self.doctor_url.format(str(hospital_id),
                                                         str(next_page))
                yield Request(next_doctor_url,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={'hospital_name': hospital_name,
                                    'hospital_id': hospital_id})
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Parse a doctor detail page; intro and specialty come from the same
    node, split apart by the filter_info/filter_info2 processors."""
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(self.hospital_name))
    dept_name = response.meta['dept_name']
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name', '//div[@id="info_title"]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('doctor_intro', '//div[@id="info_txt"]',
                     MapCompose(remove_tags, custom_remove_tags, filter_info2))
    loader.add_xpath('doctor_goodAt', '//div[@id="info_txt"]',
                     MapCompose(remove_tags, custom_remove_tags, filter_info))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_doctor_info_detail(self, response):
    """Assemble a DoctorInfoItem from the doctor page markup."""
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    doctor_name = response.meta['doctor_name']
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_value('doctor_name', doctor_name)
    loader.add_value('hospital_name', self.hospital_name)
    strip = MapCompose(custom_remove_tags)
    # The page header <h1> actually carries the department name here.
    loader.add_xpath('dept_name', '//div[@class="doctor"]/h1/text()', strip)
    loader.add_xpath('doctor_level', '//p[@class="profession"]/span/text()',
                     strip)
    loader.add_xpath('doctor_intro', '//div[@class="abstract"]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath('doctor_goodAt', '//div[@class="specialty"]/p/text()',
                     strip)
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_doctor_info_detail(self, response):
    """Parse the doctor detail page; the intro is sliced out of the raw
    cell text between the "个人简介:" marker and the following section."""
    self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
    try:
        diagnosis_fee = response.meta.get('diagnosis_fee')
        raw_cells = ''.join(
            response.xpath('//td[@class="bk titletxt11"]').extract())
        doctor_info = custom_remove_tags(remove_tags(raw_cells))
        # Try the section bounded by "出诊时间:" first, else "荣誉集锦:".
        doctor_intro1 = get_hospital_info(doctor_info, '个人简介:', '荣誉集锦:')
        doctor_intro2 = get_hospital_info(doctor_info, '个人简介:', '出诊时间:')
        doctor_intro = doctor_intro2 if doctor_intro2 else doctor_intro1
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        strip = MapCompose(custom_remove_tags)
        loader.add_xpath('doctor_name',
                         '//table[@id="m_jkzs"]/tr/td[1]/a[last()]/text()',
                         strip)
        loader.add_xpath('dept_name',
                         '//table[@id="m_jkzs"]/tr/td[1]/a[last()-1]/text()',
                         strip)
        loader.add_xpath('hospital_name',
                         '//table[@id="m_jkzs"]/tr/td[1]/a[last()-2]/text()',
                         strip)
        loader.add_xpath(
            'doctor_level',
            '//span[@class="selecttxt"][contains(text(),"医师") or contains(text(),"专家")]/text()',
            strip)
        loader.add_value('doctor_intro', doctor_intro, strip)
        loader.add_xpath(
            'doctor_goodAt',
            '//span[@class="titletxt11"]/b[contains(text(),"擅长")]/ancestor::span[1]/text()',
            MapCompose(remove_tags, custom_remove_tags))
        loader.add_value(
            'diagnosis_amt', diagnosis_fee,
            MapCompose(remove_tags, custom_remove_tags, get_number))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info(self, response):
    """List doctors of one department and request each detail page."""
    self.logger.info('正在抓取{}:医生信息'.format(self.hospital_name))
    doctor_list = response.xpath('//div[@class="contents2"]/ul/li')
    self.logger.info('该科室的医生个数为:{}'.format(str(len(doctor_list))))
    for each_doc in doctor_list:
        # Partially-filled loader is handed to parse_doctor_detail via meta.
        loader = MedicalMapLoader(item=DoctorInfoItem(), selector=each_doc)
        loader.add_xpath('doctor_name', 'h4[@class="name"]/text()')
        loader.add_value('hospital_name', self.hospital_name)
        loader.add_xpath('dept_name', 'p[@class="office"]/text()')
        loader.add_xpath('doctor_level', 'p[@class="post"]/text()')
        doctor_link = each_doc.xpath('a[1]/@href').extract_first('')
        if not doctor_link:
            continue
        self.headers['Referer'] = response.url
        yield Request(urljoin(self.host, doctor_link),
                      headers=self.headers,
                      callback=self.parse_doctor_detail,
                      meta={'loader': loader})
def parse_doctor_detail(self, response):
    """Emit the doctor item, then one reg item per parsed schedule entry."""
    base = '//div[@class="fleft wd740"]/div[1]/div[2]'
    clean = MapCompose(custom_remove_tags, match_special)
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name', base + '/p[2]/text()', clean)
    loader.add_xpath('dept_name', base + '/p[1]/text()', clean)
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('doctor_level', base + '/p[3]/text()', clean)
    loader.add_xpath('doctor_intro', base + '/div/p[1]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    dept_item = loader.load_item()
    yield dept_item
    # Schedule text sits in the 4th <p>; get_reg_info splits it into entries.
    reg_info = response.xpath(base + '/p[4]/text()').extract_first('')
    if reg_info:
        for each_reg_info in get_reg_info(reg_info):
            reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                       response=response)
            reg_loader.add_xpath('doctor_name', base + '/p[2]/text()', clean)
            reg_loader.add_xpath('dept_name', base + '/p[1]/text()', clean)
            reg_loader.add_value('hospital_name', self.hospital_name)
            reg_loader.add_value('reg_info', each_reg_info)
            reg_loader.add_value('update_time', now_day())
            yield reg_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Parse the doctor detail page, then extract the free-text schedule.

    The schedule line ("坐诊时间…") may be nested at several depths inside
    the about block, hence the xpath union below.
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    dept_name = response.meta['dept_name']
    doctor_name = response.meta['doctor_name']
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_value('doctor_name', doctor_name)
    loader.add_value('dept_name', dept_name)
    loader.add_value('hospital_name', self.hospital_name)
    # Intro and specialty are both cut from the same about block by the
    # get_doctor_intro2 / get_doctor_good_at processors.
    loader.add_xpath('doctor_intro', '//div[@class="right-about clearfix"]',
                     MapCompose(remove_tags, get_doctor_intro2))
    loader.add_xpath('doctor_goodAt', '//div[@class="right-about clearfix"]',
                     MapCompose(remove_tags, get_doctor_good_at))
    loader.add_value('update_time', now_day())
    doctor_item = loader.load_item()
    yield doctor_item
    # Doctor scheduling info.
    self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
        self.hospital_name))
    reg_info = response.xpath(
        '//div[@class="right-about clearfix"]/p[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/span/strong[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"上午")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"下午")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong/span[contains(text(),"坐诊时间")]/text()'
    ).extract_first('')
    if reg_info:
        reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
        reg_loader.add_value('doctor_name', doctor_name)
        reg_loader.add_value('dept_name', dept_name)
        reg_loader.add_value('hospital_name', self.hospital_name)
        reg_loader.add_value('reg_info', reg_info,
                             MapCompose(match_special, clean_info))
        reg_loader.add_value('update_time', now_day())
        reg_item = reg_loader.load_item()
        yield reg_item
def parse_doctor_info_detail(self, response):
    """Fill a DoctorInfoItem mostly from meta; only the intro comes from
    the page itself."""
    self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
    try:
        meta = response.meta
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', meta.get('doctor_name'))
        loader.add_value('dept_name', meta.get('dept_name'))
        loader.add_value('hospital_name', meta.get('hospital_name'))
        loader.add_value('doctor_level', meta.get('doctor_level'))
        loader.add_xpath('doctor_intro', '//p[@id="docSpeciality"]/text()',
                         MapCompose(custom_remove_tags, match_special))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info(self, response):
    """Get the hospital's doctor list and request each doctor detail page.

    A partially-filled loader (dept + hospital) travels to
    ``parse_doctor_detail`` via ``request.meta``.
    """
    self.logger.info('正在抓取{}:医生信息'.format(self.hospital_name))
    all_doctors = response.xpath('//div[@class="pic"]')
    dept_name = response.xpath(
        '//div[@id="FrontPublic_breadCrumb01-1468402139239"]/div/'
        'a[4]/text()').extract_first('')
    if all_doctors:  # the department page lists at least one doctor
        for each_dept_name in all_doctors:
            dept_detail_link = each_dept_name.xpath(
                'a/@href').extract_first('')
            loader = PxfybjyLoader(item=DoctorInfoItem(), response=response)
            loader.add_value('dept_name', dept_name)
            loader.add_value('hospital_name', self.hospital_name)
            if dept_detail_link:
                dept_detail_link = urljoin(self.host, dept_detail_link)
                # BUG FIX: the original wrote the referer into request.meta
                # ('Referer' meta key), which never reaches the HTTP headers.
                # Set it on self.headers before building the request, as every
                # sibling parse method in this file does.
                self.headers['Referer'] = response.url
                request = Request(dept_detail_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_detail,
                                  meta={'loader': loader})
                # Kept for backward compatibility with any meta consumers.
                request.meta['Referer'] = response.url
                yield request
def parse_doctor_reg_info(self, response):
    """Parse the doctor scheduling page.

    NOTE(review): this method builds a loader and adds two fields but never
    calls ``load_item`` or yields anything — it appears unfinished; confirm
    intended behavior before relying on it.
    """
    self.logger.info('>>>>>>正在抓取医生排班信息……')
    loader = YiHuLoader(item=DoctorInfoItem(), response=response)
    loader.add_xpath('hospital_name', '//div[@class="link-555"]/a/text()')
    loader.add_xpath('dept_name', '//div[@class="hos-info"]/h1/text()')
def parse_doctor_info(self, response):
    """Parse a department's doctor list page and follow pagination.

    Doctors with a non-empty "简介" paragraph get a detail-page request;
    the rest are emitted directly from the list data.
    """
    self.logger.info('>>>>>>正在抓取:医生信息>>>>>>')
    try:
        dept_name = response.meta.get('dept_name')
        hospital_name = response.meta.get('hospital_name')
        all_doctors = response.xpath('//div[@class="docInfo docInfo-w-h"]')
        for each_doctor_link in all_doctors:
            doctor_link = each_doctor_link.xpath(
                'p[1]/a/@href').extract_first('')
            doctor_name = each_doctor_link.xpath(
                'p[1]/a/text()').extract_first('')
            doctor_level = each_doctor_link.xpath(
                'p[2]/text()').extract_first('')
            doctor_intro = each_doctor_link.xpath(
                'p[contains(text(),"简介")]/text()').extract_first('')
            doctor_intro = match_special(doctor_intro)
            if doctor_intro:
                # Has an intro: fetch the full detail page.
                if doctor_link:
                    doctor_link = urljoin(self.host, doctor_link)
                    self.headers['Referer'] = response.url
                    yield Request(doctor_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info_detail,
                                  dont_filter=True,
                                  meta={
                                      'doctor_name': doctor_name,
                                      'doctor_level': doctor_level,
                                      'dept_name': dept_name,
                                      'hospital_name': hospital_name
                                  })
            else:
                # No intro: emit what the list page already provides.
                loader = CommonLoader2(item=DoctorInfoItem(), response=response)
                loader.add_value('doctor_name', doctor_name)
                loader.add_value('dept_name', dept_name)
                loader.add_value('hospital_name', hospital_name)
                loader.add_value('doctor_level', doctor_level)
                # loader.add_value('doctor_intro', '')
                # loader.add_xpath('doctor_goodAt', '')
                # loader.add_value('diagnosis_amt', '')
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                doctor_item = loader.load_item()
                yield doctor_item
        # Pagination: ids come from hidden form inputs on the page.
        has_next = response.xpath('//a[@class="pb_next"]')
        if has_next:
            # hos_id = re.search(r'HIS_CD=(.*?)&', response.url)
            # dept_id = re.search(r'DEP_ID=(.*?)$', response.url)
            hos_id = response.xpath(
                '//input[@name="HIS_CD"]/@value').extract_first('')
            dept_id = response.xpath(
                '//input[@name="DEP_ID"]/@value').extract_first('')
            if hos_id and dept_id:
                # hos_id = hos_id.group(1)
                # dept_id = dept_id.group(1)
                now_page = response.xpath(
                    '//a[@class="pb_on"]/text()').extract_first('')
                total_page = response.xpath(
                    '//a[contains(text(),"尾页")]/@pagval').extract_first('')
                total_doctor_num = response.xpath(
                    '//input[@name="TOT_REC_NUM"]/@value').extract_first(
                        '')
                if now_page and total_page and total_doctor_num:
                    next_page_num = int(now_page) + 1
                    total_page_num = int(total_page)
                    if next_page_num <= total_page_num:
                        # next_doctor_url is a format template with page,
                        # totals, ids and a cache-busting timestamp.
                        next_page_link = self.next_doctor_url.format(
                            str(next_page_num), total_page,
                            total_doctor_num, hos_id, dept_id, timestamp())
                        self.headers['Referer'] = response.url
                        yield Request(next_page_link,
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      dont_filter=True,
                                      meta={
                                          'dept_name': dept_name,
                                          'hospital_name': hospital_name
                                      })
    except Exception as e:
        self.logger.error('在抓取医生信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Parse the doctor detail page, then walk the weekly schedule grid.

    Detail fields are located by their <b> labels ("姓名"/"性别"/"职称"…);
    the schedule table marks open slots with a nested <div>.
    """
    hospital_name = response.meta.get('hospital_name')
    dept_name = response.meta.get('dept_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    try:
        # Doctor info.
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_xpath(
            'doctor_name', '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_name',
            '//div[@class="page_position"]/a[last()-1]/text()',
            MapCompose(custom_remove_tags))
        loader.add_xpath(
            'sex', '//td/b[contains(text(),"性别")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special,
                       clean_info2))
        loader.add_xpath(
            'doctor_level', '//td/b[contains(text(),"职称")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special,
                       clean_info2))
        loader.add_xpath(
            'doctor_intro',
            '//td/b[contains(text(),"医生简介")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, clean_info2))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        doctor_item = loader.load_item()
        yield doctor_item
        # Doctor scheduling info.
        self.logger.info(
            '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
        has_doctor_scheduling = response.xpath(
            '//td/div[@class="doctor-work"]')
        if has_doctor_scheduling:
            doctor_scheduling_tr = response.xpath(
                '//table[@class="workTable"]/tbody/tr')
            # Header row after column 1 holds the dates, joined/cleaned into
            # a comma-separated list aligned with the body columns.
            all_scheduling_date = response.xpath(
                '//table[@class="workTable"]/thead/tr/td[position()>1]'
            ).extract()
            scheduling_date_list = custom_remove_tags(
                remove_tags(','.join(all_scheduling_date))).split(',')
            for each_td in doctor_scheduling_tr:
                # Column 1 is the time slot; later columns map to dates.
                scheduling_time = each_td.xpath(
                    'td[1]/text()').extract_first('')
                scheduling_info = each_td.xpath('td[position()>1]')
                for index, each_s_i in enumerate(scheduling_info):
                    # A nested <div> marks an open slot in that cell.
                    has_scheduling = each_s_i.xpath('div')
                    if has_scheduling:
                        # First 3 chars of the header cell, e.g. the weekday.
                        each_scheduling_date = scheduling_date_list[index][
                            0:3]
                        reg_info = '{0}{1}'.format(each_scheduling_date,
                                                   scheduling_time)
                        reg_loader = CommonLoader2(
                            item=DoctorRegInfoItem(), response=response)
                        reg_loader.add_xpath(
                            'doctor_name',
                            '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
                            MapCompose(remove_tags, custom_remove_tags,
                                       match_special))
                        reg_loader.add_value(
                            'dept_name', dept_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_xpath(
                            'hospital_name',
                            '//div[@class="page_position"]/a[last()-1]/text()',
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value('reg_info', reg_info)
                        reg_loader.add_value('dataSource_from',
                                             self.data_source_from)
                        reg_loader.add_value('crawled_url', response.url)
                        reg_loader.add_value('update_time', now_day())
                        reg_item = reg_loader.load_item()
                        yield reg_item
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Parse the doctor detail page, then the booking grid for schedules.

    Ids (doctor/dept/hospital) are extracted from URLs via match_special2;
    an <a> inside a grid cell marks a bookable slot.
    """
    hospital_name = response.meta.get('hospital_name')
    dept_name = response.meta.get('dept_name')
    doctor_name = response.meta.get('doctor_name')
    self.logger.info('>>>>>>正在抓取[{}]医院-[{}]医生详细信息>>>>>>'.format(
        hospital_name, doctor_name))
    try:
        # Doctor info.
        doctor_photo_url = response.xpath(
            '//div[@class="doctor_Img"]/img/@src').extract_first('')
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name,
                         MapCompose(custom_remove_tags))
        # NOTE(review): 'sex' is read from the "doctor_grade" span — looks
        # site-specific; confirm the class really carries gender here.
        loader.add_xpath('sex', '//span[@class="doctor_grade"]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('doctor_level',
                         '//span[@class="object_grade"]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'doctor_intro', '//div[@class="doctor_Text_Major"]',
            MapCompose(remove_tags, custom_remove_tags, match_special2))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_value('doctor_id', response.url,
                         MapCompose(match_special2))
        loader.add_xpath(
            'dept_id',
            '//div[@class="position_one"]/span/a[last()]/@href',
            MapCompose(match_special2))
        loader.add_xpath(
            'hospital_id',
            '//div[@class="position_one"]/span/a[last()-1]/@href',
            MapCompose(match_special2))
        loader.add_value('doctor_photo_url',
                         urljoin(self.host, doctor_photo_url))
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        doctor_item = loader.load_item()
        yield doctor_item
        # Doctor scheduling info.
        self.logger.info(
            '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
        has_doctor_scheduling = response.xpath(
            '//span[@class="yuyue"]/a[contains(text(),"预约")]')
        if has_doctor_scheduling:
            # First row of the grid lists the time-slot labels.
            doctor_scheduling_list = response.xpath(
                '//div[@class="whliesubscribe"]/ul/li[1]/div/'
                'span/text()').extract()
            doctor_scheduling_length = len(doctor_scheduling_list)
            # Date header cells, cleaned into a comma-separated list.
            all_scheduling_date = response.xpath(
                '//div[@class="datetable"]/ul/li[position()>1]/'
                'span[1]/text()').extract()
            scheduling_date_list = custom_remove_tags(
                remove_tags(','.join(all_scheduling_date))).split(',')
            # i walks time slots (grid columns); index walks dates (rows).
            for i in range(1, doctor_scheduling_length + 1):
                scheduling_info = response.xpath(
                    '//div[@class="whliesubscribe"]/ul/li[position()>1]'
                    '/div[{}]'.format(str(i)))
                scheduling_time = doctor_scheduling_list[i - 1]
                for index, each_s_i in enumerate(scheduling_info):
                    # An <a> inside the cell means the slot is bookable.
                    has_scheduling = each_s_i.xpath('span/a')
                    if has_scheduling:
                        each_scheduling_date = scheduling_date_list[index]
                        reg_info = '{0}-{1}{2}'.format(
                            now_year(), each_scheduling_date,
                            scheduling_time)
                        reg_loader = CommonLoader2(
                            item=DoctorRegInfoItem(), response=response)
                        reg_loader.add_value(
                            'doctor_name', doctor_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value(
                            'dept_name', dept_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value('reg_info', reg_info)
                        reg_loader.add_value('dataSource_from',
                                             self.data_source_from)
                        reg_loader.add_value('crawled_url', response.url)
                        reg_loader.add_value('update_time', now_day())
                        reg_loader.add_value('doctor_id', response.url,
                                             MapCompose(match_special2))
                        reg_loader.add_xpath(
                            'dept_id',
                            '//div[@class="position_one"]/span/a[last()]/@href',
                            MapCompose(match_special2))
                        reg_loader.add_xpath(
                            'hospital_id',
                            '//div[@class="position_one"]/span/a[last()-1]/@href',
                            MapCompose(match_special2))
                        reg_loader.add_value('gmt_created', now_time())
                        reg_loader.add_value('gmt_modified', now_time())
                        reg_item = reg_loader.load_item()
                        yield reg_item
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info(self, response):
    """Parse a department's doctor-list page.

    Yields, in order:
      1. one detail-page ``Request`` per listed doctor (handled by
         ``parse_doctor_info_detail``),
      2. a ``Request`` for the next page of the current list, when present,
      3. on the first invocation only (guarded by ``doctor_crawled_cnt``),
         one ``Request`` per sibling department link.

    :param response: doctor-list page response.
    """
    self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
    doctor_links = response.xpath('//li[@class="content column-num3"]')
    for each_doctor_link in doctor_links:
        # NOTE(review): level and department both read the same li[2]
        # text — looks suspicious; kept as-is, verify against live markup.
        doctor_level = dept_name = each_doctor_link.xpath(
            'div[2]/ul/li[2]/text()').extract()
        doctor_link = each_doctor_link.xpath(
            'div[1]/div/a/@href').extract_first('')
        loader = CommonLoader2(item=DoctorInfoItem(),
                               selector=each_doctor_link)
        loader.add_xpath('doctor_name', 'div[1]/div/a/@title',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', self.hospital_name)
        if doctor_link:
            # NOTE(review): meta carries 'loader'/'dept_name'/'doctor_level';
            # confirm the callback actually consumes these keys.
            doctor_detail_request = Request(
                urljoin(self.host, doctor_link),
                headers=self.headers,
                callback=self.parse_doctor_info_detail,
                dont_filter=True,
                meta={
                    'loader': loader,
                    'dept_name': dept_name,
                    'doctor_level': doctor_level
                })
            self.headers['Referer'] = response.url
            yield doctor_detail_request

    # Next page: the department id is embedded in the current URL in one
    # of two layouts (plain '...pmcId=<id>.html' or paginated query form).
    # BUGFIX: the '.' before 'html' was unescaped, so it matched any
    # character and could swallow the last character of the id; '\.'
    # matches the literal dot only.
    dept_id = re.search(r'.*pmcId=(.*?)\.html$', response.url)
    dept_id_2 = re.search(r'.*pmcId=(.*?)&pageNo_FrontProducts.*',
                          response.url)
    if dept_id_2:
        dept_id = dept_id_2.group(1)
    elif dept_id:
        dept_id = dept_id.group(1)
    else:
        dept_id = ''
    # Page number comes from the "下一页" (next page) onclick handler's
    # first call argument, e.g. onclick="fn(3,...)" -> '3'.
    page_no = response.xpath(
        '//a[contains(text(),"下一页")]/@onclick').extract_first('')
    if page_no:
        page_no = re.search(r'\((.*?)\)', page_no)
        if page_no:
            page_no = page_no.group(1).split(',')[0]
            next_page = 'http://www.slbjy.cn/expert_list/pmcId={}&pageNo_FrontProducts_list01-1482202374862={}' \
                        '&pageSize_FrontProducts_list01-1482202374862=12.html'
            next_page_link = next_page.format(dept_id, page_no)
            next_request = Request(next_page_link,
                                   headers=self.headers,
                                   callback=self.parse_doctor_info)
            self.headers['Referer'] = response.url
            yield next_request

    # Other departments: followed only once per run, guarded by the
    # shared doctor_crawled_cnt counter.
    self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
    doctor_links = response.xpath(
        '//div[@class="menu-first"]/ul/li[position()>1]/a/@href').extract()
    self.doctor_crawled_cnt += 1
    if doctor_links and self.doctor_crawled_cnt == 1:
        for each_doctor_link in doctor_links:
            doctor_request = Request(urljoin(self.host, each_doctor_link),
                                     headers=self.headers,
                                     callback=self.parse_doctor_info,
                                     dont_filter=True)
            self.headers['Referer'] = response.url
            yield doctor_request
def parse_doctor_website(self, response):
    """Parse a doctor's personal homepage.

    Yields, in order: the doctor's ``DoctorInfoItem``; a hospital
    detail-page ``Request`` if this hospital has not been seen before;
    and a ``HospitalDepItem`` if this department has not been seen before.

    :param response: doctor homepage response.
    """
    self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')

    # Doctor profile: every field maps straight from an XPath.
    info_loader = YiHuLoader(item=DoctorInfoItem(), response=response)
    profile_fields = (
        ('doctor_name', '//span[@class="c-f22 c-333"]/text()'),
        ('dept_name', '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()'),
        ('hospital_name',
         '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()'),
        ('doctor_level', '//div[@class="doctor-info"]/dl/dd[1]/text()'),
        ('doctor_intro',
         '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()'),
        ('doctor_goodAt', '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()'),
    )
    for field_name, xpath_expr in profile_fields:
        info_loader.add_xpath(field_name, xpath_expr)
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()

    # Links to the doctor's hospital and department pages.
    hospital_page = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[1]/@href').extract_first('')
    dept_page = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[2]/@href').extract_first('')

    # Hospital details: follow once per hospital id (dedup via crawled_ids).
    if hospital_page:
        hospital_match = re.search(r'/sc/(.*?).shtml', hospital_page)
        if hospital_match and hospital_match.group(1) not in self.crawled_ids:
            self.crawled_ids.add(hospital_match.group(1))
            # Intro and contact pages share the path, differing only in
            # the '/sc/' segment.
            intro_url = re.sub(r'/sc/', '/detail/', hospital_page)
            contact_url = re.sub(r'/sc/', '/contact/', hospital_page)
            hospital_loader = YiHuLoader(item=HospitalInfoItem(),
                                         response=response)
            hospital_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            detail_request = Request(
                intro_url,
                headers=self.headers,
                callback=self.parse_hospital_detail,
                meta={
                    'loader': hospital_loader,
                    'contact_hos_link': contact_url
                })
            detail_request.meta['Referer'] = response.url
            yield detail_request

    # Department record: stored once per dept id (dedup via crawled_dept).
    if dept_page:
        dept_match = re.search(r'/arrange/(.*?).shtml', dept_page)
        if dept_match and dept_match.group(1) not in self.crawled_dept:
            self.crawled_dept.add(dept_match.group(1))
            dept_loader = YiHuLoader(item=HospitalDepItem(),
                                     response=response)
            dept_loader.add_xpath(
                'dept_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
            dept_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            dept_loader.add_value('update_time', now_day())
            yield dept_loader.load_item()