예제 #1
0
 def parse_hospital_dep(self, response):
     """
     Extract department items from a hospital page and follow each link.

     For every ``<li>`` under ``dd.ks-2`` a ``HospitalDepItem`` is yielded
     (``dept_type`` and ``dept_name`` are both read from the link text).
     When the entry carries an ``href``, a ``Request`` handled by
     ``self.parse_dept_link`` is yielded as well.

     :param response: the hospital page response.
     :return: generator of department items and follow-up requests.
     """
     self.logger.info('>>>>>>正在抓取科室信息……')
     hospital_name = response.xpath('//div[@class="hos-info"]/h1/text()').extract_first('')
     all_dept_links = response.xpath('//dd[@class="ks-2"]/ul/li')
     self.logger.info('{}:共有{}个科室'.format(hospital_name, len(all_dept_links)))
     for each_dept_link in all_dept_links:
         # Department item built relative to the current <li> selector.
         dep_loader = YiHuLoader(item=HospitalDepItem(), selector=each_dept_link)
         dep_loader.add_xpath('dept_type', 'a/text()')
         dep_loader.add_xpath('dept_name', 'a/text()')
         dep_loader.add_value('hospital_name', hospital_name)
         dep_loader.add_value('update_time', now_day())
         yield dep_loader.load_item()

         # Follow the department link to collect its doctors.
         dept_link = each_dept_link.xpath('a/@href').extract_first('')
         if dept_link:
             dept_link = urljoin(self.host, dept_link)
             # The Referer must be set BEFORE the Request is built: Scrapy
             # copies the headers at construction time, so a later change
             # to self.headers would not reach this request.
             self.headers['Referer'] = response.url
             yield Request(dept_link,
                           headers=self.headers,
                           callback=self.parse_dept_link)
예제 #2
0
 def parse_dept_info(self, response):
     """
     Parse the department JSON payload and request each department's doctors.

     The body is decoded as JSON; departments are read from
     ``responseData.data.data.depart`` and the hospital name from
     ``responseData.data.data.hospital.hospitalName``. One
     ``HospitalDepItem`` is yielded per department, followed by a
     ``FormRequest`` to ``self.doctor_link`` when the department has a
     ``deptid``.

     :param response: response whose text is the department JSON.
     :return: generator of department items and doctor-list requests.
     """
     self.logger.info('>>>>>>正在抓取医院科室相关信息……')
     dept_info = json.loads(response.text)
     # Shared prefix of every lookup below; bind it once.
     payload = dept_info['responseData']['data']['data']
     for each_dept in payload['depart']:
         loader = CommonLoader(item=HospitalDepItem(), response=response)
         loader.add_value('dept_name', each_dept['deptname'])
         loader.add_value('hospital_name', payload['hospital']['hospitalName'])
         loader.add_value('update_time', now_day())
         yield loader.load_item()

         dept_id = each_dept.get('deptid', '')
         if dept_id:
             data = {
                 'key': '',
                 'deptId': str(dept_id),
                 'pageIndex': '1',
                 'pageSize': '100'
             }
             # Set the Referer before building the request: the headers are
             # copied when the request object is constructed, so assigning
             # afterwards would leave the first request without it.
             self.headers['Referer'] = 'http://www.scgh114.com/web/register/doctor'
             yield FormRequest(self.doctor_link,
                               headers=self.headers,
                               callback=self.parse_doctor_info,
                               formdata=data,
                               meta={'dept_id': dept_id},
                               dont_filter=True)
예제 #3
0
 def parse_hospital_dep(self, response):
     """
     Walk the department listing grouped by department type.

     For each department a loader is prepared (type, hospital name, name).
     The loader itself is not turned into an item here: it is handed to
     ``self.parse_dep_detail`` through ``meta['loader']`` when the
     department has a detail link. A second request to
     ``self.parse_doctor_info`` is issued when a doctor link exists.
     """
     self.logger.info('正在抓取{}:科室信息'.format(self.hospital_name))
     for type_section in response.xpath('//div[@class="part"]/div[@class="part01"]'):
         type_label = type_section.xpath('div/div[1]/text()').extract_first('')
         dept_nodes = type_section.xpath('ul/li')
         self.logger.info('总共有{}科室'.format(len(dept_nodes)))
         for dept_node in dept_nodes:
             dept_loader = MedicalMapLoader(item=HospitalDepItem(), selector=dept_node)
             dept_loader.add_value('dept_type', type_label)
             detail_href = dept_node.xpath('div/a[1]/@href').extract_first('')
             doctor_href = dept_node.xpath('div/a[2]/@href').extract_first('')
             dept_loader.add_value('hospital_name', self.hospital_name)
             dept_loader.add_xpath('dept_name', 'h3/text()')

             if detail_href:
                 section_url = urljoin(self.host, detail_href)
                 # The listing URL doubles as Referer for the detail page.
                 self.headers['Referer'] = section_url
                 detail_url = section_url.replace('sectionshow', 'classsysdetail')
                 yield Request(detail_url,
                               headers=self.headers,
                               callback=self.parse_dep_detail,
                               meta={'loader': dept_loader})

             if doctor_href:
                 yield Request(urljoin(self.host, doctor_href),
                               headers=self.headers,
                               callback=self.parse_doctor_info)
예제 #4
0
    def parse_hospital_dep_detail(self, response):
        """
        Build a department item from a detail page and fan out to siblings.

        The item's ``dept_type`` and ``dept_name`` are both read from the
        page title, ``dept_info`` from the content block. The sibling
        department links found in ``ul.list2`` are only followed while
        ``self.dept_crawled_cnt`` equals 1 after being incremented, i.e.
        from the first detail page processed by this callback.

        :param response: department detail page response.
        :return: generator of one department item plus sibling requests.
        """
        self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
            self.hospital_name))
        loader = CommonLoader2(item=HospitalDepItem(), response=response)
        loader.add_xpath('dept_type', '//div[@class="title"]/h3/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('dept_name', '//div[@class="title"]/h3/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', self.hospital_name)
        loader.add_xpath('dept_info', '//div[@class="content"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        # Remaining departments listed in the side menu.
        self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
        dept_links = response.xpath(
            '//ul[@class="list2"]/li[position()>1]/a/@href').extract()
        self.dept_crawled_cnt += 1

        if dept_links and self.dept_crawled_cnt == 1:
            for each_dept_link in dept_links:
                dept_request = Request(urljoin(self.host, each_dept_link),
                                       headers=self.headers,
                                       callback=self.parse_hospital_dep_detail,
                                       dont_filter=True)
                # Referer is an HTTP header, so it has to be written to the
                # request headers; storing it only in meta never sends it.
                dept_request.headers['Referer'] = response.url
                # Kept for backward compatibility with any code that reads
                # the value back from response.meta.
                dept_request.meta['Referer'] = response.url
                yield dept_request
예제 #5
0
    def parse_hospital_dep(self, response):
        """
        Collect department links from the hospital page.

        A partially filled loader is created for every department anchor
        and forwarded, together with the department and hospital names,
        to ``self.parse_hospital_dep_detail`` via request meta. Anchors
        lacking a name or a link produce no request. Any exception is
        logged and ends the generator.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
        try:
            dept_anchors = response.xpath(
                '//div[@id="one_2"]/div/div/table/tbody/tr/td[@class="contentTd"]/a'
            )
            for anchor in dept_anchors:
                dept_name = anchor.xpath('text()').extract_first('')
                detail_href = anchor.xpath('@href').extract_first('')

                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                # Both textual fields go through custom_remove_tags.
                for field, raw_value in (('dept_name', dept_name),
                                         ('hospital_name', hospital_name)):
                    dept_loader.add_value(field, raw_value,
                                          MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from', self.data_source_from)
                dept_loader.add_value('update_time', now_day())

                if not (dept_name and detail_href):
                    continue

                # Hand the loader over to the detail callback.
                self.headers['Referer'] = response.url
                request_meta = {
                    'dept_name': dept_name,
                    'dept_loader': dept_loader,
                    'hospital_name': hospital_name
                }
                yield Request(urljoin(self.host, detail_href),
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail,
                              meta=request_meta,
                              dont_filter=True)
        except Exception as e:
            self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #6
0
 def parse_dept_info(self, response):
     """
     Handle a first-level department page.

     When the page lists second-level departments (``div.pic`` blocks),
     each of their links is requested with ``self.parse_dept_detail``.
     Otherwise the first-level department itself is emitted as a
     ``HospitalDepItem`` carrying only type, hospital name and update time.

     :param response: response with ``meta['dep_type']`` set by the caller.
     :return: generator of requests or a single department item.
     :raises KeyError: if ``dep_type`` is missing from ``response.meta``.
     """
     dep_type = response.meta['dep_type']
     self.logger.info('正在抓取[{}]科室信息'.format(custom_remove_tags(dep_type)))
     all_dept_names = response.xpath('//div[@class="pic"]')
     if all_dept_names:
         # The first-level department has second-level departments.
         for each_dept_name in all_dept_names:
             dept_detail_link = each_dept_name.xpath(
                 'a/@href').extract_first('')
             if dept_detail_link:
                 dept_detail_link = urljoin(self.host, dept_detail_link)
                 request = Request(dept_detail_link,
                                   headers=self.headers,
                                   callback=self.parse_dept_detail,
                                   meta={'dep_type': dep_type})
                 # Referer is an HTTP header: write it to the request
                 # headers, otherwise it is never sent to the server.
                 request.headers['Referer'] = response.url
                 # Kept for backward compatibility with code that may read
                 # the value back from response.meta.
                 request.meta['Referer'] = response.url
                 yield request
     else:
         # The first-level department has no second-level departments.
         loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
         loader.add_value('dept_type', dep_type)
         loader.add_value('hospital_name', self.hospital_name)
         loader.add_value('update_time', now_day())
         yield loader.load_item()
예제 #7
0
 def parse_hospital_dep(self, response):
     """
     Convert the sub-department JSON list into ``HospitalDepItem`` objects.

     The response text is decoded as JSON and ``data.subDepList`` is
     iterated; hospital name and department type come from request meta.
     Any exception (including a missing ``data`` / ``subDepList``) is
     logged and ends the generator.
     """
     meta = response.meta
     hospital_name = meta.get('hospital_name')
     dept_type = meta.get('dept_type')
     self.logger.info('>>>>>>正在抓取:[{}]医院-[{}]科室信息>>>>>>'.format(hospital_name, dept_type))
     try:
         payload = json.loads(response.text)
         for sub_dept in payload.get('data').get('subDepList'):
             sub_name = sub_dept.get('name')
             sub_id = sub_dept.get('id')
             dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
             # Textual fields are passed through custom_remove_tags.
             for field, raw_value in (('dept_name', sub_name),
                                      ('dept_type', dept_type),
                                      ('hospital_name', hospital_name)):
                 dept_loader.add_value(field, raw_value, MapCompose(custom_remove_tags))
             dept_loader.add_value('dataSource_from', self.data_source_from)
             dept_loader.add_value('crawled_url', response.url)
             dept_loader.add_value('update_time', now_day())
             dept_loader.add_value('dept_id', sub_id)
             dept_loader.add_value('dept_url', response.url)
             dept_loader.add_value('gmt_created', now_time())
             dept_loader.add_value('gmt_modified', now_time())
             yield dept_loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #8
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     The department name is taken from ``response.meta['dept_name']``
     (a missing key raises ``KeyError``); the description is read from the
     ``div.fleft.wd740`` block of the page.
     """
     name_from_meta = response.meta['dept_name']
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     detail_loader.add_value('dept_name', name_from_meta)
     detail_loader.add_value('hospital_name', self.hospital_name)
     detail_loader.add_xpath('dept_info', '//div[@class="fleft wd740"]')
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #9
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     The name comes from ``response.meta['dept_name']`` and is processed
     with ``match_special2``; the description is the tag-stripped content
     of the ``div`` styled with ``text-indent: 2em``.
     """
     name_from_meta = response.meta['dept_name']
     info_xpath = '//div[@style="text-indent: 2em"]'
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     detail_loader.add_value('dept_name', name_from_meta, MapCompose(match_special2))
     detail_loader.add_value('hospital_name', self.hospital_name)
     detail_loader.add_xpath('dept_info', info_xpath, MapCompose(remove_tags))
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #10
0
    def parse_hospital_dep(self, response):
        """
        Emit department items and request each department's doctor list.

        Departments are grouped in ``div.deptList-block`` blocks; the group
        label is used as ``dept_type``. The doctor-list URL is derived from
        the anchor's ``onclick`` (``goNext(<hospital_id>,'<dept_id>');``)
        when present, otherwise from its ``href``. No request is issued
        when neither yields a usable URL. Any exception is logged and ends
        the generator.

        :param response: hospital page response; ``meta['hospital_name']``
            is read when available.
        :return: generator of department items and doctor-list requests.
        """
        self.logger.info('>>>>>>正在抓取科室信息>>>>>>')
        try:
            hospital_name = response.meta.get('hospital_name')
            all_dept_links = response.xpath(
                '//div[@class="deptList-block mb20 clearfix"]')
            for each_dept_link in all_dept_links:
                dept_type = each_dept_link.xpath('b/text()').extract_first('')
                dept_info = each_dept_link.xpath('ul/li/a')
                for each_dept_info in dept_info:
                    # Department item.
                    dept_name = each_dept_info.xpath('@title').extract_first(
                        '')
                    dept_link = each_dept_info.xpath('@onclick').extract_first(
                        '')
                    dept_link2 = each_dept_info.xpath('@href').extract_first(
                        '')
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name)
                    dept_loader.add_value('dept_type', dept_type)
                    dept_loader.add_value('hospital_name', hospital_name)
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    yield dept_loader.load_item()

                    # Doctor list URL for this department.
                    doctor_list_url = None
                    if dept_link:
                        res = re.search(r'goNext\((.*?),\'(.*)\'\);',
                                        dept_link)
                        if res:
                            doctor_list_url = self.doctor_list_url.format(
                                res.group(1), res.group(2))
                    elif dept_link2:
                        # Only join a non-empty href: urljoin(base, '')
                        # returns the base itself, which would request the
                        # site root as if it were a doctor list.
                        doctor_list_url = urljoin(self.host, dept_link2)

                    if doctor_list_url:
                        self.headers['Referer'] = response.url
                        yield Request(doctor_list_url,
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      meta={
                                          'dept_name': dept_name,
                                          'hospital_name': hospital_name
                                      })

        except Exception as e:
            self.logger.error('在抓取科室信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #11
0
    def parse_hospital_dep(self, response):
        """
        Crawl a department page, descending into sub-pages where they exist.

        Behaviour, in order:

        * if the breadcrumb offers "更多" (more) links, each is re-parsed by
          this same callback;
        * otherwise a department-introduction link is requested with
          ``self.parse_hospital_dep_detail``, either one whose text
          contains "科室介绍"/"简介", or the second menu entry when its
          text equals the third breadcrumb entry;
        * otherwise the page has no second-level department and an item is
          emitted directly from the breadcrumb.

        Finally, the other departments in the left menu are scheduled, but
        only while ``self.dept_crawled_cnt`` is at most 1 after being
        incremented.

        :param response: department page response.
        :return: generator of requests and/or one department item.
        """
        self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
        has_more_dept = response.xpath('//div[@id="current"]/span/a[contains(text(),"更多")]/@href').extract()
        if has_more_dept:
            for each_dept_link in has_more_dept:
                # The Referer must be in place before the Request is built:
                # the headers are copied at construction time, so a later
                # assignment would not affect the request being yielded.
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, each_dept_link),
                              headers=self.headers,
                              callback=self.parse_hospital_dep)
        else:
            dept_detail_link = response.xpath('//div[@class="list1"]/ul/li/a[contains(text(),"科室介绍") or '
                                              'contains(text(), "简介")]/@href').extract_first('')
            dept_name1 = response.xpath('//div[@class="list1"]/ul/li[2]/a/text()').extract_first('')
            dept_name2 = response.xpath('//div[@id="current"]/a[3]/text()').extract_first('')
            dept_detail_link2 = response.xpath('//div[@class="list1"]/ul/li[2]/a/@href').extract_first('')
            if dept_detail_link:
                # The introduction link text contains "科室介绍" or "简介".
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, dept_detail_link),
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail)
            elif dept_name1 == dept_name2 and dept_detail_link2:
                # The introduction link text contains neither keyword.
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, dept_detail_link2),
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail)
            else:
                # No second-level department exists.
                loader = CommonLoader2(item=HospitalDepItem(), response=response)
                loader.add_xpath('dept_type',
                                 '//div[@id="current"]/a[2]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_value('hospital_name', self.hospital_name)
                loader.add_xpath('dept_name',
                                 '//div[@id="current"]/a[2]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_value('update_time', now_day())
                yield loader.load_item()

        # Schedule the other departments listed in the left menu.
        other_dept_links = response.xpath('//div[@id="left1"]/span[position()>1]/a/@href').extract()
        self.dept_crawled_cnt += 1
        if self.dept_crawled_cnt <= 1 and other_dept_links:
            for each_other_dept in other_dept_links:
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, each_other_dept),
                              headers=self.headers,
                              callback=self.parse_hospital_dep)
예제 #12
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     The name is taken from ``response.meta['dept_name']``; the description
     is the ``div.right-about.clearfix`` block with tags removed and then
     processed by ``clean_info``.
     """
     log_line = '>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(self.hospital_name)
     self.logger.info(log_line)
     name_from_meta = response.meta['dept_name']
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     detail_loader.add_value('dept_name', name_from_meta)
     detail_loader.add_value('hospital_name', self.hospital_name)
     info_processor = MapCompose(remove_tags, clean_info)
     detail_loader.add_xpath('dept_info',
                             '//div[@class="right-about clearfix"]',
                             info_processor)
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #13
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item scraped entirely from the detail page.

     Name and description are both read from the ``div.list-item.fl``
     block (``h1`` text and ``p`` elements respectively). No ``dept_type``
     is set by this callback.
     """
     log_line = '>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(self.hospital_name)
     self.logger.info(log_line)
     container = '//div[@class="list-item fl"]'
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     detail_loader.add_xpath('dept_name', container + '/h1/text()',
                             MapCompose(custom_remove_tags))
     detail_loader.add_value('hospital_name', self.hospital_name)
     detail_loader.add_xpath('dept_info', container + '/p',
                             MapCompose(remove_tags, custom_remove_tags))
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #14
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item scraped entirely from the detail page.

     Department name, hospital name and description are all read from the
     page; the data source and the crawled URL are recorded alongside.
     """
     self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
     # (field, xpath, processors) in the order the fields are populated.
     page_fields = (
         ('dept_name', '//div[@class="zrys"]/p/strong/text()',
          (custom_remove_tags,)),
         ('hospital_name', '//div[@class="yy_til"]/h2/text()',
          (custom_remove_tags,)),
         ('dept_info', '//div[@class="zrys"]/dl/dd',
          (remove_tags, custom_remove_tags)),
     )
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     for field, xpath, processors in page_fields:
         detail_loader.add_xpath(field, xpath, MapCompose(*processors))
     detail_loader.add_value('dataSource_from', self.data_source_from)
     detail_loader.add_value('crawled_url', response.url)
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #15
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     ``dept_type`` and ``dept_name`` are required keys of ``response.meta``
     (a missing key raises ``KeyError``). The description is built from
     the paragraphs after the second one inside ``div.rightPanel``.
     """
     log_line = '>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(self.hospital_name)
     self.logger.info(log_line)
     type_from_meta = response.meta['dept_type']
     name_from_meta = response.meta['dept_name']
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     detail_loader.add_value('dept_name', name_from_meta)
     detail_loader.add_value('hospital_name', self.hospital_name)
     detail_loader.add_value('dept_type', type_from_meta)
     info_xpath = '//div[@class="rightPanel"]/p[position()>2]'
     detail_loader.add_xpath('dept_info', info_xpath,
                             MapCompose(remove_tags, custom_remove_tags))
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #16
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     The name comes from ``response.meta['dept_name']`` (``KeyError`` when
     absent); the description is read from the ``departmentintro`` content
     block. No ``dept_type`` is set by this callback.
     """
     log_line = '>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(self.hospital_name)
     self.logger.info(log_line)
     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     name_from_meta = response.meta['dept_name']
     detail_loader.add_value('dept_name', name_from_meta,
                             MapCompose(custom_remove_tags))
     detail_loader.add_value('hospital_name', self.hospital_name)
     info_xpath = '//div[@class="content-left pull-left departmentintro"]'
     detail_loader.add_xpath('dept_info', info_xpath,
                             MapCompose(remove_tags, custom_remove_tags))
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #17
0
 def parse_dept_detail(self, response):
     """
     Emit one department item for a second-level department page.

     The department type is taken from ``response.meta['dep_type']``
     (``KeyError`` when absent); name and description paragraphs are
     extracted from the page and added to the loader as plain values.
     """
     self.logger.info('正在抓取{}:科室详细信息'.format(self.hospital_name))
     detail_loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
     name_text = response.xpath('//li[@class="name1"]/text()').extract_first('')
     info_paragraphs = response.xpath(
         '//div[@class="FrontProducts_detail02-1468396987105_htmlbreak"]/p'
     ).extract()
     # Field values in the order they are added to the loader.
     field_values = (
         ('dept_type', response.meta['dep_type']),
         ('dept_name', name_text),
         ('hospital_name', self.hospital_name),
         ('dept_info', info_paragraphs),
         ('update_time', now_day()),
     )
     for field, value in field_values:
         detail_loader.add_value(field, value)
     yield detail_loader.load_item()
예제 #18
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     The name is read from ``div.page_sum2_tit``, the type from
     ``response.meta['dept_type']`` (``KeyError`` when absent), and the
     description from every child of ``div.page_sum2`` whose class contains
     none of ``listsum_block2``, ``page_tit`` or ``page_sum2_tit``.

     :param response: department detail page response.
     :return: generator yielding a single department item.
     """
     self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
         self.hospital_name))
     loader = CommonLoader2(item=HospitalDepItem(), response=response)
     loader.add_xpath('dept_name', '//div[@class="page_sum2_tit"]/text()',
                      MapCompose(custom_remove_tags, clean_info))
     loader.add_value('hospital_name', self.hospital_name)
     loader.add_value('dept_type', response.meta['dept_type'])
     # Adjacent literals are concatenated verbatim, so each fragment needs
     # its own trailing space after ``and``; without it the expression
     # reads ``andnot(...)``, which is not a valid XPath operator.
     loader.add_xpath(
         'dept_info', '//div[@class="page_sum2"]/*['
         'not(contains(@class,"listsum_block2")) and '
         'not(contains(@class,"page_tit")) and '
         'not(contains(@class,"page_sum2_tit"))]',
         MapCompose(remove_tags, custom_remove_tags))
     loader.add_value('update_time', now_day())
     yield loader.load_item()
예제 #19
0
    def parse_hospital_dep(self, response):
        """
        Collect department links from the ``#deptlist`` table.

        A row's first cell carries the department type only when it has a
        ``rowspan``; the most recent type is remembered on
        ``self.temp_dept_type`` and applied to the following rows. For each
        cell with both a name and a link, a partially filled loader is
        forwarded to ``self.parse_hospital_dep_detail`` via request meta.
        Any exception is logged and ends the generator.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
        try:
            for table_row in response.xpath('//table[@id="deptlist"]/tr'):
                row_type = table_row.xpath(
                    'td[1][@rowspan]/text()').extract_first('')
                if row_type:
                    self.temp_dept_type = row_type

                for cell in table_row.xpath('td[not(@rowspan)]'):
                    dept_name = cell.xpath('a/text()').extract_first('')
                    detail_href = cell.xpath('a/@href').extract_first('')
                    if not (dept_name and detail_href):
                        continue

                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    # Textual fields are passed through custom_remove_tags.
                    for field, raw_value in (
                            ('dept_name', dept_name),
                            ('hospital_name', hospital_name),
                            ('dept_type', self.temp_dept_type)):
                        dept_loader.add_value(field, raw_value,
                                              MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())

                    # Hand the loader over to the detail callback.
                    self.headers['Referer'] = response.url
                    request_meta = {
                        'dept_name': dept_name,
                        'dept_loader': dept_loader,
                        'hospital_name': hospital_name
                    }
                    yield Request(urljoin(self.host, detail_href),
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep_detail,
                                  meta=request_meta,
                                  dont_filter=True)
        except Exception as e:
            self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #20
0
    def parse_hospital_dep(self, response):
        """
        Collect department links grouped under ``div.lab-list``.

        For each department a partially filled loader is created; when the
        department has both a name and a link, the loader is forwarded to
        ``self.parse_hospital_dep_detail`` via request meta together with
        the text read from ``span/b[1]`` (passed on as ``dept_doctor_cnt``).
        Any exception is logged and ends the generator.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
        try:
            for group in response.xpath('//div[@class="lab-list"]/div'):
                dept_type = group.xpath('div/a/text()').extract_first('')
                for entry in group.xpath('ul/li'):
                    dept_name = entry.xpath('a/text()').extract_first('')
                    doctor_cnt = entry.xpath(
                        'span/b[1]/text()').extract_first('')
                    detail_href = entry.xpath('a/@href').extract_first('')

                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    for field, raw_value in (('dept_name', dept_name),
                                             ('dept_type', dept_type)):
                        dept_loader.add_value(field, raw_value,
                                              MapCompose(custom_remove_tags))
                    dept_loader.add_xpath('hospital_name',
                                          '//div[@class="l"]/h2/text()',
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())

                    if not (dept_name and detail_href):
                        continue

                    # Hand the loader over to the detail callback.
                    self.headers['Referer'] = response.url
                    request_meta = {
                        'dept_name': dept_name,
                        'dept_loader': dept_loader,
                        'dept_doctor_cnt': doctor_cnt,
                        'hospital_name': hospital_name
                    }
                    yield Request(urljoin(self.host, detail_href),
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep_detail,
                                  meta=request_meta,
                                  dont_filter=True)
        except Exception as e:
            self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #21
0
 def parse_hospital_dep_detail(self, response):
     """
     Emit one department item for a department detail page.

     Hospital name, department type and department name are read from
     ``response.meta``. Nothing is yielded unless both the department name
     and the hospital name are truthy. The description comes from the
     ``#schedule_jienr`` block.
     """
     meta = response.meta
     hospital_name = meta.get('hospital_name')
     self.logger.info('>>>>>>正在抓取[{}]科室详细信息>>>>>>'.format(hospital_name))
     dept_type = meta.get('dept_type')
     dept_name = meta.get('dept_name')
     if not (dept_name and hospital_name):
         return

     detail_loader = CommonLoader2(item=HospitalDepItem(), response=response)
     # Meta-provided fields are passed through custom_remove_tags.
     for field, raw_value in (('dept_name', dept_name),
                              ('hospital_name', hospital_name),
                              ('dept_type', dept_type)):
         detail_loader.add_value(field, raw_value,
                                 MapCompose(custom_remove_tags))
     detail_loader.add_xpath('dept_info', '//div[@id="schedule_jienr"]',
                             MapCompose(remove_tags, custom_remove_tags))
     detail_loader.add_value('dataSource_from', self.data_source_from)
     detail_loader.add_value('update_time', now_day())
     yield detail_loader.load_item()
예제 #22
0
    def parse_hospital_info(self, response):
        """
        Emit the hospital item followed by one item per listed department.

        Address, phone and introduction are cut out of the tag-stripped
        text of the ``td.title_yh14`` cells with ``get_hospital_info``
        using the label pairs shown below. Departments are read from the
        ``tr.h_bottom`` rows: the first cell is the type, the links in the
        second cell are the names. Any exception is logged and ends the
        generator.
        """
        self.logger.info('>>>>>>正在抓取:医院详细信息和科室信息>>>>>>')
        try:
            # ---- hospital item ----
            hospital_type = response.meta.get('hospital_type')
            if hospital_type:
                hospital_category = '{0}{1}'.format(hospital_type, '医院')
            else:
                hospital_category = None

            raw_cells = response.xpath('//td[@class="title_yh14"]').extract()
            hospital_info = custom_remove_tags(remove_tags(''.join(raw_cells)))

            hospital_address = get_hospital_info(hospital_info, '地址:', '电话:')
            if hospital_address:
                hospital_address = hospital_address.replace('查看地图', '')
            else:
                hospital_address = None

            hospital_phone = get_hospital_info(hospital_info, '电话:', '官网')

            hospital_intro = get_hospital_info(hospital_info, '简介:', '$')
            if hospital_intro:
                hospital_intro = hospital_intro.replace('...更多&gt;&gt;', '')
            else:
                hospital_intro = None

            # Row that holds the city / county links.
            region_row = '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]'

            info_loader = CommonLoader2(item=HospitalInfoItem(),
                                        response=response)
            info_loader.add_xpath('hospital_name',
                                  '//span[@class="title"]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_xpath('hospital_level',
                                  '//span[@class="dj"]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_category', hospital_category)
            info_loader.add_value('hospital_addr', hospital_address,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_pro', '山西省')
            info_loader.add_xpath('hospital_city',
                                  region_row + '/a[1]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_xpath('hospital_county',
                                  region_row + '/a[2]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_phone', hospital_phone,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_intro', hospital_intro,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('registered_channel', self.data_source_from)
            info_loader.add_value('dataSource_from', self.data_source_from)
            info_loader.add_value('update_time', now_day())
            yield info_loader.load_item()

            # ---- department items ----
            self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
            for type_row in response.xpath('//tr[@class="h_bottom"]'):
                dept_type = type_row.xpath('td[1]/text()').extract_first('')
                name_texts = type_row.xpath(
                    'td[2]/table/tr/td/a/text()').extract()
                for name_text in name_texts:
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    for field, raw_value in (('dept_name', name_text),
                                             ('dept_type', dept_type)):
                        dept_loader.add_value(
                            field, raw_value,
                            MapCompose(custom_remove_tags, match_special2))
                    dept_loader.add_xpath('hospital_name',
                                          '//span[@class="title"]/text()',
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    yield dept_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
예제 #23
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page and schedule department/doctor requests.

        ``hospital_name``, ``hospital_id``, ``data_type`` and
        ``province_name`` are read from ``response.meta``.  ``data_type``
        selects one of two page layouts:

        * ``'1'``: yields a ``HospitalInfoItem``, then one request per
          first-level department (callback ``parse_hospital_dep``) and one
          doctor-list request (callback ``parse_doctor_info``).
        * ``'2'``: yields a ``HospitalInfoItem``, then for every department
          entry a ``HospitalDepItem`` followed by a doctor-list request.

        Any other ``data_type`` yields nothing.  An exception raised while
        parsing is logged and ends the generator; it is not re-raised.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            hospital_id = response.meta.get('hospital_id')
            data_type = response.meta.get('data_type')
            hospital_pro = response.meta.get('province_name')
            if data_type == '1':
                hospital_address = response.xpath(
                    '//div[@class="search-result-hospital-text"]/'
                    'p[4]/text()').extract_first('')
                hospital_phone = response.xpath(
                    '//div[@class="search-result-hospital-text"]/'
                    'p[3]/text()').extract_first('')
                # If the "phone" paragraph has no run of 6+ digits and no
                # address was found, that paragraph is used as the address.
                check_phone = re.search(r'(\d{6,})', hospital_phone)
                if not check_phone and not hospital_address:
                    hospital_address = hospital_phone
                    hospital_phone = ''
                # City and district are taken from the first row of the
                # table that transform() builds from the address text.
                first_rows = transform([hospital_address]).head()
                hospital_city = first_rows['市'][0]
                hospital_county = first_rows['区'][0]
                if hospital_pro in MUNICIPALITY2:
                    # Names listed in MUNICIPALITY2 are stored as the city
                    # and the province field is left empty.
                    hospital_city = '{0}{1}'.format(hospital_pro, '市')
                    hospital_pro = ''
                else:
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//span[@class="search-result-hospital-name"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_level',
                                 '//div[@class="search-result-hospital-text"]/p[2]/text()',
                                 MapCompose(custom_remove_tags, clean_info2))
                loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_value('hospital_phone', hospital_phone, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//div[@class="search-result-hospital-text"]/p[5]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                # Absolute path: a relative 'div[...]' evaluated from the
                # response root cannot reach a div nested inside the page.
                loader.add_xpath('hospital_img_url',
                                 '//div[@class="search-result-hospital-img"]/img/@src')
                loader.add_value('hospital_tags', '1')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                hospital_item = loader.load_item()
                yield hospital_item

                # Departments: each first-level department is requested
                # separately so its second-level departments can be parsed.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
                for each_dept in all_dept:
                    each_dept_id = each_dept.xpath('@id').extract_first('')
                    each_dept_type = each_dept.xpath('div/span/text()').extract_first('')
                    self.headers['Referer'] = response.url
                    dept_link = self.dept_url.format(hospital_id, each_dept_id)
                    yield Request(dept_link,
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_type': each_dept_type
                                  })

                # Doctors: request the doctor list built from doctor_url
                # with '1' as its second format argument.
                self.headers['Referer'] = response.url
                doctor_info_link = self.doctor_url.format(hospital_id, '1')
                yield Request(doctor_info_link,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id
                              })
            elif data_type == '2':
                hospital_address = response.xpath(
                    '//p[@class="hospital-private-address-line fc-6"]'
                    '[contains(text(),"地址")]/text()').extract_first('')
                # City and district are taken from the first row of the
                # table that transform() builds from the address text.
                first_rows = transform([hospital_address]).head()
                hospital_city = first_rows['市'][0]
                hospital_county = first_rows['区'][0]
                if hospital_pro in MUNICIPALITY2:
                    hospital_city = '{0}{1}'.format(hospital_pro, '市')
                    hospital_pro = ''
                else:
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//p[@class="hospital-private-content-tit"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_value('hospital_addr', hospital_address,
                                 MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_phone',
                                 '//div[@class="search-result-hospital-text"]/p[3]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//li[@id="address"]/p[3]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_tags', '2')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                hospital_item = loader.load_item()
                yield hospital_item

                # Departments: every <li> after the first one is a department.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[position()>1]')
                for each_dept in all_dept:
                    dept_id = each_dept.xpath('div/@id').extract_first('')
                    dept_name = each_dept.xpath('div/span/text()').extract_first('')
                    dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
                    dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from', self.data_source_from)
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    # The element id carries a 'subDepLi-' prefix that is
                    # stripped to obtain the department id.
                    dept_loader.add_value('dept_id', dept_id.replace('subDepLi-', ''))
                    dept_loader.add_value('dept_url', response.url)
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    dept_item = dept_loader.load_item()
                    yield dept_item

                    # Doctors.
                    # NOTE(review): the URL is identical for every department
                    # (only meta['dept_name'] differs); confirm this is
                    # intended and not dropped by the duplicate filter.
                    self.headers['Referer'] = response.url
                    doctor_info_link = self.doctor_url.format(hospital_id, '1')
                    yield Request(doctor_info_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_name': dept_name
                                  })
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #24
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page: hospital item, departments, doctors.

        ``hospital_name`` and ``hospital_id`` are read from
        ``response.meta``.  Yields one ``HospitalInfoItem`` (province and
        city are fixed to 福建省 / 厦门市), then one ``HospitalDepItem`` per
        department entry.  For a department that has both a link and a
        non-zero doctor count, a request handled by ``parse_doctor_info`` is
        yielded as well.

        An exception raised while parsing is logged and ends the generator;
        it is not re-raised.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
        try:
            hospital_id = response.meta.get('hospital_id')
            hospital_img_url = response.xpath(
                '//div[@class="divLeft_Img"]/img/@src').extract_first('')
            # Only build an absolute URL when an image was actually found.
            hospital_img_url = urljoin(
                self.host, hospital_img_url) if hospital_img_url else ''
            hospital_address = response.xpath(
                '//li[contains(text(),"地址")]/text()').extract_first('')
            hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                          match_special2(hospital_address))
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', '福建省')
            loader.add_value('hospital_city', '厦门市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//li[contains(text(),"电话")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_official_website',
                             '//li[contains(text(),"官网")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_route',
                             '//li[contains(text(),"公交线路")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_img_url', hospital_img_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Department information.
            # The introduction markup is queried on the whole page, so it is
            # the same for every department; fetch it once up front.
            dept_info = ''.join(
                response.xpath(
                    '//p[contains(text(),"科室简介")]/ancestor::tr[1]').
                extract())
            all_dept_info = response.xpath(
                '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
            for each_dept_info in all_dept_info:
                dept_type = each_dept_info.xpath(
                    'div[1]/span/text()').extract_first('')
                dept_names = each_dept_info.xpath('div[2]/div[1]')
                for each_dept_name in dept_names:
                    dept_name = each_dept_name.xpath('a/text()').extract_first(
                        '')
                    dept_link = each_dept_name.xpath('a/@href').extract_first(
                        '')
                    doctor_num_of_dept = each_dept_name.xpath(
                        'span/text()').extract_first('')
                    dept_url = urljoin(self.host, dept_link)

                    # Head count: the first run of digits in the span text,
                    # or None when the text is empty or has no digits.
                    num_match = re.search(r'(\d+)', doctor_num_of_dept)
                    dept_person_num = int(
                        num_match.group(1)) if num_match else None

                    # Department item.
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value(
                        'dept_info', dept_info,
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special2))
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    # dept_id is produced from the link by match_special2.
                    dept_loader.add_value('dept_id', dept_link,
                                          MapCompose(match_special2))
                    dept_loader.add_value('hospital_id', hospital_id)
                    dept_loader.add_value('dept_person_num', dept_person_num)
                    dept_loader.add_value('dept_url', dept_url)
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    dept_item = dept_loader.load_item()
                    yield dept_item

                    # Doctors: only follow departments that have a link and
                    # a non-zero doctor count.
                    if dept_link and dept_person_num:
                        self.headers['Referer'] = response.url
                        yield Request(dept_url,
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      dont_filter=True,
                                      meta={
                                          'hospital_name': hospital_name,
                                          'dept_name': dept_name,
                                      })
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
예제 #25
0
    def parse_area(self, response):
        """Parse the hospital list page of one area.

        List entries whose link does not contain ``index`` are followed with
        a request handled by ``parse_hospital_detail``.  The remaining
        entries are parsed directly from the list page; each yields a
        ``HospitalInfoTestItem``, one ``HospitalDepItem`` per key department
        and one ``HospitalAliasItem`` per alias found in the entry title.

        ``self.total_hospital_cnt`` is increased by the number of hospitals
        listed on the page.  An exception raised while iterating the
        hospitals is logged and ends the generator; it is not re-raised.
        """
        hospital_city = response.meta.get('area_city', '默认城市')
        self.logger.info('>>>>>>正在抓取[{}]医院列表……>>>>>>'.format(hospital_city))

        # Province / city / county come from the breadcrumb and the heading.
        municipality = ['北京市', '上海市', '重庆市', '天津市']
        pro_or_city = response.xpath(
            '//table[@class="nav"]/tr/'
            'td/a[3]/text()').extract_first('').replace('医院列表', '')
        if pro_or_city:
            if pro_or_city.strip() in municipality:
                # Municipality: the breadcrumb gives the city and the
                # heading (minus the city name) gives the district.
                hos_prov = ''
                hos_city = pro_or_city
                hos_county = response.xpath('//h1[@id="firstHeading"]/text()'
                                            ).extract_first('').replace(
                                                hos_city, '')
            else:
                # Otherwise the breadcrumb gives the province and the
                # heading gives the city; the county is resolved per
                # hospital from its address further down.
                hos_prov = pro_or_city
                hos_city = response.xpath('//h1[@id="firstHeading"]'
                                          '/text()').extract_first('').replace(
                                              '医院列表',
                                              '').replace(hos_prov, '')
                hos_county = ''
        else:
            hos_prov = hos_city = hos_county = None

        # Entries with their own detail page (link does not contain "index").
        all_hospital_list2 = response.xpath(
            '//h2/span[contains(text(),"医院列表")]/'
            'following::ul[1]/li/b/a[not(contains(@href,"index"))]')
        # Entries without a detail page; the enclosing <li> is selected.
        special_hospital_list = response.xpath(
            '//h2/span[contains(text(),"医院列表")]/'
            'following::ul[1]/li/b/a[(contains(@href,"index"))]/ancestor::li[1]'
        )
        area_hos_cnt = len(all_hospital_list2) + len(special_hospital_list)
        self.logger.info('>>>>>>[{}]总共有{}家医院……>>>>>>'.format(
            hospital_city, str(area_hos_cnt)))
        self.total_hospital_cnt += area_hos_cnt
        # NOTE(review): this connect() runs on every call of the callback;
        # consider connecting once during spider setup.
        self.crawler.signals.connect(self.output_statistics,
                                     signals.spider_closed)

        # Alias list enclosed in ASCII or full-width (U+FF08 / U+FF09)
        # parentheses.
        alias_pattern = re.compile(r'[(\uff08](.*?)[)\uff09]')
        try:
            # Hospitals that have a detail page.
            for each_hospital in all_hospital_list2:
                hospital_name = each_hospital.xpath('text()').extract_first('')
                hospital_link = each_hospital.xpath('@href').extract_first('')
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, hospital_link),
                              headers=self.headers,
                              callback=self.parse_hospital_detail,
                              meta={'hospital_name': hospital_name},
                              dont_filter=True)
            # Hospitals that have no detail page.
            for each_special_hospital in special_hospital_list:
                hospital_name = each_special_hospital.xpath(
                    'b/a/text()').extract_first('')
                hospital_url = each_special_hospital.xpath(
                    'b/a/@href').extract_first('')
                hospital_address = each_special_hospital.xpath(
                    'ul[1]/li/b[contains(text(),'
                    '"医院地址")]/ancestor::li[1]/text()').extract_first('')
                # Resolve the county per hospital. The page-level value must
                # not be overwritten, otherwise the county derived from the
                # first hospital's address would be reused for all others.
                hospital_county = hos_county if hos_county else get_county(
                    hos_prov, hos_city, hospital_address)
                loader = CommonLoader2(item=HospitalInfoTestItem(),
                                       selector=each_special_hospital)
                loader.add_value('hospital_name', hospital_name)
                loader.add_xpath(
                    'hospital_level',
                    'ul[1]/li/b[contains(text(),"医院等级")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_category',
                    'ul[1]/li/b[contains(text(),"医院类型")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_addr',
                    'ul[1]/li/b[contains(text(),"医院地址")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_pro', hos_prov,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_value('hospital_city', hos_city,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_value('hospital_county', hospital_county,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_phone',
                    'ul[1]/li/b[contains(text(),"联系电话")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_intro', '')
                loader.add_xpath(
                    'hospital_postcode',
                    'ul[1]/li/b[contains(text(),"邮政编码")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_email',
                    'ul[1]/li/b[contains(text(),"电子邮箱")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_website',
                    'ul[1]/li/b[contains(text(),"医院网站")]/ancestor::li[1]/'
                    'a[not(contains(@href,"http://www.a-hospital.com"))]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_fax',
                    'ul[1]/li/b[contains(text(),"传真号码")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'operation_mode',
                    'ul[1]/li/b[contains(text(),"经营方式")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_url',
                                 urljoin(self.host, hospital_url))
                loader.add_value('dataSource_from', '医学百科')
                loader.add_value('update_time', now_day())
                hospital_info_item = loader.load_item()
                yield hospital_info_item

                # Departments: the "重点科室" entry is split on '、'.
                dept_info = each_special_hospital.xpath(
                    'ul[1]/li/b[contains(text(),"重点科室")]/ancestor::li[1]')
                all_dept_info = match_special(
                    dept_info.xpath('string(.)').extract_first(''))
                if all_dept_info:
                    for each_dept in all_dept_info.split('、'):
                        dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                    response=response)
                        dept_loader.add_value('dept_name', each_dept,
                                              MapCompose(custom_remove_tags))
                        dept_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags, match_special2))
                        dept_loader.add_value('update_time', now_day())
                        dept_item = dept_loader.load_item()
                        yield dept_item

                # Aliases: text in parentheses inside the entry title,
                # split on '、'.
                hospital_name2 = each_special_hospital.xpath(
                    'b/text()').extract_first('')
                if hospital_name2:
                    alias_name = alias_pattern.search(
                        custom_remove_tags(hospital_name2))
                    if alias_name:
                        for each_alias_name in alias_name.group(1).split('、'):
                            alias_loader = CommonLoader2(
                                item=HospitalAliasItem(), response=response)
                            alias_loader.add_value(
                                'hospital_name', hospital_name,
                                MapCompose(custom_remove_tags, match_special2))
                            alias_loader.add_value('hospital_alias_name',
                                                   each_alias_name)
                            alias_loader.add_value('update_time', now_day())
                            alias_item = alias_loader.load_item()
                            yield alias_item
        except Exception as e:
            self.logger.error('抓取[{}]医院列表的时候出错了,原因是:{}'.format(
                hospital_city, repr(e)))
예제 #26
0
    def parse_hospital_detail(self, response):
        """Parse a hospital detail page.

        Yields one ``HospitalInfoTestItem``, then one ``HospitalDepItem`` per
        key department listed on the page, then one ``HospitalAliasItem`` per
        alias found in parentheses after the hospital name in the first
        paragraph.  ``response.meta['hospital_name']`` is used for the log
        line only; the stored name is read from the page itself.

        Errors raised while extracting aliases are logged and swallowed.
        """
        hospital_name = response.meta.get('hospital_name', '默认医院')
        self.logger.info('>>>>>>正在抓取[{}]详细信息……>>>>>>'.format(hospital_name))

        # Province / city / county come from the breadcrumb links.
        municipality = ['北京市', '上海市', '重庆市', '天津市']
        pro_or_city = response.xpath(
            '//table[@class="nav"]/tr/'
            'td/a[3]/text()').extract_first('').replace('医院列表', '')
        if pro_or_city:
            if pro_or_city.strip() in municipality:
                # Municipality: third link is the city, fourth the district.
                hos_prov = ''
                hos_city = pro_or_city
                hos_county = response.xpath(
                    '//table[@class="nav"]/tr/'
                    'td/a[4]/text()').extract_first('').replace(hos_city, '')
            else:
                # Otherwise: third link is the province, fourth the city,
                # fifth the county or district.
                hos_prov = pro_or_city
                hos_city = response.xpath(
                    '//table[@class="nav"]/tr/td/'
                    'a[4]/text()').extract_first('').replace('医院列表',
                                                             '').replace(
                                                                 hos_prov, '')
                hos_county = response.xpath(
                    '//table[@class="nav"]/tr/'
                    'td/a[5]/text()').extract_first('').replace(hos_city, '')
        else:
            hos_prov = hos_city = hos_county = None

        # Hospital overview: paragraphs following the "概况" heading, cut at
        # the first paragraph whose nearest preceding <h2> is another section.
        intro_paragraphs = response.xpath(
            '//h2/span[contains(text(),"概况")]/ancestor::h2[1]/following::p')
        for idx, each_hi in enumerate(intro_paragraphs):
            next_tag = each_hi.xpath(
                'preceding::h2[1]/span[not(contains(text(),"概况"))]')
            if next_tag:
                hospital_intro = intro_paragraphs[:idx].extract()
                break
        else:
            hospital_intro = intro_paragraphs.extract()

        # Hospital item; the breadcrumb name is preferred over the <title>.
        hospital_name1 = response.xpath(
            '//table[@class="nav"]/tr/td/strong/text()').extract_first('')
        hospital_name2 = response.xpath('//title/text()').extract_first('')
        hospital_name = hospital_name1 if hospital_name1 else hospital_name2
        loader = CommonLoader2(item=HospitalInfoTestItem(), response=response)
        loader.add_value('hospital_name', hospital_name,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath(
            'hospital_level', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"医院等级")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('hospital_type', '')
        loader.add_xpath(
            'hospital_category', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"医院类型")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_xpath(
            'hospital_addr', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"医院地址")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('hospital_pro', hos_prov,
                         MapCompose(custom_remove_tags, match_special))
        loader.add_value('hospital_city', hos_city,
                         MapCompose(custom_remove_tags, match_special))
        loader.add_value('hospital_county', hos_county,
                         MapCompose(custom_remove_tags, match_special))
        loader.add_xpath(
            'hospital_phone', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"联系电话")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('hospital_intro', hospital_intro,
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_xpath(
            'hospital_postcode', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"邮政编码")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_xpath(
            'hospital_email', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"电子邮箱")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_xpath(
            'hospital_website', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"医院网站")]/ancestor::li[1]/'
            'a[not(contains(@href,"http://www.a-hospital.com"))]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_xpath(
            'hospital_fax', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"传真号码")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_xpath(
            'operation_mode', '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"经营方式")]/ancestor::li[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('hospital_url', response.url)
        loader.add_value('dataSource_from', '医学百科')
        loader.add_value('update_time', now_day())
        hospital_info_item = loader.load_item()
        yield hospital_info_item

        # Departments: the "重点科室" entry is split on '、'.
        dept_info = response.xpath(
            '//div[@id="bodyContent"]/ul[1]/li/'
            'b[contains(text(),"重点科室")]/ancestor::li[1]')
        all_dept_info = match_special(
            dept_info.xpath('string(.)').extract_first(''))
        if all_dept_info:
            for each_dept in all_dept_info.split('、'):
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', each_dept,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value(
                    'hospital_name', hospital_name,
                    MapCompose(custom_remove_tags, match_special2))
                dept_loader.add_value('update_time', now_day())
                dept_item = dept_loader.load_item()
                yield dept_item

        # Aliases: the bold text of the first paragraph is expected to be
        # the breadcrumb name followed by the aliases in parentheses.
        hospital_name = response.xpath(
            '//div[@id="bodyContent"]/p[1]/b/text()').extract_first('')
        hospital_name2 = response.xpath(
            '//table[@class="nav"]/tr/td/strong/text()').extract_first('')
        # Accept ASCII and full-width (U+FF08 / U+FF09) parentheses.
        if hospital_name and ('(' in hospital_name
                              or '\uff08' in hospital_name):
            try:
                # The name is escaped so characters such as '(' or '.' in it
                # are matched literally; the parentheses around the alias
                # list are matched outside the group so they are not part of
                # the captured aliases.
                alias_name = re.search(
                    r'^{}[(\uff08](.*?)[)\uff09]$'.format(
                        re.escape(hospital_name2)), hospital_name)
                if alias_name:
                    for each_alias_name in alias_name.group(1).split('、'):
                        alias_loader = CommonLoader2(item=HospitalAliasItem(),
                                                     response=response)
                        # NOTE(review): hospital_name here still includes the
                        # parenthesised aliases; confirm match_special2
                        # strips them, or use hospital_name2 instead.
                        alias_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags, match_special2))
                        alias_loader.add_value('hospital_alias_name',
                                               each_alias_name)
                        alias_loader.add_value('update_time', now_day())
                        alias_item = alias_loader.load_item()
                        yield alias_item
            except Exception as e:
                self.logger.error('抓取[{}]别名的时候出错了,原因是:{}'.format(
                    hospital_name, repr(e)))
예제 #27
0
 def parse_doctor_website(self, response):
     """Parse a doctor's personal home page.

     Yields, in this order:

     * a ``DoctorInfoItem`` loaded from the page;
     * a ``Request`` for the hospital introduction page (callback
       ``self.parse_hospital_detail``), only the first time the hospital
       id found in the hospital link is seen (tracked in
       ``self.crawled_ids``);
     * a ``HospitalDepItem``, only the first time the department id found
       in the department link is seen (tracked in ``self.crawled_dept``).

     :param response: response for the doctor's page.
     """
     self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')
     # Doctor details.
     loader = YiHuLoader(item=DoctorInfoItem(), response=response)
     loader.add_xpath('doctor_name', '//span[@class="c-f22 c-333"]/text()')
     loader.add_xpath('dept_name',
                      '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
     loader.add_xpath('hospital_name',
                      '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
     loader.add_xpath('doctor_level',
                      '//div[@class="doctor-info"]/dl/dd[1]/text()')
     loader.add_xpath(
         'doctor_intro',
         '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()')
     loader.add_xpath('doctor_goodAt',
                      '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()')
     loader.add_value('update_time', now_day())
     doctor_item = loader.load_item()
     yield doctor_item

     # Links to the doctor's hospital and department pages.
     hos_link = response.xpath(
         '//div[@class="doctor-info"]/dl/dd[2]/a[1]/@href').extract_first(
             '')
     dept_link = response.xpath(
         '//div[@class="doctor-info"]/dl/dd[2]/a[2]/@href').extract_first(
             '')

     # Hospital details: follow the introduction page once per hospital id.
     if hos_link:
         # The dot before "shtml" is escaped so it only matches a literal
         # extension separator.
         hos_id = re.search(r'/sc/(.*?)\.shtml', hos_link)
         if hos_id and hos_id.group(1) not in self.crawled_ids:
             self.crawled_ids.add(hos_id.group(1))
             hos_intro_link = re.sub(r'/sc/', '/detail/', hos_link)
             hos_con_link = re.sub(r'/sc/', '/contact/', hos_link)
             hos_loader = YiHuLoader(item=HospitalInfoItem(),
                                     response=response)
             hos_loader.add_xpath(
                 'hospital_name',
                 '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
             # Send the Referer as a real HTTP header.  A per-request copy
             # is used so the shared ``self.headers`` is not mutated.
             request_headers = dict(self.headers or {})
             request_headers['Referer'] = response.url
             hospital_detail_request = Request(
                 hos_intro_link,
                 headers=request_headers,
                 callback=self.parse_hospital_detail,
                 meta={
                     'loader': hos_loader,
                     'contact_hos_link': hos_con_link,
                     # Kept for backward compatibility with any callback
                     # that reads the referer from ``response.meta``.
                     'Referer': response.url
                 })
             yield hospital_detail_request

     # Department record: emitted once per department id.
     if dept_link:
         dept_link_id = re.search(r'/arrange/(.*?)\.shtml', dept_link)
         if dept_link_id and dept_link_id.group(1) not in self.crawled_dept:
             self.crawled_dept.add(dept_link_id.group(1))
             dept_loader = YiHuLoader(item=HospitalDepItem(),
                                      response=response)
             dept_loader.add_xpath(
                 'dept_name',
                 '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
             dept_loader.add_xpath(
                 'hospital_name',
                 '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
             dept_loader.add_value('update_time', now_day())
             dept_item = dept_loader.load_item()
             yield dept_item
예제 #28
0
    def parse_hospital_info(self, response):
        """Parse a hospital page.

        Builds a ``HospitalInfoItem`` loader from the page and, when a link
        whose text contains "医院介绍" is present, yields a ``Request`` to it
        (callback ``self.parse_hospital_detail_info``) carrying the loader in
        ``meta['loader']``.  Afterwards yields one ``HospitalDepItem`` per
        department link listed on the page.

        Any exception raised while parsing is logged through
        ``self.logger.error`` and not propagated.

        :param response: response for the hospital page.
        """
        self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
        try:
            # The header paragraph is flattened to text and split on ':';
            # field 1 is read as the level and field 2 as the district.
            header_html = ''.join(
                response.xpath('//p[@class="yygh_box_top_p2"]').extract())
            header_text = custom_remove_tags(remove_tags(header_html))
            header_fields = header_text.split(':')
            hospital_level = header_fields[1].replace('区域', '')
            hospital_county = header_fields[2].replace('分类', '')

            # Split the level text into a level part and a category part.
            level_part = category_part = None
            level_match = None
            if hospital_level:
                level_match = re.search(r'(.*等|.*级|.*合格|.*甲)(.*?)$',
                                        hospital_level)
            if level_match:
                level_part, category_part = level_match.group(1, 2)
                if category_part:
                    trimmed = re.sub(r'合格|医院', '', category_part)
                    if trimmed:
                        # Re-append the suffix after stripping it above.
                        category_part = '{0}{1}'.format(trimmed, '医院')

            # The hospital name XPath is shared by the hospital loader and
            # every department loader below.
            name_xpath = '//p[@class="yygh_box_top_p"]/strong/text()'
            strip_tags = MapCompose(custom_remove_tags)

            # Hospital record.
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name', name_xpath,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level', level_part, strip_tags)
            loader.add_value('hospital_category', category_part)
            loader.add_xpath(
                'hospital_addr',
                '//span[@class="yygh_box_con_dl_span1"]'
                '/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '')
            loader.add_value('hospital_city', '北京市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_phone',
                '//span[@class="yygh_box_con_dl_span3"]'
                '/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())

            # Hospital introduction page: the loader travels in the request
            # meta to the next callback.
            intro_href = response.xpath(
                '//a[contains(text(),"医院介绍")]/@href').extract_first('')
            if intro_href:
                intro_url = urljoin(self.host, intro_href)
                self.headers['Referer'] = response.url
                yield Request(intro_url,
                              headers=self.headers,
                              callback=self.parse_hospital_detail_info,
                              meta={'loader': loader})

            # Departments: one item per link inside each department box.
            for dept_box in response.xpath('//div[@class="kfyuks_yyksbox"]'):
                dept_type = dept_box.xpath('div[1]/text()').extract_first('')
                dept_names = dept_box.xpath(
                    'div[2]/div/ul/li/a/text()').extract()
                for dept_name in dept_names:
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_xpath('hospital_name', name_xpath,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    yield dept_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))