Example #1
    def get_page(self, type, tab):
        """Fetch a page. For simplicity, every parameter that might be needed
        is appended to the URL; redundant parameters do no harm.
        Args:
            tab: value appended to the URL when requesting the page.
                1: industrial & commercial publicity info,
                2: enterprise publicity info,
                3: other departments' publicity info
        """
        url = CrawlerUtils.add_params_to_url(
            self.urls[type], {
                'entId': self.ent_id,
                'ent_id': self.ent_id,
                'entid': self.ent_id,
                'credit_ticket': self.credit_ticket,
                'entNo': self.ent_number,
                'entName': '',
                'timeStamp': self.generate_time_stamp(),
                'clear': 'true',
                'str': tab
            })
        logging.info('get %s, url:\n%s\n' % (type, url))
        resp = self.crawl_page_by_url(url)
        if resp.status_code != 200:
            logging.error('get page failed by url %s' % url)
            return None
        page = resp.content
        # Pause briefly between requests to avoid hammering the server.
        time.sleep(random.uniform(0.2, 1))
        return page
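
Both Example #1 and Example #2 rely on the `CrawlerUtils.add_params_to_url` helper, whose implementation is not shown. A minimal sketch of the assumed behavior, URL-encoding the dict and appending it as a query string; the real helper may differ in encoding or parameter order:

    from urllib.parse import urlencode

    def add_params_to_url(url, params):
        # Append params as a query string, using '&' if the URL already has one.
        separator = '&' if '?' in url else '?'
        return url + separator + urlencode(params)

    # add_params_to_url('http://host/page.action', {'entId': '123'})
    # -> 'http://host/page.action?entId=123'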
Example #2
    def get_page(self, type, tab):
        """Fetch a page. For simplicity, every parameter that might be needed
        is appended to the URL; redundant parameters do no harm.
        Args:
            tab: value appended to the URL when requesting the page.
                1: industrial & commercial publicity info,
                2: enterprise publicity info,
                3: other departments' publicity info
        """
        url = CrawlerUtils.add_params_to_url(
            self.urls[type], {
                'entId': self.ent_id,
                'ent_id': self.ent_id,
                'entid': self.ent_id,
                'credit_ticket': self.credit_ticket,
                'entNo': self.ent_number,
                'entName': '',
                'timeStamp': self.generate_time_stamp(),
                'clear': 'true',
                'str': tab
            })
        settings.logger.info('get %s, url:\n%s\n' % (type, url))
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            settings.logger.warning('get page failed by url %s' % url)
            return None
        page = resp.content
        # Pause briefly between requests to avoid hammering the server.
        time.sleep(random.uniform(0.2, 1))
        if settings.save_html:
            CrawlerUtils.save_page_to_file(
                self.html_restore_path + type + '.html', page)
        return page
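
Example #2 additionally persists each fetched page when `settings.save_html` is set. A minimal sketch of the assumed `CrawlerUtils.save_page_to_file` helper; since `resp.content` is bytes, the file is opened in binary mode:

    import os

    def save_page_to_file(path, page):
        # Make sure the restore directory exists, then dump the raw bytes.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(page)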
Example #3
    def get_detail_link(self, bs4_tag, page):
        """Get the detail-page URL, extracted either from the bs4 tag or
        from the page source.
        Args:
            bs4_tag: a BeautifulSoup tag
            page: the page source text
        """
        detail_op = bs4_tag.get('onclick')
        pat_view_info = re.compile(r'viewInfo\(\'([\w]+)\'\)')
        pat_show_dialog = re.compile(r'showDialog\(\'([^\'\n]+)\'')
        next_url = ''
        if detail_op and pat_view_info.search(detail_op):
            m = pat_view_info.search(detail_op)
            val = m.group(1)
            # detail link type 1, e.g. ind_comm_pub info -> registration info -> shareholders info:
            # the page builds the URL in JS, so recover the path and query key from it
            pat = re.compile(
                r'var +url += +rootPath +\+ +\"(.+\?)([\w]+)=\"\+[\w]+\+\"')
            m1 = pat.search(page)
            if m1:
                addition_url = m1.group(1)
                query_key = m1.group(2)

                next_url = CrawlerUtils.add_params_to_url(
                    self.crawler.urls['host'] + addition_url, {
                        query_key: val,
                        'entId': self.crawler.ent_id,
                        'ent_id': self.crawler.ent_id,
                        'entid': self.crawler.ent_id,
                        'credit_ticket': self.crawler.credit_ticket,
                        'entNo': self.crawler.ent_number
                    })
        elif detail_op and pat_show_dialog.search(detail_op):
            # detail link type 2, e.g. ind_comm_pub info -> registration info -> modify info:
            # showDialog() already carries the relative URL
            m = pat_show_dialog.search(detail_op)
            val = m.group(1)
            next_url = self.crawler.urls['host'] + val
        elif 'href' in bs4_tag.attrs:
            # detail link type 3, e.g. ent pub info -> enterprise annual report:
            # a plain href relative to the host
            next_url = self.crawler.urls['host'] + bs4_tag['href']

        return next_url
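
The three link types handled in Example #3 can be exercised in isolation. A self-contained demo of the same regular expressions against made-up onclick/page snippets (the sample strings are illustrative, not taken from a real site):

    import re

    pat_view_info = re.compile(r'viewInfo\(\'([\w]+)\'\)')
    pat_show_dialog = re.compile(r'showDialog\(\'([^\'\n]+)\'')
    pat_url = re.compile(r'var +url += +rootPath +\+ +\"(.+\?)([\w]+)=\"\+[\w]+\+\"')

    # Type 1: viewInfo() passes an id; the page's JS template supplies path and query key.
    print(pat_view_info.search("viewInfo('10001')").group(1))  # 10001
    m = pat_url.search('var url = rootPath + "/invDetail.action?id="+invId+"";')
    print(m.group(1), m.group(2))  # /invDetail.action? id

    # Type 2: showDialog() carries the relative URL directly.
    print(pat_show_dialog.search("showDialog('/queryDetail.html', 600)").group(1))  # /queryDetail.html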