def get_page(self, type, tab):
    """Fetch one page of the given type.

    For simplicity, every parameter that could possibly be needed is
    appended to the URL; extra query parameters do not affect the server.

    Args:
        type: key into ``self.urls`` selecting which page to request.
        tab: value appended to the URL. 1 = industrial & commercial
            publicity info, 2 = enterprise publicity info,
            3 = other-department publicity info.

    Returns:
        The raw response body, or None when the request fails
        (non-200 status).
    """
    url = CrawlerUtils.add_params_to_url(
        self.urls[type], {
            'entId': self.ent_id,
            'ent_id': self.ent_id,
            'entid': self.ent_id,
            'credit_ticket': self.credit_ticket,
            'entNo': self.ent_number,
            'entName': '',
            'timeStamp': self.generate_time_stamp(),
            'clear': 'true',
            'str': tab
        })
    # Progress message, not a failure: log at INFO level.
    # (The original logged this with logging.error, which pollutes the
    # error stream; only the failed-request case below is an error.)
    logging.info('get %s, url:\n%s\n' % (type, url))
    resp = self.crawl_page_by_url(url)
    if resp.status_code != 200:
        logging.error('get page failed by url %s' % url)
        return
    page = resp.content
    # Random pause to avoid hammering the target site.
    time.sleep(random.uniform(0.2, 1))
    return page
def get_page(self, type, tab):
    """Fetch one page of the given type through the shared session.

    For simplicity, every parameter that could possibly be needed is
    appended to the URL; extra query parameters do not affect the server.

    Args:
        type: key into ``self.urls`` selecting which page to request;
            also used as the filename stem when saving HTML.
        tab: value appended to the URL. 1 = industrial & commercial
            publicity info, 2 = enterprise publicity info,
            3 = other-department publicity info.

    Returns:
        The raw response body, or None when the request fails
        (non-200 status).
    """
    url = CrawlerUtils.add_params_to_url(
        self.urls[type], {
            'entId': self.ent_id,
            'ent_id': self.ent_id,
            'entid': self.ent_id,
            'credit_ticket': self.credit_ticket,
            'entNo': self.ent_number,
            'entName': '',
            'timeStamp': self.generate_time_stamp(),
            'clear': 'true',
            'str': tab
        })
    settings.logger.info('get %s, url:\n%s\n' % (type, url))
    resp = self.reqst.get(url)
    if resp.status_code != 200:
        # Logger.warn is a deprecated alias of Logger.warning in the
        # stdlib logging module -- use the canonical name.
        settings.logger.warning('get page failed by url %s' % url)
        return
    page = resp.content
    # Random pause to avoid hammering the target site.
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.html_restore_path + type + '.html', page)
    return page
def get_detail_link(self, bs4_tag, page):
    """Derive the detail-page URL from a bs4 tag and/or the page source.

    Three link shapes are recognized, tried in order:
      1. ``viewInfo('<id>')`` onclick handler -- the URL template is
         scraped out of the page's JavaScript and the captured id plus
         the crawler's identity parameters are appended
         (e.g. ind_comm_pub registration -> shareholders info);
      2. ``showDialog('<path>'...)`` onclick handler -- the captured
         path is joined to the host
         (e.g. registration info -> modify info);
      3. a plain ``href`` attribute joined to the host
         (e.g. ent pub info -> enterprise annual report).

    Args:
        bs4_tag: BeautifulSoup tag carrying the link.
        page: raw page text, searched for the JS url template in case 1.

    Returns:
        The detail URL as a string, or '' when none can be derived.
    """
    onclick = bs4_tag.get('onclick')
    view_info_re = re.compile(r'viewInfo\(\'([\w]+)\'\)')
    show_dialog_re = re.compile(r'showDialog\(\'([^\'\n]+)\'')

    if onclick:
        view_match = view_info_re.search(onclick)
        if view_match:
            # Case 1: recover the "var url = rootPath + "...?key=" + id"
            # template from the page's inline JavaScript.
            template_re = re.compile(
                r'var +url += +rootPath +\+ +\"(.+\?)([\w]+)=\"\+[\w]+\+\"')
            tpl_match = template_re.search(page)
            if not tpl_match:
                return ''
            base_path = tpl_match.group(1)
            query_key = tpl_match.group(2)
            return CrawlerUtils.add_params_to_url(
                self.crawler.urls['host'] + base_path, {
                    query_key: view_match.group(1),
                    'entId': self.crawler.ent_id,
                    'ent_id': self.crawler.ent_id,
                    'entid': self.crawler.ent_id,
                    'credit_ticket': self.crawler.credit_ticket,
                    'entNo': self.crawler.ent_number
                })
        dialog_match = show_dialog_re.search(onclick)
        if dialog_match:
            # Case 2: the dialog path is already a complete relative URL.
            return self.crawler.urls['host'] + dialog_match.group(1)

    if 'href' in bs4_tag.attrs:
        # Case 3: ordinary anchor.
        return self.crawler.urls['host'] + bs4_tag['href']
    return ''