Example #1
 def get_page(self, type, tab):
     """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响
     Args:
         tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息
     """
     url = CrawlerUtils.add_params_to_url(
         self.urls[type], {
             'entId': self.ent_id,
             'ent_id': self.ent_id,
             'entid': self.ent_id,
             'credit_ticket': self.credit_ticket,
             'entNo': self.ent_number,
             'entName': '',
             'timeStamp': self.generate_time_stamp(),
             'clear': 'true',
             'str': tab
         })
     settings.logger.info('get %s, url:\n%s\n' % (type, url))
     resp = self.reqst.get(url)
     if resp.status_code != 200:
         settings.logger.warn('get page failed by url %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))  # polite random delay between requests
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + type + '.html', page)
     return page
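
The snippet leans on CrawlerUtils.add_params_to_url to build the request URL. A minimal sketch of what such a helper could look like, assuming it simply URL-encodes the dict and appends it to the base URL (hypothetical implementation, not the project's actual code):

 from urllib.parse import urlencode  # Python 3; on Python 2 this is urllib.urlencode

 def add_params_to_url(url, params):
     """Append query parameters to a URL, keeping any existing query string."""
     # Hypothetical helper; the real CrawlerUtils may differ.
     separator = '&' if '?' in url else '?'
     return url + separator + urlencode(params)

 # add_params_to_url('http://host/entPub/entPubAction', {'entId': '123', 'str': '1'})
 # -> 'http://host/entPub/entPubAction?entId=123&str=1'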
Example #2
 def crawl_page_by_url(self, url):
     """根据url直接爬取页面
     """
     resp = self.reqst.get(url)
     if resp.status_code != 200:
         settings.logger.error('crawl page by url failed! url = %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + 'detail.html', page)
     return page
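
Both crawl_page_by_url variants, like get_page above, follow the same pattern: issue the GET, bail out on a non-200 status, throttle with a random delay, and optionally snapshot the HTML to disk. A standalone sketch of that pattern, assuming a plain requests session and an explicit save path (names are illustrative, not taken from the project):

 import random
 import time

 import requests

 def fetch_page(session, url, save_path=None):
     """GET a page, throttle politely, and optionally snapshot the raw HTML to disk."""
     resp = session.get(url)
     if resp.status_code != 200:
         return None                      # let the caller decide how to handle the failure
     time.sleep(random.uniform(0.2, 1))   # polite random delay between requests
     if save_path:
         with open(save_path, 'wb') as f:
             f.write(resp.content)        # resp.content is bytes, so write in binary mode
     return resp.content

 # session = requests.Session()
 # page = fetch_page(session, 'http://example.com/detail', save_path='detail.html')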
Example #3
 def crawl_page_by_url(self, url):
     """通过url直接获取页面
     """
     resp = self.reqst.get(url, verify=False)  # TLS certificate verification disabled
     if resp.status_code != 200:
         settings.logger.error('failed to crawl page by url %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + 'detail.html', page)
     return page
Example #4
    def parse_ent_pub_annual_report_page(self, base_page, page_type):
        """解析企业年报页面,该页面需要单独处理
        """
        def get_year_of_annual_report(page):
            soup = BeautifulSoup(page, 'html.parser')
            t = soup.body.find('table')
            return CrawlerUtils.get_raw_text_in_bstag(t.find('tr'))

        if settings.save_html:
            CrawlerUtils.save_page_to_file(
                self.crawler.html_restore_path +
                'annual_report_base_info.html', base_page)

        page_data = {}
        soup = BeautifulSoup(base_page, 'html.parser')
        if soup.body.find('table'):
            base_table = soup.body.find('table')
            table_name = u'企业基本信息'  # "basic enterprise information"
            page_data[table_name] = self.parse_table(base_table, table_name,
                                                     base_page)

            if len(soup.find_all('table')) > 1:
                ent_property_table = soup.body.find_all('table')[1]
                table_name = self.get_table_title(ent_property_table)
                page_data[table_name] = self.parse_table(
                    ent_property_table, table_name, base_page)

        year = get_year_of_annual_report(base_page)
        report_items = {
            'wzFrame': 'website_info',
            'gdczFrame': 'shareholder_contribute_info',
            'dwdbFrame': 'external_guarantee_info',
            'xgFrame': 'modify_record_info'
        }
        for frame_id, default_table_name in report_items.items():
            pat = re.compile(
                r'<iframe +id="%s" +src=\'(/entPub/entPubAction!.+)\'' % frame_id)
            m = pat.search(base_page)
            if m:
                next_url = self.crawler.urls['host'] + m.group(1)
                settings.logger.info('get annual report, url:\n%s\n' %
                                     next_url)
                page = self.crawler.crawl_page_by_url(next_url)
                pages = self.crawler.get_all_pages_of_a_section(
                    page, page_type, next_url)

                table_name = default_table_name
                try:
                    soup = BeautifulSoup(page, 'html.parser')
                    table_name = self.get_table_title(soup.body.table)
                except Exception as e:
                    settings.logger.error(
                        'fail to get table name with exception %s' % e)
                    raise e
                try:
                    if len(pages) == 1:
                        table_data = self.parse_page(page, table_name)
                    else:
                        table_data = []
                        for p in pages:
                            table_data += self.parse_page(p, table_name)
                except Exception as e:
                    settings.logger.error(
                        'fail to parse page with exception %s' % e)
                    raise e
                finally:
                    page_data[table_name] = table_data
        return page_data
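
The loop above locates each annual-report iframe (wzFrame, gdczFrame, dwdbFrame, xgFrame) with a regular expression over the raw HTML. The same lookup can be done with BeautifulSoup, which is less sensitive to attribute order and quoting; a minimal sketch under the assumption that the iframes carry those same ids (illustrative alternative, not the author's code):

 from bs4 import BeautifulSoup

 def get_report_iframe_urls(base_page, report_items, host):
     """Map each annual-report section name to the absolute URL of its iframe."""
     soup = BeautifulSoup(base_page, 'html.parser')
     urls = {}
     for frame_id, section_name in report_items.items():
         iframe = soup.find('iframe', id=frame_id)
         if iframe and iframe.get('src'):
             urls[section_name] = host + iframe['src']
     return urls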