Example #1
 def get_page(self, type, tab):
     """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响
     Args:
         tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息
     """
     url = CrawlerUtils.add_params_to_url(
         self.urls[type], {
             'entId': self.ent_id,
             'ent_id': self.ent_id,
             'entid': self.ent_id,
             'credit_ticket': self.credit_ticket,
             'entNo': self.ent_number,
             'entName': '',
             'timeStamp': self.generate_time_stamp(),
             'clear': 'true',
             'str': tab
         })
     settings.logger.info('get %s, url:\n%s\n' % (type, url))
     resp = self.reqst.get(url)
     if resp.status_code != 200:
         settings.logger.warn('get page failed by url %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + type + '.html', page)
     return page
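CrawlerUtils.add_params_to_url is not shown in this example; a minimal sketch of what such a helper could look like, assuming it simply urlencodes the dict and appends it as a query string:

import urllib

def add_params_to_url(url, params):
    # Hypothetical stand-in for CrawlerUtils.add_params_to_url: append the
    # urlencoded params, using '?' or '&' depending on whether the url
    # already carries a query string.
    if not params:
        return url
    sep = '&' if '?' in url else '?'
    return url + sep + urllib.urlencode(params)

# add_params_to_url('http://example.com/page.jspx', {'entNo': '123', 'str': '1'})
# -> 'http://example.com/page.jspx?entNo=123&str=1' (parameter order may vary)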
Example #2
 def test_parse_shareholder_detail_page(self):
     with open('./enterprise_crawler/zongju/shareholder_detail.html') as f:
         page = f.read()
         result = self.parser.parse_ind_comm_pub_shareholder_detail_page(
             page)
         CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                        {self.crawler.ent_number: result})
Example #3
def get_pdfs_from_data_json(abs_pdf_restore_dir, json_file_name):
    f = open(json_file_name, 'r')
    for line in f.readlines():
        list_dict = json.loads(line)['list']
        for i, item in enumerate(list_dict):
            # print i,'---------'
            # print item
            pdf_url = item['pdf_url']
            count = 0
            resp = None
            while count < 10:
                resp = reqst.get(pdf_url)
                if resp.status_code == 200 and resp.content:
                    with open('%s/%s' % (abs_pdf_restore_dir,
                                         pdf_url.rsplit('/')[-1]),
                              'wb') as pdf_file:
                        pdf_file.write(resp.content)
                    break
                else:
                    count += 1
                    if count == 10:
                        print '%s, get_error_pdf' % pdf_url
                    continue
            if count != 10:
                list_dict[i]['abs_path'] = '%s/%s' % (abs_pdf_restore_dir,
                                                      pdf_url.rsplit('/')[-1])
        # print list_dict
        CrawlerUtils.json_dump_to_file(
            '%s%s%s' % (json_file_name[:-5], '_insert', json_file_name[-5:]),
            {'list': list_dict})
    f.close()
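A hypothetical invocation of the function above; the augmented records are written next to the input file, so '20160101.json' would produce '20160101_insert.json' (the paths below are made up for illustration):

# Download every PDF referenced in the data file into the given directory and
# record each local path under 'abs_path' in the *_insert.json output.
get_pdfs_from_data_json('/data/pdfs/2016/01/01', '/data/json/2016/01/01/20160101.json')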
Example #4
    def run(self, findCode):

        self.ent_number = str(findCode)
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.result_json_dict = {}
        self.id = self.get_id_num(findCode)
        print self.id
        resp = self.reqst.get('http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id, timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_one(self.one_dict, soup.find_all('table'))

        resp = self.reqst.get('http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id, timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_two(self.two_dict, soup.find_all('table'))

        resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_three(self.three_dict, soup.find_all('table'))

        resp = self.reqst.get('http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id, timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_four(self.four_dict, soup.find_all('table'))

        CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
Example #5
 def crawl_page_by_url(self, url):
     """根据url直接爬取页面
     """
     resp = self.reqst.get(url)
      if resp.status_code != 200:
         settings.logger.error('crawl page by url failed! url = %s' % url)
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + 'detail.html', page)
     return page
Example #6
 def get_dict_table_items(self, table_tag):
     """获得字典类型的表格的结构
     """
     table_items = {}
     for tr in table_tag.find_all('tr'):
         if tr.find('th') and tr.find('td'):
             ths = tr.find_all('th')
             tds = tr.find_all('td')
             for index, td in enumerate(tds):
                 table_items[CrawlerUtils.get_raw_text_in_bstag(td).strip('{}').replace('PAPERS_', '')]\
                     = CrawlerUtils.get_raw_text_in_bstag(ths[index])
     return table_items
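To illustrate the mapping this produces, a small made-up table fragment (the field name and label are invented; the stripping logic above implies the td cells carry '{PAPERS_*}' placeholders):

from bs4 import BeautifulSoup

html = '<table><tr><th>企业名称</th><td>{PAPERS_entName}</td></tr></table>'
table_tag = BeautifulSoup(html, 'html.parser').find('table')
# Fed to get_dict_table_items, the '{...}' wrapper and the 'PAPERS_' prefix are
# stripped from the td text, giving roughly {'entName': u'企业名称'}.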
Example #7
 def crawl_page_by_url(self, url):
     """通过url直接获取页面
     """
     resp = self.reqst.get(url, verify=False)
     if resp.status_code != 200:
          settings.logger.error('failed to crawl page by url %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     if settings.save_html:
         CrawlerUtils.save_page_to_file(
             self.html_restore_path + 'detail.html', page)
     return page
Example #8
    def parse_ind_comm_pub_reg_modify_table(self, bs_table, table_name, page):
        """解析工商公示信息-注册信息-变更信息表格,由于含有详情页,需要单独处理
        """
        tbody = bs_table.find('tbody')
        if tbody:
            columns = self.get_columns_of_record_table(bs_table, page)
            column_size = len(columns)
            item_array = []

            for tr in tbody.find_all('tr'):
                if tr.find('td'):
                    col_count = 0
                    item = {}
                    for td in tr.find_all('td'):
                        if td.find('a'):
                            #try to retrieve detail link from page
                            next_url = self.get_detail_link(td.find('a'), page)
                            #has detail link
                            if next_url:
                                detail_page = self.crawler.crawl_page_by_url(
                                    next_url).content
                                detail_soup = BeautifulSoup(
                                    detail_page, 'html.parser')
                                before_modify_table = detail_soup.body.find_all(
                                    'table')[1]
                                table_data = self.parse_table(
                                    before_modify_table, 'before_modify',
                                    detail_page)
                                item[columns[col_count][0]] = self.parse_table(
                                    before_modify_table, 'before_modify',
                                    detail_page)
                                col_count += 1
                                after_modify_table = detail_soup.body.find_all(
                                    'table')[2]
                                item[columns[col_count][0]] = self.parse_table(
                                    after_modify_table, 'after_modify',
                                    detail_page)
                            else:
                                item[columns[col_count]
                                     [0]] = CrawlerUtils.get_raw_text_in_bstag(
                                         td)
                        else:
                            item[columns[col_count]
                                 [0]] = CrawlerUtils.get_raw_text_in_bstag(td)

                        col_count += 1
                        if col_count == column_size:
                            item_array.append(item.copy())
                            col_count = 0
            return item_array
Example #9
File: run.py  Project: xiaohui2856/crawl
def main():
    config_logging()

    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)

    cur_date = CrawlerUtils.get_cur_y_m_d()
    set_codecracker()

    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        checker = Checker(dt)
        checker.run()
        return

    if len(sys.argv) < 3:
        print 'usage: run.py [check] [max_crawl_time(minutes) province...] \n\tmax_crawl_time: maximum crawl time in minutes;\n\tprovince: provinces to crawl, separated by spaces ("all" crawls every supported province)'
        return

    try:
        max_crawl_time = int(sys.argv[1])
        settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)
    except ValueError as e:
        settings.logger.error('invalid max_crawl_time, should be an integer')
        os._exit(1)

    timer = threading.Timer(max_crawl_time, force_exit)
    timer.start()

    settings.logger.info(u'about to start crawling, maximum crawl time is %s' % settings.max_crawl_time)
    settings.start_crawl_time = datetime.datetime.now()

    if sys.argv[2] == 'all':
        args = [p for p in sorted(province_crawler.keys())]
        process_pool = MyPool()
        process_pool.map(crawl_province, args)
        process_pool.close()
        settings.logger.info("wait processes....")
        process_pool.join()
    else:
        provinces = sys.argv[2:]
        for p in provinces:
            if not p in province_crawler.keys():
                settings.logger.warn('province %s is not supported currently' %
                                     p)
                continue

            crawl_province(p)
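Given the argument handling above, typical command lines would be (province names beyond those appearing elsewhere in these examples are illustrative):

python run.py 120 all                 # crawl every supported province, max_crawl_time = 120
python run.py 120 hunan chongqing     # crawl only the listed provinces
python run.py check 2016-01-01        # run the Checker for a given date (the date is optional)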
Example #10
def down_yesterday_pdf(yesterday):
    abs_yesterday_json_url = '%s/%s/%s/%s/%s' % (settings.host, settings.ID,
                                                 yesterday[:4], yesterday[4:6],
                                                 yesterday[6:])
    # print 'abs_yesterday_json_url:', abs_yesterday_json_url
    need_down_json_file_name = get_need_down_json_file_name(
        abs_yesterday_json_url)
    if need_down_json_file_name is None:
        print '-error__from_%s____no_data' % abs_yesterday_json_url
        return
    else:
        abs_yesterday_json_url = '%s/%s' % (abs_yesterday_json_url,
                                            need_down_json_file_name)
        # print 'abs_yesterday_json_url:',abs_yesterday_json_url
        abs_json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_dir,
                                                yesterday[:4], yesterday[4:6],
                                                yesterday[6:])
        if not os.path.exists(abs_json_restore_dir):
            CrawlerUtils.make_dir(abs_json_restore_dir)
        abs_pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir,
                                               yesterday[:4], yesterday[4:6],
                                               yesterday[6:])
        if not os.path.exists(abs_pdf_restore_dir):
            CrawlerUtils.make_dir(abs_pdf_restore_dir)
        # print 'abs_json_restore_dir:', abs_json_restore_dir
        get_json_file_OK = get_data_json_file(abs_yesterday_json_url,
                                              abs_json_restore_dir,
                                              need_down_json_file_name)
        if get_json_file_OK is False:
            print '-error--nodata_from_%s%s' % (abs_json_restore_dir,
                                                need_down_json_file_name)
            return
        else:
            abs_yesterday_json_gz_file_name = '%s/%s' % (
                abs_json_restore_dir, need_down_json_file_name)
            abs_yesterday_json_file_name = '%s/%s%s' % (abs_json_restore_dir,
                                                        yesterday, '.json')
            # print 'abs_yesterday_json_file_name:',abs_yesterday_json_file_name
            # print 'abs_yesterday_json_gz_file_name:', abs_yesterday_json_gz_file_name
            g = gzip.GzipFile(mode='rb',
                              fileobj=open(abs_yesterday_json_gz_file_name,
                                           'rb'))
            open(abs_yesterday_json_file_name, 'wb').write(g.read())
            if os.path.isfile(abs_yesterday_json_gz_file_name):
                os.remove(abs_yesterday_json_gz_file_name)
            get_pdfs_from_data_json(abs_pdf_restore_dir,
                                    abs_yesterday_json_file_name)
    pass
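The gzip extraction above never closes its file handles explicitly; an equivalent sketch using context managers (same file names as in the function above) would be:

import gzip

with gzip.open(abs_yesterday_json_gz_file_name, 'rb') as gz_in, \
        open(abs_yesterday_json_file_name, 'wb') as json_out:
    json_out.write(gz_in.read())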
Example #11
 def get_page(self, type, tab):
     """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响
     Args:
         tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息
     """
     url = CrawlerUtils.add_params_to_url(
         self.urls[type], {
             'entId': self.ent_id,
             'ent_id': self.ent_id,
             'entid': self.ent_id,
             'credit_ticket': self.credit_ticket,
             'entNo': self.ent_number,
             'entName': '',
             'timeStamp': self.generate_time_stamp(),
             'clear': 'true',
             'str': tab
         })
      logging.info('get %s, url:\n%s\n' % (type, url))
     resp = self.crawl_page_by_url(url)
     if resp.status_code != 200:
         logging.error('get page failed by url %s' % url)
         return
     page = resp.content
     time.sleep(random.uniform(0.2, 1))
     return page
Example #12
File: run.py  Project: xiaohui2856/crawl
def crawl_province(province):
    settings.logger.info('ready to crawl %s' % province)
    # create the storage path
    json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_path, province,
                                        cur_date[0], cur_date[1])
    if not os.path.exists(json_restore_dir):
        CrawlerUtils.make_dir(json_restore_dir)

    # get the enterprise list
    enterprise_list_path = settings.enterprise_list_path + province + '.txt'

    # json output file name
    json_restore_path = '%s/%s.json' % (json_restore_dir, cur_date[2])

    with open(enterprise_list_path) as f:
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 3:
                continue
            no = fields[2]
            process = multiprocessing.Process(target=crawl_work,
                                              args=(province,
                                                    json_restore_path, no))
            process.daemon = True
            process.start()
            process.join(300)

    settings.logger.info('All %s crawlers work over' % province)

    # compress and save
    if not os.path.exists(json_restore_path):
        settings.logger.warn('json restore path %s does not exist!' %
                             json_restore_path)
        os._exit(1)
        return

    with open(json_restore_path, 'r') as f:
        data = f.read()
        compressed_json_restore_path = json_restore_path + '.gz'
        with gzip.open(compressed_json_restore_path, 'wb') as cf:
            cf.write(data)

    # delete the json file and keep only the .gz file
    os.remove(json_restore_path)
    os._exit(0)
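For completeness, the resulting .gz archive can be read back the same way it was written; a short sketch using the path variable defined above:

import gzip

with gzip.open(compressed_json_restore_path, 'rb') as cf:
    data = cf.read()  # the original json text for that province and day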
Example #13
 def get_list_table_items(self, table_tag):
     """获取记录类型的表格的结构
     """
     table_items = {}
     if len(table_tag.find_all('tr')) != 3:
         print 'abnormal list table skeleton, table_tag = ', table_tag
         return table_items
     ths = table_tag.find_all('tr')[1].find_all('th')
     tds = table_tag.find_all('tr')[2].find_all('td')
     if len(ths) != len(tds):
         print 'abnormal list table skeleton, table_tag = ', table_tag
         return table_items
     for index, td in enumerate(tds):
         table_items[
             CrawlerUtils.get_raw_text_in_bstag(td).strip('{}').replace(
                 'PAPERS_',
                 '')] = CrawlerUtils.get_raw_text_in_bstag(ths[index])
     return table_items
Example #14
    def run(self, ent_name=None):
        if ent_name is None:
            return False
        crawler = NameToIDCrawler(
            './enterprise_crawler/nametoid/name_to_id.json')
        crawler.ent_name = str(ent_name).strip(' ').strip('\n').strip(' ')
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + crawler.ent_name + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        page = crawler.crawl_page_by_get_params(crawler.ent_name)
        crawler.results = crawler.parser.parse_search_page(page=page)
        # multiple threads are used, so file writes must be guarded by the lock
        self.write_file_mutex.acquire()
        CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                       {crawler.ent_name: crawler.results})
        self.write_file_mutex.release()
        return True
Example #15
    def run(self, findCode):

        self.ent_number = str(findCode)
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.uuid = self.get_id_num(findCode)
        print self.uuid
        self.result_json_dict = {}

        tableone = self.get_tables(self.uuid + '&tab=01')
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.uuid + '&tab=02')
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.uuid + '&tab=03')
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.uuid + '&tab=06')
        self.get_json_four(self.four_dict, tablefour)

        CrawlerUtils.json_dump_to_file(
            self.json_restore_path, {self.ent_number: self.result_json_dict})
Example #16
    def parse_list_table_without_sub_list(self, records_tag, table_name, page,
                                          columns):
        """提取没有子列的记录形式的表
        Args:
            records_tag: 表的记录标签,用beautiful soup 从html中提取出来的
            table_name: 表名
            page: 原始的html页面
            columns: 表的表头结构
        Returns:
            item_array: 提取出的表数据,列表形式,列表中的元素为一个python字典
        """
        item_array = []
        for tr in records_tag.find_all('tr'):
            col_count = 0
            item = {}
            for td in tr.find_all('td', recursive=False):
                if td.find('a'):
                    # try to retrieve detail link from page
                    next_url = self.get_detail_link(td.find('a'), page)
                    # has detail link
                    if next_url:
                        detail_page = self.crawler.crawl_page_by_url(next_url)
                        if table_name == 'ent_pub_ent_annual_report':
                            page_data = self.parse_ent_pub_annual_report_page(
                                detail_page)
                            item[u'报送年度'] = CrawlerUtils.get_raw_text_in_bstag(
                                td)
                            item[
                                u'详情'] = page_data  # this may be a detail page data
                        elif table_name == 'ind_comm_pub_reg_shareholder':
                            page_data = self.parse_ind_comm_pub_shareholder_detail_page(
                                detail_page)
                            item[u'详情'] = {u"投资人及出资信息": page_data}
                        else:
                            page_data = self.parse_page(
                                detail_page, table_name + '_detail')
                            item[columns[col_count][
                                0]] = page_data  # this may be a detail page data
                    else:
                        # item[columns[col_count]] = CrawlerUtils.get_raw_text_in_bstag(td)
                        item[columns[col_count][0]] = self.get_column_data(
                            columns[col_count][1], td)
                else:
                    item[columns[col_count][0]] = self.get_column_data(
                        columns[col_count][1], td)
                col_count += 1
            if item:
                item_array.append(item)

        return item_array
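The columns argument is always indexed as columns[i][0] (the column name) and columns[i][1] (a per-column spec handed to get_column_data), so it is presumably a sequence of pairs produced by get_columns_of_record_table; a purely illustrative value:

# Hypothetical header structure; the real pairs come from
# get_columns_of_record_table and their second elements are not shown here.
columns = [
    (u'序号', None),
    (u'报送年度', None),
    (u'详情', None),
]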
Example #17
    def run(self, findCode):

        self.ent_number = str(findCode)

        if not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.result_json_dict = {}
        self.id = self.get_id_num(findCode)
        if self.id is None:
            return json.dumps({self.ent_number: {}})
        # print self.id
        resp = self.reqst.get(
            'http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id,
            timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_one(self.one_dict, soup.find_all('table'))

        resp = self.reqst.get(
            'http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id,
            timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_two(self.two_dict, soup.find_all('table'))

        resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' +
                              self.id,
                              timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_three(self.three_dict, soup.find_all('table'))

        resp = self.reqst.get(
            'http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id,
            timeout=120)
        soup = BeautifulSoup(resp.content)
        self.get_json_four(self.four_dict, soup.find_all('table'))

        return json.dumps({self.ent_number: self.result_json_dict})
Example #18
File: run.py  Project: xiaohui2856/crawl
def get_pdf(save_path, list_dict):

    pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir, save_path[:4],
                                       save_path[4:6], save_path[6:])
    if not os.path.exists(pdf_restore_dir):
        CrawlerUtils.make_dir(pdf_restore_dir)

    for item in list_dict:
        pdf_url = item['pdf_url']
        count = 0
        while count < 10:
            resp = reqst.get(pdf_url)
            if resp.status_code == 200 and resp.content:
                with open(
                        os.path.join(pdf_restore_dir,
                                     pdf_url.rsplit('/')[-1]), 'wb') as f:
                    f.write(resp.content)
                break
            else:
                count += 1
                if count == 10:
                    print '%s,get-error' % pdf_url
                    # settings.logger.info('%s,get-error' % pdf_url)
                continue
Example #19
    def run(self, findCode):
        self.ent_number = str(findCode)
        if not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.id = self.get_id_num(findCode)
        if self.id is None:
            return json.dumps({self.ent_number: {}})
        # print self.id
        self.result_json_dict = {}
        tableone = self.get_tables(self.search_dict['businessPublicity'] +
                                   'id=' + self.id)
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.search_dict['enterprisePublicity'] +
                                   'id=' + self.id)
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.search_dict['otherDepartment'] +
                                     'id=' + self.id)
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.search_dict['justiceAssistance'] +
                                    'id=' + self.id)
        self.get_json_four(self.four_dict, tablefour)

        return json.dumps({self.ent_number: self.result_json_dict})
Example #20
    def run(self, findCode):

        self.ent_number = str(findCode)
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.id = self.get_id_num(findCode)
        print self.id
        self.result_json_dict = {}
        #self.result_json_dict[findCode] = {}
        tableone = self.get_tables(self.mysearchdict['businessPublicity'] + 'id=' + self.id)
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] + 'id=' + self.id)
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.mysearchdict['otherDepartment'] + 'id=' + self.id)
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] + 'id=' + self.id)
        self.get_json_four(self.four_dict, tablefour)

        #self.write_file_mutex.acquire()
        print {self.ent_number: self.result_json_dict}
        CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
Example #21
    def run(self, ent_number=0):
        crawler = ChongqingClawer(
            './enterprise_crawler/chongqing/chongqing.json')

        crawler.ent_number = str(ent_number)
        # each enterprise gets its own html storage directory
        crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)
        crawler.ent_number = str(ent_number)
        page = crawler.crawl_check_page()
        try:
            crawler.crawl_page_jsons(page)
            crawler.parser.parse_jsons()
            crawler.parser.merge_jsons()
        except Exception as e:
            # settings.logger.error('error')
            return False
        # multiple threads are used, so file writes must be guarded by the lock
        self.write_file_mutex.acquire()
        CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                       {crawler.ent_number: crawler.json_dict})
        self.write_file_mutex.release()
        return True
Example #22
    def get_detail_link(self, bs4_tag, page):
        """获取详情链接 url,在bs tag中或者page中提取详情页面
        Args:
            bs4_tag: beautifulsoup 的tag
            page: 页面数据
        """
        detail_op = bs4_tag.get('onclick')
        pat_view_info = re.compile(r'viewInfo\(\'([\w]+)\'\)')
        pat_show_dialog = re.compile(r'showDialog\(\'([^\'\n]+)\'')
        next_url = ''
        if detail_op and pat_view_info.search(detail_op):
            m = pat_view_info.search(detail_op)
            val = m.group(1)
            #detail link type 1, for example : ind_comm_pub info --- registration info -- shareholders info
            pat = re.compile(
                r'var +url += +rootPath +\+ +\"(.+\?)([\w]+)=\"\+[\w]+\+\"')
            m1 = pat.search(page)
            if m1:
                addition_url = m1.group(1)
                query_key = m1.group(2)

                next_url = CrawlerUtils.add_params_to_url(
                    self.crawler.urls['host'] + addition_url, {
                        query_key: val,
                        'entId': self.crawler.ent_id,
                        'ent_id': self.crawler.ent_id,
                        'entid': self.crawler.ent_id,
                        'credit_ticket': self.crawler.credit_ticket,
                        'entNo': self.crawler.ent_number
                    })
        elif detail_op and pat_show_dialog.search(detail_op):
            #detail link type 2, for example : ind_comm_pub_info --- registration info ---- modify info
            m = pat_show_dialog.search(detail_op)
            val = m.group(1)
            next_url = self.crawler.urls['host'] + val
        elif 'href' in bs4_tag.attrs.keys():
            #detail link type 3, for example : ent pub info ----- enterprise annual report
            next_url = self.crawler.urls['host'] + bs4_tag['href']

        return next_url
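The two regular expressions can be exercised on made-up onclick values to show what they capture (the strings below are illustrative, not taken from a real page):

import re

pat_view_info = re.compile(r'viewInfo\(\'([\w]+)\'\)')
pat_show_dialog = re.compile(r'showDialog\(\'([^\'\n]+)\'')

print(pat_view_info.search("viewInfo('1234abcd')").group(1))              # -> 1234abcd
print(pat_show_dialog.search("showDialog('/queryInfo.jspx?id=9')").group(1))  # -> /queryInfo.jspx?id=9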
Example #23
 def parse_annual_report_shareholder_info(self, page):
     """解析年报信息中的投资人信息
     需要单独处理
     """
     shareholder_info = []
     record_columns = [
         u'股东', u'认缴出资额', u'认缴出资时间', u'认缴出资方式', u'实缴出资额', u'实缴出资时间',
         u'实缴出资方式'
     ]
     json_obj = json.loads(page)
     for record in json_obj.get('items'):
         if not record.get('D1'):
             continue
         result = {}
         soup = BeautifulSoup(record.get('D1'), 'html.parser')
         tds = soup.find_all('td')
         if not tds:
             continue
         for index, column in enumerate(record_columns):
             result[column] = CrawlerUtils.get_raw_text_in_bstag(tds[index])
         shareholder_info.append(result)
     return shareholder_info
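The method expects a JSON document whose 'items' entries carry an HTML fragment under 'D1' with one td per entry of record_columns; a minimal made-up payload for illustration:

# Hypothetical input with the seven cells expected by record_columns.
page = ('{"items": [{"D1": "<tr><td>某股东</td><td>100</td><td>2015-01-01</td>'
        '<td>货币</td><td>100</td><td>2015-01-01</td><td>货币</td></tr>"}]}')
# parse_annual_report_shareholder_info(page) would yield a one-element list,
# one dict per record keyed by the column names above.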
Example #24
    def run(self, findCode):

        self.ent_number = str(findCode)
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        nbxh = self.get_id_num(findCode)
        self.nbxh = nbxh

        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '5')
        print result_dict
        self.get_json_one(allths=[
            u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本', u'成立日期', u'住所',
            u'营业期限自', u'营业期限至', u'经营范围', u'登记机关', u'核准日期', u'登记状态'
        ],
                          alltds=result_dict,
                          alltds_keys=[
                              u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb',
                              u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw',
                              u'djjgmc', u'hzrq', u'mclxmc'
                          ],
                          head='ind_comm_pub_reg_basic')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '3')
        print result_dict
        self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                          alltds=result_dict,
                          alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                          head='ind_comm_pub_reg_modify')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '2',
            '3')
        print result_dict
        self.get_json_one(
            allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
            alltds=result_dict,
            alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
            head='ind_comm_pub_reg_shareholder')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '8')
        print result_dict
        self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                          alltds=result_dict,
                          alltds_keys=[u'rownum', u'xm', u'zwmc'],
                          head='ind_comm_pub_arch_key_persons')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '36')
        print result_dict
        self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_arch_liquidation')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '9')
        print result_dict
        self.get_json_one(
            allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
            alltds=result_dict,
            alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
            head='ind_comm_pub_arch_branch')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '25')
        print result_dict
        self.get_json_one(allths=[
            u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态', u'公示日期', u'详情'
        ],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_movable_property_reg')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '4')
        print result_dict
        self.get_json_one(allths=[
            u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人', u'证照/证件号码',
            u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况'
        ],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_equity_ownership_reg')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '1')
        print result_dict
        self.get_json_one(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_administration_sanction')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '33')
        print result_dict
        self.get_json_one(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_business_exception')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '34')
        print result_dict
        self.get_json_one(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_serious_violate_law')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '35')
        print result_dict
        self.get_json_one(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ind_comm_pub_spot_check')

        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '13')
        print result_dict
        self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                          alltds=result_dict,
                          alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                          head='ent_pub_ent_annual_report')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '40')
        print result_dict
        self.get_json_two(allths=[
            u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)', u'认缴出资日期',
            u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)', u'实缴出资日期', u'实缴公示日期'
        ],
                          alltds=result_dict,
                          alltds_keys=[
                              u'tzrmc', u'ljrje', u'ljsje', u'rjczfs',
                              u'rjcze', u'rjczrq', u'rjgsrq', u'sjczfs',
                              u'sjcze', u'sjczrq', u'sjgsrq'
                          ],
                          head='ent_pub_shareholder_capital_contribution')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '23')
        print result_dict
        self.get_json_two(
            allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期', u'公示日期'],
            alltds=result_dict,
            alltds_keys=[],
            head='ent_pub_equity_change')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '20')
        print result_dict
        self.get_json_two(allths=[
            u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关', u'许可内容',
            u'状态', u'公示日期', u'详情'
        ],
                          alltds=result_dict,
                          alltds_keys=[
                              u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                              u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq',
                              u'lsh'
                          ],
                          head='ent_pub_administration_license')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '21')
        print result_dict
        self.get_json_two(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ent_pub_knowledge_property')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '22')
        print result_dict
        self.get_json_two(allths=[],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='ent_pub_shareholder_modify')

        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
            '0', '37')
        print result_dict
        self.get_json_three(allths=[
            u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期', u'许可机关',
            u'许可内容', u'状态', u'详情'
        ],
                            alltds=result_dict,
                            alltds_keys=[
                                u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1',
                                u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt', u'zt'
                            ],
                            head='other_dept_pub_administration_license')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
            '0', '38')
        print result_dict
        self.get_json_two(allths=[
            u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容', u'作出行政处罚决定机关名称',
            u'作出行政处罚决定日期'
        ],
                          alltds=result_dict,
                          alltds_keys=[],
                          head='other_dept_pub_administration_sanction')

        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '49')
        print result_dict
        self.get_json_four(allths=[
            u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态', u'详情'
        ],
                           alltds=result_dict,
                           alltds_keys=[],
                           head='judical_assist_pub_equity_freeze')
        result_dict = self.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0',
            '53')
        print result_dict
        self.get_json_four(
            allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
            alltds=result_dict,
            alltds_keys=[],
            head='judical_assist_pub_shareholder_modify')

        CrawlerUtils.json_dump_to_file(
            self.json_restore_path, {self.ent_number: self.result_json_dict})
Example #25
    def run(self, findCode):

        self.ent_number = findCode

        id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \
           or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
           or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
        print id_args
        if id_args and id_args.download_args.get('uuid'):
            self.result_json_dict = {}
            self.uuid = id_args.download_args['uuid']

            tableone = self.get_tables(self.uuid + '&tab=01')
            self.get_json_one(self.one_dict, tableone)
            tabletwo = self.get_tables(self.uuid + '&tab=02')
            self.get_json_two(self.two_dict, tabletwo)
            tablethree = self.get_tables(self.uuid + '&tab=03')
            self.get_json_three(self.three_dict, tablethree)
            tablefour = self.get_tables(self.uuid + '&tab=06')
            self.get_json_four(self.four_dict, tablefour)

            CrawlerUtils.json_dump_to_file(
                'yunnan.json', {self.ent_number: self.result_json_dict})
            print json.dumps({self.ent_number: self.result_json_dict})
            return [{self.ent_number: self.result_json_dict}]
        else:
            # create the directory
            html_restore_path = self.json_restore_path + '/yunnan/'
            if not os.path.exists(html_restore_path):
                os.makedirs(html_restore_path)

            self.uuid = self.get_id_num(findCode)
            if self.uuid is None:
                return json.dumps({self.ent_number: {}})
            self.result_json_dict_list = []
            for div in BeautifulSoup(self.after_crack_checkcode_page,
                                     'html.parser').find_all(
                                         'div', attrs={'class': 'list-item'}):
                hrefa = div.find_all('a', attrs={'target': '_blank'})[0]
                if hrefa:
                    self.uuid = hrefa['href'].split('&')[0]
                    self.enterprise_name = div.find_all(
                        'div', attrs={'class': 'link'})[0].get_text().strip()
                    self.ent_number = div.find_all(
                        'span')[0].get_text().strip()

                    args =  CrawlerDownloadArgs.objects.filter(register_number=self.ent_number)\
                       or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
                       or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
                    if args:
                        args.delete()
                    args = CrawlerDownloadArgs(
                        province='yunnan',
                        register_number=self.ent_number,
                        unifield_number=self.ent_number,
                        enterprise_name=self.enterprise_name,
                        download_args={'uuid': self.uuid})
                    args.save()
                else:
                    continue
                print self.uuid
                self.result_json_dict = {}

                tableone = self.get_tables(self.uuid + '&tab=01')
                self.get_json_one(self.one_dict, tableone)
                tabletwo = self.get_tables(self.uuid + '&tab=02')
                self.get_json_two(self.two_dict, tabletwo)
                tablethree = self.get_tables(self.uuid + '&tab=03')
                self.get_json_three(self.three_dict, tablethree)
                tablefour = self.get_tables(self.uuid + '&tab=06')
                self.get_json_four(self.four_dict, tablefour)

                CrawlerUtils.json_dump_to_file(
                    'yunnan.json', {self.ent_number: self.result_json_dict})
                print json.dumps({self.ent_number: self.result_json_dict})
                self.result_json_dict_list.append(
                    {self.ent_number: self.result_json_dict})
            return self.result_json_dict_list
Example #26
    def run(self, findCode):

        self.ent_number = str(findCode)
        # each enterprise gets its own html storage directory
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.pripid = self.get_id_num(findCode)
        print findCode, self.pripid
        self.result_json_dict = {}

        data = {
            'method': 'qyInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk1',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        # print BeautifulSoup(resp.content).prettify
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'基本信息', u'股东信息', u'变更信息')

        data = {
            'method': 'baInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk2',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'主要人员信息', u'分支机构信息', u'清算信息')

        data = {
            'method': 'dcdyInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk4',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=120)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'动产抵押登记信息')

        data = {
            'method': 'gqczxxInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk4',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'股权出质登记信息')

        data = {
            'method': 'jyycInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk6',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'经营异常信息')

        data = {
            'method': 'yzwfInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk14',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'严重违法信息')

        data = {
            'method': 'cfInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk3',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'行政处罚信息')

        data = {
            'method': 'ccjcInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk7',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.one_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'抽查检查信息')

        data = {
            'method': 'qygsInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk8',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'企业年报')

        data = {
            'method': 'qygsForTzrxxInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk12',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'股东及出资信息', u'变更信息')

        data = {
            'method': 'cqygsForTzrbgxxInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk15',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'股权变更信息')

        data = {
            'method': 'qygsForXzxkInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk10',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'行政许可信息')

        data = {
            'method': 'qygsForZzcqInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk11',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'知识产权出质登记信息')

        data = {
            'method': 'qygsForXzcfInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk13',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.two_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'行政处罚信息')

        data = {
            'method': 'qtgsInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk9',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.three_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'行政许可信息')

        data = {
            'method': 'qtgsForCfInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk16',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.three_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'行政处罚信息')

        data = {
            'method': 'sfgsInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk17',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.four_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'司法股权冻结信息')

        data = {
            'method': 'sfgsbgInfo',
            'maent.pripid': self.pripid,
            'czmk': 'czmk18',
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=180)
        self.get_json_one(self.four_dict,
                          BeautifulSoup(resp.content).find_all('table'),
                          u'司法股东变更登记信息')

        self.result_json_dict[
            'ind_comm_pub_reg_basic'] = self.result_json_dict[
                'ind_comm_pub_reg_basic'][0]
        if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys(
        ) and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0:
            self.result_json_dict[
                'ind_comm_pub_arch_liquidation'] = self.result_json_dict[
                    'ind_comm_pub_arch_liquidation'][0]
        CrawlerUtils.json_dump_to_file(
            self.json_restore_path, {self.ent_number: self.result_json_dict})
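Every request in this run method differs only in the method/czmk pair posted to the same endpoint, so the repetition could be collapsed by a small helper; a hedged sketch (the helper name is invented):

def fetch_tables(self, method, czmk, timeout=180):
    # Hypothetical wrapper around the repeated POST + BeautifulSoup step above.
    data = {
        'method': method,
        'maent.pripid': self.pripid,
        'czmk': czmk,
        'random': self.cur_time
    }
    resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                           data=data,
                           timeout=timeout)
    return BeautifulSoup(resp.content).find_all('table')

# e.g. self.get_json_one(self.one_dict, self.fetch_tables('qyInfo', 'czmk1'),
#                        u'基本信息', u'股东信息', u'变更信息')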
Example #27
class TestParser(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        from CaptchaRecognition import CaptchaRecognition
        self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
        self.parser = self.crawler.parser
        ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
        self.crawler.json_dict = {}
        self.crawler.ent_number = '500232000003942'


if __name__ == '__main__':

    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    from CaptchaRecognition import CaptchaRecognition

    ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
    crawler = ChongqingClawer('./enterprise_crawler/chongqing/chongqing.json')
    start_time = time.localtime()
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/chongqing.txt')
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        print(
            '############   Start to crawl enterprise with id %s   ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Example #28
    def parse_annual_report_skeleton(self, page):
        """解析 企业年报页面结构
        """
        # enterprise basic info
        soup = BeautifulSoup(page, 'html.parser')
        annual_report_table_items = {}
        tag = soup.find('div', attrs={'id': 'qyjbxx'})
        if not tag:
            print 'parse annual report skeleton failed, do not find qyjbxx table'
            return
        table = tag.find('table', attrs={'class': 'detailsList'})
        if table:
            ent_basic_info_table = {}
            for tr in table.find_all('tr'):
                if tr.find('th') and tr.find('td'):
                    ths = tr.find_all('th')
                    tds = tr.find_all('td')
                    for index, td in enumerate(tds):
                        ent_basic_info_table[td.get(
                            'id')] = CrawlerUtils.get_raw_text_in_bstag(
                                ths[index])
            self.parse_table_items[
                'annual_report_ent_basic_info'] = ent_basic_info_table

        # website / online shop info
        table = soup.find('table',
                          attrs={
                              'id': 'web',
                              'name': 'applicationList1TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_web_info'] = self.get_list_table_items(table)

        # shareholder and capital contribution info
        table = soup.find('table',
                          attrs={
                              'id': 'touziren',
                              'name': 'applicationList4TAB'
                          })
        if table:
            shareholder_info_table = {}

        # outbound investment info
        table = soup.find('table',
                          attrs={
                              'id': 'duiwaitouzi',
                              'name': 'applicationList3TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_investment_abord_info'] = self.get_list_table_items(
                    table)

        # enterprise asset status info
        for table in soup.find_all('table'):
            tr = table.find('tr')
            if tr and tr.find('th') and tr.find('th').text == u'企业资产状况信息':
                ent_property_info_table = {}
                for tr in table.find_all('tr'):
                    if tr.find('th') and tr.find('td'):
                        ths = tr.find_all('th')
                        tds = tr.find_all('td')
                        for index, td in enumerate(tds):
                            ent_property_info_table[td.get(
                                'id')] = CrawlerUtils.get_raw_text_in_bstag(
                                    ths[index])
                self.parse_table_items[
                    'annual_report_ent_property_info'] = ent_property_info_table
                break

        # external guarantee info
        table = soup.find('table',
                          attrs={
                              'id': 'duiwaidanbao',
                              'name': 'applicationList6TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_external_guarantee_info'] = self.get_list_table_items(
                    table)

        # equity change info
        table = soup.find('table',
                          attrs={
                              'id': 'guquanchange',
                              'name': 'applicationList5TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_equity_modify_info'] = self.get_list_table_items(
                    table)

        # modification records
        table = soup.find('table',
                          attrs={
                              'id': 'modifyRecord',
                              'name': 'applicationList2TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_modify_record'] = self.get_list_table_items(
                    table)

        self.annual_report_skeleton_built = True
Example #29
        'open_detail_info_entry': ''
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)


class HunanParser(ZongjuParser):
    def __init__(self, crawler):
        self.crawler = crawler


if __name__ == '__main__':
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    HunanCrawler.code_cracker = CaptchaRecognition('hunan')

    crawler = HunanCrawler('./enterprise_crawler/hunan.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/hunan.txt')
    # enterprise_list = ['430000000011972']
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        settings.logger.info(
            '###################   Start to crawl enterprise with id %s   ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Example #30
 def get_year_of_annual_report(page):
     soup = BeautifulSoup(page, 'html.parser')
     t = soup.body.find('table')
     return CrawlerUtils.get_raw_text_in_bstag(t.find('tr'))
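A made-up page to show what this helper returns (real annual-report pages carry much more markup):

page = '<html><body><table><tr><th>2015年度报告</th></tr></table></body></html>'
# get_year_of_annual_report(page) returns the raw text of the table's first
# row via CrawlerUtils.get_raw_text_in_bstag, here u'2015年度报告'.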