        # Tail of a class-level URL-entry mapping whose opening brace is outside
        # this view; 'open_detail_info_entry' is intentionally blank here —
        # presumably unused for Hunan, TODO confirm against the base class.
        'open_detail_info_entry': ''
    }

    def __init__(self, json_restore_path):
        """Initialize the Hunan crawler.

        Delegates to ZongjuCrawler's initializer, then re-sets the JSON
        restore path and installs a Hunan-specific parser.

        :param json_restore_path: file path where crawl results are dumped.
        """
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)  # Hunan-specific parser bound to this crawler


class HunanParser(ZongjuParser):
    """Parser for Hunan pages; inherits all parsing logic from ZongjuParser."""

    def __init__(self, crawler):
        self.crawler = crawler  # back-reference to the owning crawler


if __name__ == '__main__':
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    # Captcha solver is a class attribute, shared by all HunanCrawler instances.
    HunanCrawler.code_cracker = CaptchaRecognition('hunan')
    crawler = HunanCrawler('./enterprise_crawler/hunan.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/hunan.txt')
    # enterprise_list = ['430000000011972']
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        settings.logger.info(
            '################### Start to crawl enterprise with id %s ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
class TestParser(unittest.TestCase):
    """Unit tests for the Chongqing crawler's parser."""

    def setUp(self):
        """Build a crawler/parser pair pinned to a fixed test enterprise id."""
        unittest.TestCase.setUp(self)
        from CaptchaRecognition import CaptchaRecognition
        self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
        self.parser = self.crawler.parser
        # Captcha solver is set on the class, so it is shared by every instance.
        ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
        self.crawler.json_dict = {}
        self.crawler.ent_number = '500232000003942'  # fixed enterprise id used by the tests


if __name__ == '__main__':
    import sys
    reload(sys)
    # Python 2 hack: re-expose and force UTF-8 as the process-wide default
    # encoding so implicit str/unicode conversions don't raise.
    sys.setdefaultencoding("utf-8")
    from CaptchaRecognition import CaptchaRecognition
    ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
    crawler = ChongqingClawer('./enterprise_crawler/chongqing/chongqing.json')
    start_time = time.localtime()  # NOTE(review): recorded but never used in this view
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/chongqing.txt')
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        print(
            '############ Start to crawl enterprise with id %s ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
            # Tail of a test method whose opening (the `with open(...)` and the
            # method's `def` line) is outside this view.
            page = f.read()
        result = self.parser.parse_ent_pub_annual_report_page(page)
        CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                       {self.crawler.ent_number: result})

    def test_parse_shareholder_detail_page(self):
        """Parse a saved shareholder-detail HTML fixture and dump the result to JSON."""
        with open('./enterprise_crawler/zongju/shareholder_detail.html') as f:
            page = f.read()
        result = self.parser.parse_ind_comm_pub_shareholder_detail_page(
            page)
        CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                       {self.crawler.ent_number: result})


if __name__ == '__main__':
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    # Captcha solver is a class attribute, shared by all ZongjuCrawler instances.
    ZongjuCrawler.code_cracker = CaptchaRecognition('zongju')
    crawler = ZongjuCrawler('./enterprise_crawler/zongju.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/zongju.txt')
    # enterprise_list = ['100000000018305']
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        settings.logger.info(
            '############### Start to crawl enterprise with id %s ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
'http://218.26.1.108/QueryYearExamineDetail.jspx?id=', # 企业年报详情 } def __init__(self, json_restore_path): HeilongjiangClawer.__init__(self, json_restore_path) self.json_restore_path = json_restore_path self.parser = ShanxiParser(self) class ShanxiParser(HeilongjiangParser): def __init__(self, crawler): self.crawler = crawler if __name__ == '__main__': from CaptchaRecognition import CaptchaRecognition import run run.config_logging() ShanxiCrawler.code_cracker = CaptchaRecognition('shanxi') crawler = ShanxiCrawler('./enterprise_crawler/shanxi.json') enterprise_list = CrawlerUtils.get_enterprise_list( './enterprise_list/shanxi.txt') # enterprise_list = ['310000000007622'] for ent_number in enterprise_list: ent_number = ent_number.rstrip('\n') settings.logger.info( '################### Start to crawl enterprise with id %s ###################\n' % ent_number) crawler.run(ent_number=ent_number)
'http://gsxt.xzaic.gov.cn/QueryYearExamineDetail.jspx?id=', # 企业年报详情 } def __init__(self, json_restore_path): HeilongjiangClawer.__init__(self, json_restore_path) self.json_restore_path = json_restore_path self.parser = XizangParser(self) class XizangParser(HeilongjiangParser): def __init__(self, crawler): self.crawler = crawler if __name__ == '__main__': # from CaptchaRecognition import CaptchaRecognition import run run.config_logging() # XizangCrawler.code_cracker = CaptchaRecognition('xizang') crawler = XizangCrawler('./enterprise_crawler/xizang.json') enterprise_list = CrawlerUtils.get_enterprise_list( './enterprise_list/xizang.txt') #enterprise_list = ['5400001000374'] for ent_number in enterprise_list: ent_number = ent_number.rstrip('\n') settings.logger.info( '################### Start to crawl enterprise with id %s ###################\n' % ent_number) crawler.run(ent_number=ent_number)
            # Tail of a table-parsing method: the enclosing iteration over table
            # rows (`tr`) and the initialization of table_th / table_td / total
            # are outside this view — indentation reconstructed, TODO confirm.
            table_tds = tr.find_all("td")
            for td in table_tds:
                # Cells that span multiple columns are section headers /
                # separators rather than data cells — skip them.
                if 'colspan' in td.attrs:
                    continue
                else:
                    table_td.append(td.text.strip())
        if table_td:
            # Pair each header with the data cell at the same index
            # (assumes len(table_td) >= len(table_th), otherwise this raises
            # IndexError — NOTE(review): verify against real pages).
            for i in range(0, len(table_th)):
                table_save = {}
                table_save[table_th[i]] = table_td[i]
                total.append(table_save)
        return total


if __name__ == '__main__':
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    # Captcha solver is a class attribute, shared by all LiaoningCrawler instances.
    LiaoningCrawler.code_cracker = CaptchaRecognition('liaoning')
    crawler = LiaoningCrawler('./enterprise_crawler/liaoning.json')
    enterprise_list = CrawlerUtils.get_enterprise_list('./enterprise_list/liaoning.txt')
    # enterprise_list = ['210000004920321']
    # enterprise_list = ['210200400016720']
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        settings.logger.info('################### Start to crawl enterprise with id %s ###################\n' % ent_number)
        crawler.run(ent_number=ent_number)
'http://218.95.241.36/QueryYearExamineDetail.jspx?id=', # 企业年报详情 } def __init__(self, json_restore_path): HeilongjiangClawer.__init__(self, json_restore_path) self.json_restore_path = json_restore_path self.parser = QinghaiParser(self) class QinghaiParser(HeilongjiangParser): def __init__(self, crawler): self.crawler = crawler if __name__ == '__main__': from CaptchaRecognition import CaptchaRecognition import run run.config_logging() QinghaiCrawler.code_cracker = CaptchaRecognition('qinghai') crawler = QinghaiCrawler('./enterprise_crawler/qinghai.json') enterprise_list = CrawlerUtils.get_enterprise_list( './enterprise_list/qinghai.txt') # enterprise_list = ["630000400003574"] for ent_number in enterprise_list: ent_number = ent_number.rstrip('\n') settings.logger.info( '################### Start to crawl enterprise with id %s ###################\n' % ent_number) crawler.run(ent_number=ent_number)