def get_page(self, type, tab):
    """Fetch a page. For simplicity every parameter that might be needed is
    appended to the url; extra parameters do not affect the request.
    Args:
        tab: value appended to the url when visiting the page.
            1 industrial & commercial publicity, 2 enterprise publicity, 3 other-department publicity
    """
    url = CrawlerUtils.add_params_to_url(
        self.urls[type], {
            'entId': self.ent_id,
            'ent_id': self.ent_id,
            'entid': self.ent_id,
            'credit_ticket': self.credit_ticket,
            'entNo': self.ent_number,
            'entName': '',
            'timeStamp': self.generate_time_stamp(),
            'clear': 'true',
            'str': tab
        })
    settings.logger.info('get %s, url:\n%s\n' % (type, url))
    resp = self.reqst.get(url)
    if resp.status_code != 200:
        settings.logger.warn('get page failed by url %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.html_restore_path + type + '.html', page)
    return page
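
# CrawlerUtils.add_params_to_url is not shown in this listing. A minimal sketch
# of what it is presumed to do (urlencode the params dict and append it to the
# base url) -- illustrative only, not the project's actual implementation:
import urllib


def add_params_to_url_sketch(url, params):
    """Append query parameters to a url, keeping any existing query string."""
    separator = '&' if '?' in url else '?'
    return url + separator + urllib.urlencode(params)

# e.g. add_params_to_url_sketch('http://example.com/x.jspx', {'entId': '1', 'str': '2'})
# -> 'http://example.com/x.jspx?entId=1&str=2' (parameter order may vary)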
def test_parse_shareholder_detail_page(self):
    with open('./enterprise_crawler/zongju/shareholder_detail.html') as f:
        page = f.read()
    result = self.parser.parse_ind_comm_pub_shareholder_detail_page(page)
    CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                   {self.crawler.ent_number: result})
def get_pdfs_from_data_json(abs_pdf_restore_dir, json_file_name):
    json_file = open(json_file_name, 'r')
    for line in json_file.readlines():
        list_dict = json.loads(line)['list']
        for i, item in enumerate(list_dict):
            pdf_url = item['pdf_url']
            count = 0
            resp = None
            while count < 10:
                resp = reqst.get(pdf_url)
                if resp.status_code == 200 and resp.content:
                    with open('%s/%s' % (abs_pdf_restore_dir,
                                         pdf_url.rsplit('/')[-1]), 'wb') as f:
                        f.write(resp.content)
                    break
                else:
                    count += 1
            if count == 10:
                print '%s, get_error_pdf' % pdf_url
                continue
            list_dict[i]['abs_path'] = '%s/%s' % (abs_pdf_restore_dir,
                                                  pdf_url.rsplit('/')[-1])
        CrawlerUtils.json_dump_to_file(
            '%s%s%s' % (json_file_name[:-5], '_insert', json_file_name[-5:]),
            {'list': list_dict})
    json_file.close()
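
# The same download-with-retries pattern appears again in get_pdf further down.
# A small self-contained helper along these lines could factor it out; the name
# and signature here are illustrative, not part of the original code:
import requests


def fetch_with_retries(url, max_tries=10, timeout=120):
    """Return the body of url, retrying up to max_tries times; None on failure."""
    for _ in range(max_tries):
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException:
            continue
        if resp.status_code == 200 and resp.content:
            return resp.content
    return None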
def run(self, findCode):
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.result_json_dict = {}
    self.id = self.get_id_num(findCode)
    print self.id
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_one(self.one_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_two(self.two_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_three(self.three_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_four(self.four_dict, soup.find_all('table'))

    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {self.ent_number: self.result_json_dict})
def crawl_page_by_url(self, url):
    """Crawl a page directly by url."""
    resp = self.reqst.get(url)
    if resp.status_code != 200:
        settings.logger.error('crawl page by url failed! url = %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html',
                                       page)
    return page
def get_dict_table_items(self, table_tag):
    """Extract the skeleton of a dict-type table."""
    table_items = {}
    for tr in table_tag.find_all('tr'):
        if tr.find('th') and tr.find('td'):
            ths = tr.find_all('th')
            tds = tr.find_all('td')
            for index, td in enumerate(tds):
                table_items[CrawlerUtils.get_raw_text_in_bstag(td).strip('{}').replace('PAPERS_', '')] \
                    = CrawlerUtils.get_raw_text_in_bstag(ths[index])
    return table_items
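
# For reference, the kind of mapping get_dict_table_items builds, shown on a
# tiny hand-written dict-type table (illustrative only; the real tables come
# from crawled pages and the real code uses CrawlerUtils.get_raw_text_in_bstag):
from bs4 import BeautifulSoup


def dict_table_items_example():
    html = (u'<table><tr>'
            u'<th>注册号</th><td>{PAPERS_regNo}</td>'
            u'<th>名称</th><td>{PAPERS_entName}</td>'
            u'</tr></table>')
    soup = BeautifulSoup(html, 'html.parser')
    items = {}
    for tr in soup.find_all('tr'):
        ths = tr.find_all('th')
        tds = tr.find_all('td')
        for index, td in enumerate(tds):
            items[td.get_text().strip('{}').replace('PAPERS_', '')] = \
                ths[index].get_text()
    return items  # {u'regNo': u'注册号', u'entName': u'名称'}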
def crawl_page_by_url(self, url):
    """Fetch a page directly by url."""
    resp = self.reqst.get(url, verify=False)
    if resp.status_code != 200:
        settings.logger.error('failed to crawl page by url %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html',
                                       page)
    return page
def parse_ind_comm_pub_reg_modify_table(self, bs_table, table_name, page):
    """Parse the industrial & commercial publicity registration modify-info
    table. It contains detail pages, so it needs special handling.
    """
    tbody = bs_table.find('tbody')
    if tbody:
        columns = self.get_columns_of_record_table(bs_table, page)
        column_size = len(columns)
        item_array = []
        for tr in tbody.find_all('tr'):
            if tr.find('td'):
                col_count = 0
                item = {}
                for td in tr.find_all('td'):
                    if td.find('a'):
                        # try to retrieve a detail link from the page
                        next_url = self.get_detail_link(td.find('a'), page)
                        if next_url:
                            # has a detail link
                            detail_page = self.crawler.crawl_page_by_url(
                                next_url).content
                            detail_soup = BeautifulSoup(detail_page,
                                                        'html.parser')
                            before_modify_table = detail_soup.body.find_all(
                                'table')[1]
                            item[columns[col_count][0]] = self.parse_table(
                                before_modify_table, 'before_modify',
                                detail_page)
                            col_count += 1
                            after_modify_table = detail_soup.body.find_all(
                                'table')[2]
                            item[columns[col_count][0]] = self.parse_table(
                                after_modify_table, 'after_modify',
                                detail_page)
                        else:
                            item[columns[col_count][0]] = \
                                CrawlerUtils.get_raw_text_in_bstag(td)
                    else:
                        item[columns[col_count][0]] = \
                            CrawlerUtils.get_raw_text_in_bstag(td)
                    col_count += 1
                    if col_count == column_size:
                        item_array.append(item.copy())
                        col_count = 0
        return item_array
def main():
    config_logging()

    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)

    cur_date = CrawlerUtils.get_cur_y_m_d()
    set_codecracker()

    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        checker = Checker(dt)
        checker.run()
        return

    if len(sys.argv) < 3:
        print 'usage: run.py [check] [max_crawl_time(minutes) province...] \n\tmax_crawl_time 最大爬取时间,以分钟计;\n\tprovince 是所要爬取的省份列表,用空格分开,all 表示爬取全部'
        return

    try:
        max_crawl_time = int(sys.argv[1])
        settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)
    except ValueError:
        settings.logger.error('invalid max_crawl_time, should be an integer')
        os._exit(1)

    # threading.Timer expects seconds, while max_crawl_time is given in minutes.
    timer = threading.Timer(max_crawl_time * 60, force_exit)
    timer.start()

    settings.logger.info(u'即将开始爬取,最长爬取时间为 %s' % settings.max_crawl_time)
    settings.start_crawl_time = datetime.datetime.now()

    if sys.argv[2] == 'all':
        args = [p for p in sorted(province_crawler.keys())]
        process_pool = MyPool()
        process_pool.map(crawl_province, args)
        process_pool.close()
        settings.logger.info("wait processes....")
        process_pool.join()
    else:
        provinces = sys.argv[2:]
        for p in provinces:
            if p not in province_crawler.keys():
                settings.logger.warn('province %s is not supported currently' % p)
                continue
            crawl_province(p)
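
# The hard deadline in main() relies on threading.Timer: once the crawl budget
# elapses, force_exit kills the whole process. A minimal standalone sketch of
# that pattern (force_exit here is a stand-in for the one defined in run.py):
import os
import threading
import time


def _force_exit_sketch():
    os._exit(1)


def deadline_demo(budget_seconds=5):
    timer = threading.Timer(budget_seconds, _force_exit_sketch)
    timer.start()
    time.sleep(1)   # the real code would be crawling here
    timer.cancel()  # finished early, so disarm the timer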
def down_yesterday_pdf(yesterday):
    abs_yesterday_json_url = '%s/%s/%s/%s/%s' % (settings.host, settings.ID,
                                                 yesterday[:4], yesterday[4:6],
                                                 yesterday[6:])
    need_down_json_file_name = get_need_down_json_file_name(
        abs_yesterday_json_url)
    if need_down_json_file_name is None:
        print '-error__from_%s____no_data' % abs_yesterday_json_url
        return

    abs_yesterday_json_url = '%s/%s' % (abs_yesterday_json_url,
                                        need_down_json_file_name)
    abs_json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_dir,
                                            yesterday[:4], yesterday[4:6],
                                            yesterday[6:])
    if not os.path.exists(abs_json_restore_dir):
        CrawlerUtils.make_dir(abs_json_restore_dir)
    abs_pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir,
                                           yesterday[:4], yesterday[4:6],
                                           yesterday[6:])
    if not os.path.exists(abs_pdf_restore_dir):
        CrawlerUtils.make_dir(abs_pdf_restore_dir)

    get_json_file_OK = get_data_json_file(abs_yesterday_json_url,
                                          abs_json_restore_dir,
                                          need_down_json_file_name)
    if get_json_file_OK is False:
        print '-error--nodata_from_%s%s' % (abs_json_restore_dir,
                                            need_down_json_file_name)
        return

    abs_yesterday_json_gz_file_name = '%s/%s' % (abs_json_restore_dir,
                                                 need_down_json_file_name)
    abs_yesterday_json_file_name = '%s/%s%s' % (abs_json_restore_dir,
                                                yesterday, '.json')
    # Decompress the downloaded .gz file, then drop the compressed copy.
    with open(abs_yesterday_json_gz_file_name, 'rb') as gz_file:
        g = gzip.GzipFile(mode='rb', fileobj=gz_file)
        with open(abs_yesterday_json_file_name, 'wb') as json_file:
            json_file.write(g.read())
    if os.path.isfile(abs_yesterday_json_gz_file_name):
        os.remove(abs_yesterday_json_gz_file_name)
    get_pdfs_from_data_json(abs_pdf_restore_dir, abs_yesterday_json_file_name)
def get_page(self, type, tab):
    """Fetch a page. For simplicity every parameter that might be needed is
    appended to the url; extra parameters do not affect the request.
    Args:
        tab: value appended to the url when visiting the page.
            1 industrial & commercial publicity, 2 enterprise publicity, 3 other-department publicity
    """
    url = CrawlerUtils.add_params_to_url(
        self.urls[type], {
            'entId': self.ent_id,
            'ent_id': self.ent_id,
            'entid': self.ent_id,
            'credit_ticket': self.credit_ticket,
            'entNo': self.ent_number,
            'entName': '',
            'timeStamp': self.generate_time_stamp(),
            'clear': 'true',
            'str': tab
        })
    logging.info('get %s, url:\n%s\n' % (type, url))
    resp = self.crawl_page_by_url(url)
    if resp.status_code != 200:
        logging.error('get page failed by url %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    return page
def crawl_province(province):
    settings.logger.info('ready to crawl %s' % province)

    # Create the restore directory.
    json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_path, province,
                                        cur_date[0], cur_date[1])
    if not os.path.exists(json_restore_dir):
        CrawlerUtils.make_dir(json_restore_dir)

    # Enterprise list to crawl.
    enterprise_list_path = settings.enterprise_list_path + province + '.txt'

    # Target json file name.
    json_restore_path = '%s/%s.json' % (json_restore_dir, cur_date[2])

    with open(enterprise_list_path) as f:
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 3:
                continue
            no = fields[2]
            process = multiprocessing.Process(target=crawl_work,
                                              args=(province,
                                                    json_restore_path, no))
            process.daemon = True
            process.start()
            process.join(300)

    settings.logger.info('All %s crawlers work over' % province)

    # Compress the result and keep only the .gz file.
    if not os.path.exists(json_restore_path):
        settings.logger.warn('json restore path %s does not exist!' %
                             json_restore_path)
        os._exit(1)
        return
    with open(json_restore_path, 'r') as f:
        data = f.read()
    compressed_json_restore_path = json_restore_path + '.gz'
    with gzip.open(compressed_json_restore_path, 'wb') as cf:
        cf.write(data)

    # Remove the plain json file; only the .gz copy is kept.
    os.remove(json_restore_path)
    os._exit(0)
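
# crawl_province reads the whole json file into memory before gzipping it. A
# streaming variant of the same compress-then-delete step (a sketch under the
# same file layout; not part of the original code):
import gzip
import os
import shutil


def compress_and_remove(json_path):
    """Gzip json_path to json_path + '.gz' without loading it fully, then remove it."""
    with open(json_path, 'rb') as src:
        with gzip.open(json_path + '.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)
    os.remove(json_path)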
def get_list_table_items(self, table_tag):
    """Extract the skeleton of a record-type (list) table."""
    table_items = {}
    if len(table_tag.find_all('tr')) != 3:
        print 'abnormal list table skeleton, table_tag = ', table_tag
        return table_items
    ths = table_tag.find_all('tr')[1].find_all('th')
    tds = table_tag.find_all('tr')[2].find_all('td')
    if len(ths) != len(tds):
        print 'abnormal list table skeleton, table_tag = ', table_tag
        return table_items
    for index, td in enumerate(tds):
        table_items[CrawlerUtils.get_raw_text_in_bstag(td).strip('{}').replace(
            'PAPERS_', '')] = CrawlerUtils.get_raw_text_in_bstag(ths[index])
    return table_items
def run(self, ent_name=None):
    if ent_name is None:
        return False
    crawler = NameToIDCrawler(
        './enterprise_crawler/nametoid/name_to_id.json')
    crawler.ent_name = str(ent_name).strip()

    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + crawler.ent_name + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    page = crawler.crawl_page_by_get_params(crawler.ent_name)
    crawler.results = crawler.parser.parse_search_page(page=page)

    # Multiple threads run here, so writing the file is protected by a lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_name: crawler.results})
    self.write_file_mutex.release()
    return True
def run(self, findCode):
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.uuid = self.get_id_num(findCode)
    print self.uuid

    self.result_json_dict = {}
    tableone = self.get_tables(self.uuid + '&tab=01')
    self.get_json_one(self.one_dict, tableone)
    tabletwo = self.get_tables(self.uuid + '&tab=02')
    self.get_json_two(self.two_dict, tabletwo)
    tablethree = self.get_tables(self.uuid + '&tab=03')
    self.get_json_three(self.three_dict, tablethree)
    tablefour = self.get_tables(self.uuid + '&tab=06')
    self.get_json_four(self.four_dict, tablefour)

    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {self.ent_number: self.result_json_dict})
def parse_list_table_without_sub_list(self, records_tag, table_name, page,
                                      columns):
    """Parse a record-type table that has no sub-columns.
    Args:
        records_tag: the table's record tag, extracted from the html with BeautifulSoup
        table_name: name of the table
        page: the raw html page
        columns: the table's header structure
    Returns:
        item_array: the extracted rows as a list of python dicts
    """
    item_array = []
    for tr in records_tag.find_all('tr'):
        col_count = 0
        item = {}
        for td in tr.find_all('td', recursive=False):
            if td.find('a'):
                # try to retrieve a detail link from the page
                next_url = self.get_detail_link(td.find('a'), page)
                if next_url:
                    # has a detail link
                    detail_page = self.crawler.crawl_page_by_url(next_url)
                    if table_name == 'ent_pub_ent_annual_report':
                        page_data = self.parse_ent_pub_annual_report_page(
                            detail_page)
                        item[u'报送年度'] = CrawlerUtils.get_raw_text_in_bstag(td)
                        item[u'详情'] = page_data  # detail page data
                    elif table_name == 'ind_comm_pub_reg_shareholder':
                        page_data = self.parse_ind_comm_pub_shareholder_detail_page(
                            detail_page)
                        item[u'详情'] = {u'投资人及出资信息': page_data}
                    else:
                        page_data = self.parse_page(detail_page,
                                                    table_name + '_detail')
                        item[columns[col_count][0]] = page_data  # detail page data
                else:
                    item[columns[col_count][0]] = self.get_column_data(
                        columns[col_count][1], td)
            else:
                item[columns[col_count][0]] = self.get_column_data(
                    columns[col_count][1], td)
            col_count += 1
        if item:
            item_array.append(item)
    return item_array
def run(self, findCode):
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.result_json_dict = {}
    self.id = self.get_id_num(findCode)
    if self.id is None:
        return json.dumps({self.ent_number: {}})

    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_one(self.one_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_two(self.two_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_three(self.three_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_four(self.four_dict, soup.find_all('table'))

    return json.dumps({self.ent_number: self.result_json_dict})
def get_pdf(save_path, list_dict):
    pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir,
                                       save_path[:4], save_path[4:6],
                                       save_path[6:])
    if not os.path.exists(pdf_restore_dir):
        CrawlerUtils.make_dir(pdf_restore_dir)
    for item in list_dict:
        pdf_url = item['pdf_url']
        count = 0
        while count < 10:
            resp = reqst.get(pdf_url)
            if resp.status_code == 200 and resp.content:
                with open(os.path.join(pdf_restore_dir,
                                       pdf_url.rsplit('/')[-1]), 'wb') as f:
                    f.write(resp.content)
                break
            else:
                count += 1
        if count == 10:
            print '%s,get-error' % pdf_url
            continue
def run(self, findCode):
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.id = self.get_id_num(findCode)
    if self.id is None:
        return json.dumps({self.ent_number: {}})

    self.result_json_dict = {}
    tableone = self.get_tables(self.search_dict['businessPublicity'] + 'id=' +
                               self.id)
    self.get_json_one(self.one_dict, tableone)
    tabletwo = self.get_tables(self.search_dict['enterprisePublicity'] +
                               'id=' + self.id)
    self.get_json_two(self.two_dict, tabletwo)
    tablethree = self.get_tables(self.search_dict['otherDepartment'] + 'id=' +
                                 self.id)
    self.get_json_three(self.three_dict, tablethree)
    tablefour = self.get_tables(self.search_dict['justiceAssistance'] +
                                'id=' + self.id)
    self.get_json_four(self.four_dict, tablefour)

    return json.dumps({self.ent_number: self.result_json_dict})
def run(self, findCode):
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.id = self.get_id_num(findCode)
    print self.id

    self.result_json_dict = {}
    tableone = self.get_tables(self.mysearchdict['businessPublicity'] +
                               'id=' + self.id)
    self.get_json_one(self.one_dict, tableone)
    tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] +
                               'id=' + self.id)
    self.get_json_two(self.two_dict, tabletwo)
    tablethree = self.get_tables(self.mysearchdict['otherDepartment'] +
                                 'id=' + self.id)
    self.get_json_three(self.three_dict, tablethree)
    tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] +
                                'id=' + self.id)
    self.get_json_four(self.four_dict, tablefour)

    print {self.ent_number: self.result_json_dict}
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {self.ent_number: self.result_json_dict})
def run(self, ent_number=0):
    crawler = ChongqingClawer(
        './enterprise_crawler/chongqing/chongqing.json')
    crawler.ent_number = str(ent_number)

    # Each enterprise gets its own directory for restored html pages.
    crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    page = crawler.crawl_check_page()
    try:
        crawler.crawl_page_jsons(page)
        crawler.parser.parse_jsons()
        crawler.parser.merge_jsons()
    except Exception:
        return False

    # Multiple threads run here, so writing the file is protected by a lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_number: crawler.json_dict})
    self.write_file_mutex.release()
    return True
def get_detail_link(self, bs4_tag, page):
    """Get the detail-page url, extracted either from the bs tag or from the page.
    Args:
        bs4_tag: the BeautifulSoup tag
        page: the page data
    """
    detail_op = bs4_tag.get('onclick')
    pat_view_info = re.compile(r'viewInfo\(\'([\w]+)\'\)')
    pat_show_dialog = re.compile(r'showDialog\(\'([^\'\n]+)\'')
    next_url = ''
    if detail_op and pat_view_info.search(detail_op):
        m = pat_view_info.search(detail_op)
        val = m.group(1)
        # detail link type 1, e.g. ind_comm_pub info -> registration info -> shareholders info
        pat = re.compile(
            r'var +url += +rootPath +\+ +\"(.+\?)([\w]+)=\"\+[\w]+\+\"')
        m1 = pat.search(page)
        if m1:
            addition_url = m1.group(1)
            query_key = m1.group(2)
            next_url = CrawlerUtils.add_params_to_url(
                self.crawler.urls['host'] + addition_url, {
                    query_key: val,
                    'entId': self.crawler.ent_id,
                    'ent_id': self.crawler.ent_id,
                    'entid': self.crawler.ent_id,
                    'credit_ticket': self.crawler.credit_ticket,
                    'entNo': self.crawler.ent_number
                })
    elif detail_op and pat_show_dialog.search(detail_op):
        # detail link type 2, e.g. ind_comm_pub_info -> registration info -> modify info
        m = pat_show_dialog.search(detail_op)
        val = m.group(1)
        next_url = self.crawler.urls['host'] + val
    elif 'href' in bs4_tag.attrs.keys():
        # detail link type 3, e.g. ent pub info -> enterprise annual report
        next_url = self.crawler.urls['host'] + bs4_tag['href']
    return next_url
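
# A quick illustration of the two onclick patterns get_detail_link recognizes;
# the onclick strings below are made-up examples, not captured from the site:
import re

pat_view_info_demo = re.compile(r'viewInfo\(\'([\w]+)\'\)')
pat_show_dialog_demo = re.compile(r'showDialog\(\'([^\'\n]+)\'')

# pat_view_info_demo.search("viewInfo('1234567890')").group(1)
#   -> '1234567890'  (the id is then spliced into the url found in the page script)
# pat_show_dialog_demo.search("showDialog('/some/detail.jspx?id=1')").group(1)
#   -> '/some/detail.jspx?id=1'  (appended directly to urls['host'])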
def parse_annual_report_shareholder_info(self, page):
    """Parse the shareholder (investor) info in the annual report.
    It needs to be handled separately.
    """
    shareholder_info = []
    record_columns = [
        u'股东', u'认缴出资额', u'认缴出资时间', u'认缴出资方式', u'实缴出资额', u'实缴出资时间', u'实缴出资方式'
    ]
    json_obj = json.loads(page)
    for record in json_obj.get('items'):
        if not record.get('D1'):
            continue
        result = {}
        soup = BeautifulSoup(record.get('D1'), 'html.parser')
        tds = soup.find_all('td')
        if not tds:
            continue
        for index, column in enumerate(record_columns):
            result[column] = CrawlerUtils.get_raw_text_in_bstag(tds[index])
        shareholder_info.append(result)
    return shareholder_info
def run(self, findCode):
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    nbxh = self.get_id_num(findCode)
    self.nbxh = nbxh

    search_url = 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml'
    search_old_url = 'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml'

    result_dict = self.send_post(search_url, nbxh, '0', '5')
    print result_dict
    self.get_json_one(allths=[
        u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本', u'成立日期', u'住所',
        u'营业期限自', u'营业期限至', u'经营范围', u'登记机关', u'核准日期', u'登记状态'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb',
                          u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw',
                          u'djjgmc', u'hzrq', u'mclxmc'
                      ],
                      head='ind_comm_pub_reg_basic')

    result_dict = self.send_post(search_url, nbxh, '0', '3')
    print result_dict
    self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                      alltds=result_dict,
                      alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                      head='ind_comm_pub_reg_modify')

    result_dict = self.send_post(search_url, nbxh, '2', '3')
    print result_dict
    self.get_json_one(allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
                      alltds=result_dict,
                      alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
                      head='ind_comm_pub_reg_shareholder')

    result_dict = self.send_post(search_url, nbxh, '0', '8')
    print result_dict
    self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'xm', u'zwmc'],
                      head='ind_comm_pub_arch_key_persons')

    result_dict = self.send_post(search_url, nbxh, '0', '36')
    print result_dict
    self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_arch_liquidation')

    result_dict = self.send_post(search_url, nbxh, '0', '9')
    print result_dict
    self.get_json_one(allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
                      head='ind_comm_pub_arch_branch')

    result_dict = self.send_post(search_url, nbxh, '0', '25')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态', u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_movable_property_reg')

    result_dict = self.send_post(search_url, nbxh, '0', '4')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人', u'证照/证件号码',
        u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_equity_ownership_reg')

    result_dict = self.send_post(search_url, nbxh, '0', '1')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_administration_sanction')

    result_dict = self.send_post(search_url, nbxh, '0', '33')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_business_exception')

    result_dict = self.send_post(search_url, nbxh, '0', '34')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_serious_violate_law')

    result_dict = self.send_post(search_url, nbxh, '0', '35')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_spot_check')

    result_dict = self.send_post(search_url, nbxh, '0', '13')
    print result_dict
    self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                      head='ent_pub_ent_annual_report')

    result_dict = self.send_post(search_url, nbxh, '0', '40')
    print result_dict
    self.get_json_two(allths=[
        u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)', u'认缴出资日期',
        u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)', u'实缴出资日期', u'实缴公示日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze',
                          u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze', u'sjczrq',
                          u'sjgsrq'
                      ],
                      head='ent_pub_shareholder_capital_contribution')

    result_dict = self.send_post(search_url, nbxh, '0', '23')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期', u'公示日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_equity_change')

    result_dict = self.send_post(search_url, nbxh, '0', '20')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关', u'许可内容', u'状态',
        u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                          u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq', u'lsh'
                      ],
                      head='ent_pub_administration_license')

    result_dict = self.send_post(search_url, nbxh, '0', '21')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_knowledge_property')

    result_dict = self.send_post(search_url, nbxh, '0', '22')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_shareholder_modify')

    result_dict = self.send_post(search_old_url, nbxh, '0', '37')
    print result_dict
    self.get_json_three(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期', u'许可机关', u'许可内容',
        u'状态', u'详情'
    ],
                        alltds=result_dict,
                        alltds_keys=[
                            u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1', u'yxq2',
                            u'yxq', u'xkjg', u'xknr', u'zt', u'zt'
                        ],
                        head='other_dept_pub_administration_license')

    result_dict = self.send_post(search_old_url, nbxh, '0', '38')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容', u'作出行政处罚决定机关名称',
        u'作出行政处罚决定日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='other_dept_pub_administration_sanction')

    result_dict = self.send_post(search_url, nbxh, '0', '49')
    print result_dict
    self.get_json_four(allths=[
        u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态', u'详情'
    ],
                       alltds=result_dict,
                       alltds_keys=[],
                       head='judical_assist_pub_equity_freeze')

    result_dict = self.send_post(search_url, nbxh, '0', '53')
    print result_dict
    self.get_json_four(allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
                       alltds=result_dict,
                       alltds_keys=[],
                       head='judical_assist_pub_shareholder_modify')

    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {self.ent_number: self.result_json_dict})
def run(self, findCode):
    self.ent_number = findCode
    id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
    print id_args
    if id_args and id_args.download_args.get('uuid'):
        self.result_json_dict = {}
        self.uuid = id_args.download_args['uuid']
        tableone = self.get_tables(self.uuid + '&tab=01')
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.uuid + '&tab=02')
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.uuid + '&tab=03')
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.uuid + '&tab=06')
        self.get_json_four(self.four_dict, tablefour)
        CrawlerUtils.json_dump_to_file(
            'yunnan.json', {self.ent_number: self.result_json_dict})
        print json.dumps({self.ent_number: self.result_json_dict})
        return [{self.ent_number: self.result_json_dict}]
    else:
        # Create the restore directory.
        html_restore_path = self.json_restore_path + '/yunnan/'
        if not os.path.exists(html_restore_path):
            os.makedirs(html_restore_path)
        self.uuid = self.get_id_num(findCode)
        if self.uuid is None:
            return json.dumps({self.ent_number: {}})
        self.result_json_dict_list = []
        for div in BeautifulSoup(self.after_crack_checkcode_page,
                                 'html.parser').find_all(
                                     'div', attrs={'class': 'list-item'}):
            hrefa = div.find_all('a', attrs={'target': '_blank'})[0]
            if hrefa:
                self.uuid = hrefa['href'].split('&')[0]
                self.enterprise_name = div.find_all(
                    'div', attrs={'class': 'link'})[0].get_text().strip()
                self.ent_number = div.find_all('span')[0].get_text().strip()
                args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number) \
                    or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
                    or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
                if args:
                    args.delete()
                args = CrawlerDownloadArgs(
                    province='yunnan',
                    register_number=self.ent_number,
                    unifield_number=self.ent_number,
                    enterprise_name=self.enterprise_name,
                    download_args={'uuid': self.uuid})
                args.save()
            else:
                continue
            print self.uuid
            self.result_json_dict = {}
            tableone = self.get_tables(self.uuid + '&tab=01')
            self.get_json_one(self.one_dict, tableone)
            tabletwo = self.get_tables(self.uuid + '&tab=02')
            self.get_json_two(self.two_dict, tabletwo)
            tablethree = self.get_tables(self.uuid + '&tab=03')
            self.get_json_three(self.three_dict, tablethree)
            tablefour = self.get_tables(self.uuid + '&tab=06')
            self.get_json_four(self.four_dict, tablefour)
            CrawlerUtils.json_dump_to_file(
                'yunnan.json', {self.ent_number: self.result_json_dict})
            print json.dumps({self.ent_number: self.result_json_dict})
            self.result_json_dict_list.append(
                {self.ent_number: self.result_json_dict})
        return self.result_json_dict_list
def run(self, findCode):
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for restored html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)

    self.pripid = self.get_id_num(findCode)
    print findCode, self.pripid
    self.result_json_dict = {}

    def post_and_get_tables(method, czmk, timeout=180):
        """Post one ztxy.do query and return the tables of the returned page."""
        data = {
            'method': method,
            'maent.pripid': self.pripid,
            'czmk': czmk,
            'random': self.cur_time
        }
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                               data=data,
                               timeout=timeout)
        return BeautifulSoup(resp.content).find_all('table')

    self.get_json_one(self.one_dict, post_and_get_tables('qyInfo', 'czmk1'),
                      u'基本信息', u'股东信息', u'变更信息')
    self.get_json_one(self.one_dict, post_and_get_tables('baInfo', 'czmk2'),
                      u'主要人员信息', u'分支机构信息', u'清算信息')
    self.get_json_one(self.one_dict,
                      post_and_get_tables('dcdyInfo', 'czmk4', timeout=120),
                      u'动产抵押登记信息')
    self.get_json_one(self.one_dict,
                      post_and_get_tables('gqczxxInfo', 'czmk4'),
                      u'股权出质登记信息')
    self.get_json_one(self.one_dict, post_and_get_tables('jyycInfo', 'czmk6'),
                      u'经营异常信息')
    self.get_json_one(self.one_dict,
                      post_and_get_tables('yzwfInfo', 'czmk14'), u'严重违法信息')
    self.get_json_one(self.one_dict, post_and_get_tables('cfInfo', 'czmk3'),
                      u'行政处罚信息')
    self.get_json_one(self.one_dict, post_and_get_tables('ccjcInfo', 'czmk7'),
                      u'抽查检查信息')
    self.get_json_one(self.two_dict, post_and_get_tables('qygsInfo', 'czmk8'),
                      u'企业年报')
    self.get_json_one(self.two_dict,
                      post_and_get_tables('qygsForTzrxxInfo', 'czmk12'),
                      u'股东及出资信息', u'变更信息')
    self.get_json_one(self.two_dict,
                      post_and_get_tables('cqygsForTzrbgxxInfo', 'czmk15'),
                      u'股权变更信息')
    self.get_json_one(self.two_dict,
                      post_and_get_tables('qygsForXzxkInfo', 'czmk10'),
                      u'行政许可信息')
    self.get_json_one(self.two_dict,
                      post_and_get_tables('qygsForZzcqInfo', 'czmk11'),
                      u'知识产权出质登记信息')
    self.get_json_one(self.two_dict,
                      post_and_get_tables('qygsForXzcfInfo', 'czmk13'),
                      u'行政处罚信息')
    self.get_json_one(self.three_dict,
                      post_and_get_tables('qtgsInfo', 'czmk9'), u'行政许可信息')
    self.get_json_one(self.three_dict,
                      post_and_get_tables('qtgsForCfInfo', 'czmk16'),
                      u'行政处罚信息')
    self.get_json_one(self.four_dict,
                      post_and_get_tables('sfgsInfo', 'czmk17'), u'司法股权冻结信息')
    self.get_json_one(self.four_dict,
                      post_and_get_tables('sfgsbgInfo', 'czmk18'),
                      u'司法股东变更登记信息')

    self.result_json_dict['ind_comm_pub_reg_basic'] = \
        self.result_json_dict['ind_comm_pub_reg_basic'][0]
    if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys() \
            and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0:
        self.result_json_dict['ind_comm_pub_arch_liquidation'] = \
            self.result_json_dict['ind_comm_pub_arch_liquidation'][0]

    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {self.ent_number: self.result_json_dict})
class TestParser(unittest.TestCase):

    def setUp(self):
        unittest.TestCase.setUp(self)
        from CaptchaRecognition import CaptchaRecognition
        self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
        self.parser = self.crawler.parser
        ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
        self.crawler.json_dict = {}
        self.crawler.ent_number = '500232000003942'


if __name__ == '__main__':
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    from CaptchaRecognition import CaptchaRecognition
    ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
    crawler = ChongqingClawer('./enterprise_crawler/chongqing/chongqing.json')
    start_time = time.localtime()
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/chongqing.txt')
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        print(
            '############ Start to crawl enterprise with id %s ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
def parse_annual_report_skeleton(self, page):
    """Parse the skeleton of the enterprise annual-report page."""
    soup = BeautifulSoup(page, 'html.parser')
    annual_report_table_items = {}

    # Enterprise basic info.
    tag = soup.find('div', attrs={'id': 'qyjbxx'})
    if not tag:
        print 'parse annual report skeleton failed, do not find qyjbxx table'
        return
    table = tag.find('table', attrs={'class': 'detailsList'})
    if table:
        ent_basic_info_table = {}
        for tr in table.find_all('tr'):
            if tr.find('th') and tr.find('td'):
                ths = tr.find_all('th')
                tds = tr.find_all('td')
                for index, td in enumerate(tds):
                    ent_basic_info_table[td.get('id')] = \
                        CrawlerUtils.get_raw_text_in_bstag(ths[index])
        self.parse_table_items['annual_report_ent_basic_info'] = \
            ent_basic_info_table

    # Website / online-shop info.
    table = soup.find('table', attrs={'id': 'web',
                                      'name': 'applicationList1TAB'})
    if table:
        self.parse_table_items['annual_report_web_info'] = \
            self.get_list_table_items(table)

    # Shareholder and capital-contribution info (skeleton not built here).
    table = soup.find('table', attrs={'id': 'touziren',
                                      'name': 'applicationList4TAB'})
    if table:
        shareholder_info_table = {}

    # Outbound investment info.
    table = soup.find('table', attrs={'id': 'duiwaitouzi',
                                      'name': 'applicationList3TAB'})
    if table:
        self.parse_table_items['annual_report_investment_abord_info'] = \
            self.get_list_table_items(table)

    # Enterprise asset status info.
    for table in soup.find_all('table'):
        tr = table.find('tr')
        if tr and tr.find('th') and tr.find('th').text == u'企业资产状况信息':
            ent_property_info_table = {}
            for tr in table.find_all('tr'):
                if tr.find('th') and tr.find('td'):
                    ths = tr.find_all('th')
                    tds = tr.find_all('td')
                    for index, td in enumerate(tds):
                        ent_property_info_table[td.get('id')] = \
                            CrawlerUtils.get_raw_text_in_bstag(ths[index])
            self.parse_table_items['annual_report_ent_property_info'] = \
                ent_property_info_table
            break

    # External guarantee info.
    table = soup.find('table', attrs={'id': 'duiwaidanbao',
                                      'name': 'applicationList6TAB'})
    if table:
        self.parse_table_items['annual_report_external_guarantee_info'] = \
            self.get_list_table_items(table)

    # Equity change info.
    table = soup.find('table', attrs={'id': 'guquanchange',
                                      'name': 'applicationList5TAB'})
    if table:
        self.parse_table_items['annual_report_equity_modify_info'] = \
            self.get_list_table_items(table)

    # Modification records.
    table = soup.find('table', attrs={'id': 'modifyRecord',
                                      'name': 'applicationList2TAB'})
    if table:
        self.parse_table_items['annual_report_modify_record'] = \
            self.get_list_table_items(table)

    self.annual_report_skeleton_built = True
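
# Each annual-report section above is located by an id/name attribute pair. A
# minimal standalone lookup of that form, on illustrative HTML rather than a
# page from the real site:
from bs4 import BeautifulSoup


def find_section_table_example():
    html = u'<table id="web" name="applicationList1TAB"><tr><th>名称</th></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find('table', attrs={'id': 'web', 'name': 'applicationList1TAB'})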
        'open_detail_info_entry': ''
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)


class HunanParser(ZongjuParser):

    def __init__(self, crawler):
        self.crawler = crawler


if __name__ == '__main__':
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    HunanCrawler.code_cracker = CaptchaRecognition('hunan')

    crawler = HunanCrawler('./enterprise_crawler/hunan.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/hunan.txt')
    for ent_number in enterprise_list:
        ent_number = ent_number.rstrip('\n')
        settings.logger.info(
            '################### Start to crawl enterprise with id %s ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
def get_year_of_annual_report(page):
    soup = BeautifulSoup(page, 'html.parser')
    t = soup.body.find('table')
    return CrawlerUtils.get_raw_text_in_bstag(t.find('tr'))