def __init__(self, json_restore_path=None):
    headers = {
        #'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        "User-Agent": get_user_agent()
    }
    self.CR = CaptchaRecognition("hebei")
    self.requests = requests.Session()
    self.requests.headers.update(headers)
    adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                            pool_maxsize=100)
    self.requests.mount('http://', adapter)
    self.ents = {}
    self.json_dict = {}
    self.json_restore_path = json_restore_path
    self.csrf = ""
    # Path where the captcha image is stored
    self.path_captcha = self.json_restore_path + '/hebei/ckcode.jpeg'
    # Path where the html data is stored
    self.html_restore_path = self.json_restore_path + '/hebei/'
    self.proxies = get_proxy('hebei')
    self.timeout = (30, 20)
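# A minimal usage sketch for the initializer above. It assumes the enclosing
# class is named HebeiCrawler and exposes run(ent_number) like the other
# provincial crawlers here; both names are assumptions, and the registration
# number below is a placeholder.
if __name__ == '__main__':
    crawler = HebeiCrawler(json_restore_path='./data')
    # The mounted HTTPAdapter pools up to 100 keep-alive connections, so one
    # session can be shared across worker threads.
    print crawler.run('130000000012345')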
def __init__(self, json_restore_path): self.CR = CaptchaRecognition("hebei") self.requests = requests.Session() self.requests.headers.update(headers) self.ents = [] self.json_restore_path = json_restore_path self.csrf = "" #验证码图片的存储路径 self.path_captcha = settings.json_restore_path + '/hebei/ckcode.jpeg' #html数据的存储路径 self.html_restore_path = settings.json_restore_path + '/hebei/'
def __init__(self, json_restore_path): self.CR = CaptchaRecognition("guangdong") self.requests = requests.Session() self.requests.headers.update(headers) self.ents = [] self.main_host = "" self.json_dict = {} self.json_restore_path = json_restore_path self.html_restore_path = settings.json_restore_path + '/hainan/' #验证码图片的存储路径 self.path_captcha = settings.json_restore_path + '/hainan/ckcode.png'
def __init__(self, json_restore_path=None):
    self.html_search = None
    self.html_showInfo = None
    self.Captcha = None
    self.CR = CaptchaRecognition("guangdong")
    self.requests = requests.Session()
    self.requests.headers.update(headers)
    self.ents = []
    self.main_host = ""
    self.json_dict = {}
    self.json_restore_path = json_restore_path
    self.dir_restore_path = settings.json_restore_path + '/guangdong/'
    #self.json_restore_path = settings.json_restore_path + '/guangdong.json'
    # Path where the captcha image is stored
    self.path_captcha = settings.json_restore_path + '/guangdong/ckcode.jpg'
class FujianCrawler(ZongjuCrawler):
    """Fujian crawler
    """
    # Path where the html data is stored
    html_restore_path = settings.json_restore_path + '/fujian/'
    # Path where the captcha image is stored
    ckcode_image_path = settings.json_restore_path + '/fujian/ckcode.jpg'
    code_cracker = CaptchaRecognition('fujian')
    # Lock protecting writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()
    urls = {
        'host': 'http://www.fjaic.gov.cn/',
        'official_site': 'http://wsgs.fjaic.gov.cn/creditpub/home',
        'get_checkcode': 'http://wsgs.fjaic.gov.cn/creditpub/captcha?preset=math-01',
        'post_checkcode': 'http://wsgs.fjaic.gov.cn/creditpub/security/verify_captcha',
        'get_info_entry': 'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list',
        # Url of the enterprise info page; different tabs (tab=1-4) select different
        # content (industrial & commercial publicity, enterprise publicity, ...)
        'open_info_entry': 'http://wsgs.fjaic.gov.cn/creditpub/notice/view?',
        'open_detail_info_entry': '',
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = FujianParser(self)
class HunanCrawler(ZongjuCrawler):
    """Hunan crawler
    """
    # Path where the html data is stored
    html_restore_path = settings.json_restore_path + '/hunan/'
    # Path where the captcha image is stored
    ckcode_image_path = settings.json_restore_path + '/hunan/ckcode.jpg'
    code_cracker = CaptchaRecognition('hunan')
    # Lock protecting writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()
    urls = {
        'host': 'http://www.hnaic.net.cn/visit/category/a/hnaicalllist',
        'official_site': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',
        'get_checkcode': 'http://gsxt.hnaic.gov.cn/notice/captcha?preset=',
        'post_checkcode': 'http://gsxt.hnaic.gov.cn/notice/search/popup_captcha',
        # Entry point for enterprise queries
        'get_info_entry': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',
        # Url of the enterprise info page; different tabs (tab=1-4) select different
        # content (industrial & commercial publicity, enterprise publicity, ...)
        'open_info_entry': 'http://gsxt.hnaic.gov.cn/notice/notice/view?',
        'open_detail_info_entry': '',
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)
def __init__(self, *args, **kwargs):
    self.ckcode_image_path = settings.json_restore_path + '/anhui/ckcode.jpg'
    # Note: loads the Qinghai captcha model even though this is the Anhui crawler
    self.code_cracker = CaptchaRecognition('qinghai')
    if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
        os.makedirs(os.path.dirname(self.ckcode_image_path))
    self.urls = {
        'eareName': 'http://www.ahcredit.gov.cn',
        'search': 'http://www.ahcredit.gov.cn/search.jspx',
        'checkCheckNo': 'http://www.ahcredit.gov.cn/checkCheckNo.jspx',
        'searchList': 'http://www.ahcredit.gov.cn/searchList.jspx',
        # The id parameter appears to be a client-generated random number
        'validateCode': 'http://www.ahcredit.gov.cn/validateCode.jspx?type=0&id=0.22788021906613765',
        'QueryInvList': 'http://www.ahcredit.gov.cn/QueryInvList.jspx?',
        'queryInvDetailAction': 'http://www.ahcredit.gov.cn/queryInvDetailAction.jspx?',
        'businessPublicity': 'http://www.ahcredit.gov.cn/businessPublicity.jspx?',
        'enterprisePublicity': 'http://www.ahcredit.gov.cn/enterprisePublicity.jspx?',
        'otherDepartment': 'http://www.ahcredit.gov.cn/otherDepartment.jspx?',
        'justiceAssistance': 'http://www.ahcredit.gov.cn/justiceAssistance.jspx?'
    }
    self.timeout = 30
    self.result_json = {}
    self.result_json_list = []
def __init__(self, json_restore_path): """ 初始化函数 Args: json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时, 需要在写入文件的时候加锁 Returns: """ # json 数据集 # POST self.json_restore_path = json_restore_path if os.path.exists(self.json_restore_path) is False: os.makedirs(self.json_restore_path, 0775) self.parser = ChongqingParser(self) self.credit_ticket = None #html数据的存储路径 self.html_restore_path = os.path.join(self.json_restore_path, "/chongqing/") if os.path.exists(self.html_restore_path) is False: os.makedirs(self.html_restore_path, 0775) #验证码图片的存储路径 self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg') self.code_cracker = CaptchaRecognition("chongqing") self.ent_number = None # GET self.ckcode = None self.json_ent_info = None self.json_sfxzgdbg = None self.json_sfxz = None self.json_other_qlicinfo = None self.json_other_qpeninfo = None self.json_year_report = None self.json_year_report_detail = None self.json_year_daily_transinfo = None self.json_year_daily_invsub = None self.json_year_daily_peninfo = None self.json_year_daily_licinfo = None self.json_year_daily_pleinfo = None self.json_dict = {} self.json_restore_path = json_restore_path self.parser = ChongqingParser(self) self.reqst = requests.Session() self.reqst.headers.update({ 'Accept': 'text/html, application/xhtml+xml, */*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'})
def setUp(self):
    unittest.TestCase.setUp(self)
    from CaptchaRecognition import CaptchaRecognition
    self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
    self.parser = self.crawler.parser
    ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
    self.crawler.json_dict = {}
    self.crawler.ent_number = '500232000003942'
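# A companion test sketched under the assumption that ChongqingClawer.run()
# returns a json string keyed by the enterprise number, as the newer
# ChongqingCrawler below does; test_run itself is a hypothetical addition.
def test_run(self):
    result = json.loads(self.crawler.run(self.crawler.ent_number))
    self.assertTrue(result)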
class HubeiCrawler(HeilongjiangClawer):
    """Hubei crawler
    """
    # Path where the html data is stored
    html_restore_path = settings.json_restore_path + '/hubei/'
    # Path where the captcha image is stored
    ckcode_image_path = settings.json_restore_path + '/hubei/ckcode.jpg'
    code_cracker = CaptchaRecognition('hubei')
    # Lock protecting writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()
    urls = {
        'host': 'www.hljaic.gov.cn',
        'get_checkcode': 'http://xyjg.egs.gov.cn/ECPS_HB/validateCode.jspx?type=0',
        'post_checkcode': 'http://xyjg.egs.gov.cn/ECPS_HB/checkCheckNo.jspx',
        'get_info_entry': 'http://xyjg.egs.gov.cn/ECPS_HB/searchList.jspx',
        'ind_comm_pub_skeleton': 'http://xyjg.egs.gov.cn/ECPS_HB/businessPublicity.jspx?id=',
        'ent_pub_skeleton': 'http://xyjg.egs.gov.cn/ECPS_HB/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton': 'http://xyjg.egs.gov.cn/ECPS_HB/otherDepartment.jspx?id=',
        'judical_assist_skeleton': 'http://xyjg.egs.gov.cn/ECPS_HB/justiceAssistance.jspx?id=',
        # Shareholder info
        'ind_comm_pub_reg_shareholder': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryInvList.jspx?',
        # Modification info (paginated)
        'ind_comm_pub_reg_modify': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryAltList.jspx?',
        # Key persons info (paginated)
        'ind_comm_pub_arch_key_persons': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryMemList.jspx?',
        # Spot-check info (paginated)
        'ind_comm_pub_spot_check': 'http://xyjg.egs.gov.cn/ECPS_HB/QuerySpotCheckList.jspx?',
        # Movable property mortgage registration info (paginated)
        'ind_comm_pub_movable_property_reg': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryMortList.jspx?',
        # Business abnormality info
        'ind_comm_pub_business_exception': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryExcList.jspx?',
        # Equity pledge registration info (paginated)
        'ind_comm_pub_equity_ownership_reg': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryPledgeList.jspx?',
        # Branch info
        'ind_comm_pub_arch_branch': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryChildList.jspx?',
        # Investor detail
        'shareholder_detail': 'http://xyjg.egs.gov.cn/ECPS_HB/queryInvDetailAction.jspx?id=',
        # Movable property mortgage registration detail
        'movable_property_reg_detail': 'http://xyjg.egs.gov.cn/ECPS_HB/mortInfoDetail.jspx?id=',
        # Enterprise annual report detail
        'annual_report': 'http://xyjg.egs.gov.cn/ECPS_HB/QueryYearExamineDetail.jspx?id=',
    }

    def __init__(self, json_restore_path):
        HeilongjiangClawer.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HubeiParser(self)
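# The class-level urls table above drives every detail request: a section is
# fetched by appending its query parameters to the matching entry. A sketch,
# assuming a requests session `reqst` and a company id `cid` obtained from
# the search step; the parameter names 'id' and 'pageNo' are assumptions.
params = {'id': cid, 'pageNo': 1}
resp = reqst.get(HubeiCrawler.urls['ind_comm_pub_reg_shareholder'], params=params)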
def __init__(self, json_restore_path=None):
    self.html_showInfo = None
    self.Captcha = None
    # Note: reuses the Guangdong captcha model for Neimenggu
    self.CR = CaptchaRecognition("guangdong")
    self.requests = requests.Session()
    self.requests.headers.update(headers)
    adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                            pool_maxsize=100)
    self.requests.mount('http://', adapter)
    self.ents = {}
    self.json_restore_path = json_restore_path
    self.dir_restore_path = self.json_restore_path + '/neimenggu/'
    # Path where the captcha image is stored
    self.path_captcha = self.json_restore_path + '/neimenggu/ckcode.jpg'
    self.timeout = (30, 20)
    proxies = get_proxy('neimenggu')
    if proxies:
        print proxies
        self.requests.proxies = proxies
def __init__(self, json_restore_path=None): """ 初始化函数 Args: json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时, 需要在写入文件的时候加锁 Returns: """ super(ChongqingCrawler, self).__init__() self.json_restore_path = json_restore_path #html数据的存储路径 self.html_restore_path = os.path.join(self.json_restore_path, "chongqing") #验证码图片的存储路径 self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg') self.code_cracker = CaptchaRecognition("chongqing") self.parser = ChongqingParser(self) self.credit_ticket = None self.ent_number = None self.ents = {} # GET self.ckcode = None self.json_ent_info = None self.json_sfxzgdbg = None self.json_sfxz = None self.json_other_qlicinfo = None self.json_other_qpeninfo = None self.json_year_report = None self.json_year_report_detail = None self.json_year_daily_transinfo = None self.json_year_daily_invsub = None self.json_year_daily_peninfo = None self.json_year_daily_licinfo = None self.json_year_daily_pleinfo = None
def __init__(self, *args, **kwargs): """江苏工商公示信息网页爬虫初始化函数 Args: json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时, 需要在写入文件的时候加锁 Returns: """ self.ent_number = None #html数据的存储路径 self.html_restore_path = settings.json_restore_path + '/jiangsu/' # 验证码图片的存储路径 self.ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg' if not os.path.exists(os.path.dirname(self.ckcode_image_path)): os.makedirs(os.path.dirname(self.ckcode_image_path)) self.code_cracker = CaptchaRecognition('jiangsu') #多线程爬取时往最后的json文件中写时的加锁保护 self.write_file_mutex = threading.Lock() self.urls = { 'host': 'www.jsgsj.gov.cn', 'official_site': 'http://www.jsgsj.gov.cn:58888/province/', 'get_checkcode': 'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7', 'post_checkcode': 'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true', 'ind_comm_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp', 'ent_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp', 'other_dept_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp', 'judical_assist_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp', 'annual_report_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp', 'ci_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true', 'common_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true', 'nb_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true', 'ci_detail': 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true' } self.result_json = {} self.result_json_list = []
def __init__(self, *args, **kwargs):
    # Path where the captcha image is stored
    self.ckcode_image_path = settings.json_restore_path + '/zongju/ckcode.jpg'
    self.code_cracker = CaptchaRecognition('zongju')
    if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
        os.makedirs(os.path.dirname(self.ckcode_image_path))
    # Lock protecting writes to the final json file during multi-threaded crawling
    self.write_file_mutex = threading.Lock()
    self.timeout = 40
    self.urls = {
        'host': 'http://qyxy.baic.gov.cn',
        'official_site': 'http://gsxt.saic.gov.cn/zjgs/',
        'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=',
        'post_checkcode': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',
        # 'get_info_entry': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',  # enterprise entry point
        # Url of the enterprise info page; different tabs (tab=1-4) select different
        # content (industrial & commercial publicity, enterprise publicity, ...)
        'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?',
        'open_detail_info_entry': ''
    }
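# The Fujian and Hunan subclasses above show the intended extension pattern
# for ZongjuCrawler: override the class-level paths, captcha model and url
# table, then attach a province-specific parser. A sketch for a hypothetical
# new province; XxCrawler, XxParser and the 'xx' keys are placeholder names.
class XxCrawler(ZongjuCrawler):
    html_restore_path = settings.json_restore_path + '/xx/'
    ckcode_image_path = settings.json_restore_path + '/xx/ckcode.jpg'
    code_cracker = CaptchaRecognition('xx')
    write_file_mutex = threading.Lock()
    urls = {
        # same keys as the parent's table, pointed at the new province's site
        'host': 'http://example.invalid/',
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = XxParser(self)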
class ChongqingCrawler(Crawler):
    """Chongqing industrial & commercial publicity crawler, built on the Crawler base class
    """
    urls = {
        'host': 'http://gsxt.cqgs.gov.cn',
        'get_checkcode': 'http://gsxt.cqgs.gov.cn/sc.action?width=130&height=40',
        # Fetches the query page
        'repost_checkcode': 'http://gsxt.cqgs.gov.cn/search_research.action',
        # Fetches the specified company's data via the query page
        'post_checkcode': 'http://gsxt.cqgs.gov.cn/search.action',
        'search_ent': 'http://gsxt.cqgs.gov.cn/search_getEnt.action',
        # Annual report
        'year_report': 'http://gsxt.cqgs.gov.cn/search_getYearReport.action',
        # Annual report detail
        'year_report_detail': 'http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action',
        # Equity change
        'year_daily_transinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # Shareholder capital contribution info
        'year_daily_invsub': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # Administrative penalty
        'year_daily_peninfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # Administrative license
        'year_daily_licinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # Intellectual property pledge registration
        'year_daily_pleinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # Other administrative license info
        'other_qlicinfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # Other administrative penalty
        'other_qpeninfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # Equity freeze info
        'sfxz_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZ.action',
        # Shareholder modification info
        'sfxzgdbg_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZGDBG.action',
    }
    # Lock protecting writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        """Constructor.
        Args:
            json_restore_path: storage path for the json file. All Chongqing
                enterprises should be written into the same file, so the same
                path is used across crawling threads, and writes to the file
                are protected by a lock.
        Returns:
        """
        # The requests session (self.reqst) is presumably set up by the
        # Crawler base class initializer invoked here.
        super(ChongqingCrawler, self).__init__()
        self.json_restore_path = json_restore_path
        # Path where the html data is stored
        self.html_restore_path = os.path.join(self.json_restore_path, "chongqing")
        # Path where the captcha image is stored
        self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        self.ent_number = None
        self.ents = {}
        # GET state
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None

    def run(self, ent_number):
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        self.ent_number = str(ent_number)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        self.crawl_check_page()
        if not self.ents:
            return json.dumps([{self.ent_number: None}])
        data = self.crawl_main_page()
        return json.dumps(data)

    def analyze_showInfo(self, page):
        # Parse the query result page and collect the POST values needed for
        # the enterprise info pages
        soup = BeautifulSoup(page, "html5lib")
        result = soup.find('div', {'id': 'result'})
        if result is None:
            return None
        items = result.find_all('div', {'class': 'item'})
        if items:
            count = 0
            Ent = {}
            for item in items:
                count += 1
                key_map = {}
                link = item.find('a')
                entId = link.get('data-entid')
                types = link.get('data-type')
                ids = link.get('data-id')
                name = link.get_text().strip()
                key_map['entId'] = entId
                key_map['type'] = types
                key_map['id'] = ids
                key_map['name'] = name
                profile = item.find('span', attrs={
                    'class': 'value'
                }).get_text().strip()
                if name == self.ent_number:
                    Ent.clear()
                    Ent[profile] = key_map
                    break
                if key_map is not None:
                    Ent[profile] = key_map
                if count == 3:
                    break
            self.ents = Ent
            return True
        return False

    def crawl_check_page(self):
        """Crawl the captcha page: download the captcha image and crack it.
        :return true or false
        """
        count = 0
        while count < 30:
            count += 1
            ck_code = self.crack_check_code()
            data = {'key': self.ent_number, 'code': ck_code}
            resp = self.reqst.post(ChongqingCrawler.urls['post_checkcode'],
                                   data=data)
            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                continue
            if self.analyze_showInfo(resp.content):
                return True
            time.sleep(random.uniform(1, 3))
        return False

    def crack_check_code(self):
        """Crack the captcha.
        :return the recognized captcha text
        """
        resp = self.reqst.get(ChongqingCrawler.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None
        time.sleep(random.uniform(0.1, 0.2))
        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occured when crack checkcode')
            ckcode = ('', '')
        self.write_file_mutex.release()
        return ckcode[1]

    def crawl_main_page(self):
        """Fetch the json data of every section"""
        sub_json_list = []
        for ent, data in self.ents.items():
            self.json_dict = {}
            try:
                if data is not None:
                    self.json_ent_info = None
                    self.json_sfxzgdbg = None
                    self.json_sfxz = None
                    self.json_other_qlicinfo = None
                    self.json_other_qpeninfo = None
                    self.json_year_report = None
                    self.json_year_report_detail = []
                    self.json_year_daily_transinfo = None
                    self.json_year_daily_invsub = None
                    self.json_year_daily_peninfo = None
                    self.json_year_daily_licinfo = None
                    self.json_year_daily_pleinfo = None
                    self.crawl_ent_info_json(data)
                    self.crawl_year_report_json(data)
                    self.crawl_year_report_detail_json(data)
                    time.sleep(0.1)
                    self.crawl_sfxzgdbg_json(data)
                    time.sleep(0.1)
                    self.crawl_sfxz_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_invsub_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_licinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_peninfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_transinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_pleinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_other_qpeninfo_json(data)
                    time.sleep(0.1)
                    self.crawl_other_qlicinfo_json(data)
                else:
                    continue
                self.parser.parse_jsons()
                self.parser.merge_jsons()
            except Exception as e:
                logging.error('%s .' % (traceback.format_exc(10)))
            sub_json_list.append({ent: self.json_dict})
        return sub_json_list

    def crawl_ent_info_json(self, data, type=1):
        """Enterprise detail info"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'type': type
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['search_ent'],
                                   params=params)
        if json_data.status_code == 200:
            json_data = json_data.content
            json_data = str(json_data)
            # Strip the first six characters so the rest is well-formed json
            self.json_ent_info = json_data[6:]
        if self.json_ent_info is None or 'base' not in self.json_ent_info:
            # Some companies require type=10 instead
            self.crawl_ent_info_json(data, type=10)
        # print(self.json_ent_info)

    def crawl_year_report_json(self, data):
        """Annual report data"""
        params = {'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_report'],
                                   params=params)
        while json_data.status_code != 200:
            json_data = self.reqst.get(ChongqingCrawler.urls['year_report'],
                                       params=params)
        json_data = json_data.content
        json_data = str(json_data)
        # Strip the first six characters so the rest is well-formed json
        self.json_year_report = json_data[6:]
        # print(self.json_year_report)

    def crawl_year_report_detail_json(self, data):
        """Annual report detail"""
        # TODO: needs the year values contained in year_report
        while self.json_year_report is None:
            self.crawl_year_report_json(data)
        year_report = json.loads(self.json_year_report, encoding='utf-8')
        histories = year_report.get('history')
        for i in range(len(histories)):
            sub_json_dict = {}
            sub_json_dict.update(histories[i])
            year = histories[i].get('year')
            params = {'id': data.get('id'), 'type': 1, 'year': str(year)}
            json_data = self.reqst.get(
                ChongqingCrawler.urls['year_report_detail'], params=params)
            if json_data.status_code == 200:
                # This endpoint's response body is already plain json
                sub_json_dict['detail'] = json.loads(str(json_data.content))
                self.json_year_report_detail.append(sub_json_dict)
        # print(self.json_year_report_detail)

    def crawl_year_daily_transinfo_json(self, data):
        """Equity change"""
        params = {'id': data.get('id'), 'jtype': 'transinfo'}
        json_data = self.reqst.get(
            ChongqingCrawler.urls['year_daily_transinfo'], params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_transinfo = json_data[6:]
        # print(self.json_year_daily_transinfo)

    def crawl_year_daily_pleinfo_json(self, data):
        """Intellectual property pledge registration (the original docstring
        said "administrative license", which belongs to licinfo)"""
        params = {'id': data.get('id'), 'jtype': 'pleinfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_pleinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_pleinfo = json_data[6:]
        # print(self.json_year_daily_pleinfo)

    def crawl_year_daily_invsub_json(self, data):
        """Shareholder capital contribution info"""
        params = {'id': data.get('id'), 'jtype': 'invsub'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_invsub'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_invsub = json_data[6:]
        # print(self.json_year_daily_invsub)

    def crawl_year_daily_licinfo_json(self, data):
        """Administrative license"""
        params = {'id': data.get('id'), 'jtype': 'licinfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_licinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_licinfo = json_data[6:]
        # print(self.json_year_daily_licinfo)

    def crawl_year_daily_peninfo_json(self, data):
        """Administrative penalty"""
        params = {'id': data.get('id'), 'jtype': 'peninfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_peninfo'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_peninfo = json_data[6:]
        # print(self.json_year_daily_peninfo)

    def crawl_sfxzgdbg_json(self, data):
        """Shareholder modification info"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['sfxzgdbg_page'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxzgdbg = json_data[6:]
        # print(self.json_sfxzgdbg)

    def crawl_sfxz_json(self, data):
        """Equity freeze info"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['sfxz_page'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxz = json_data[6:]
        # print(self.json_sfxz)

    def crawl_other_qlicinfo_json(self, data):
        """Other administrative license info (the original docstring was a
        copy-paste of "shareholder capital contribution info")"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'qtype': 'Qlicinfo',
            'type': 1
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['other_qlicinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qlicinfo = json_data[6:]
        # print(self.json_other_qlicinfo)

    def crawl_other_qpeninfo_json(self, data):
        """Other administrative penalty info (the original docstring was a
        copy-paste of "shareholder capital contribution info")"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'qtype': 'Qpeninfo',
            'type': 1
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['other_qpeninfo'],
                                   params=params)
        if json_data.status_code == 200:
            # This endpoint's response is the json payload
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qpeninfo = json_data[6:]
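# End-to-end usage sketch for ChongqingCrawler: run() drives the captcha
# loop, then crawl_main_page() aggregates every section into one json list.
# The restore path is a placeholder; the number is the one from the unit
# test above.
if __name__ == '__main__':
    crawler = ChongqingCrawler(json_restore_path='./data')
    print crawler.run('500232000003942')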
class GuizhouCrawler(object):
    """Guizhou province crawler, crawled standalone
    """
    # Lock protecting writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.cur_time = str(int(time.time() * 1000))
        self.nbxh = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        # Path where the html data is stored
        self.html_restore_path = self.json_restore_path + '/guizhou/'
        # Path where the captcha image is stored
        self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guizhou')
        self.result_json_dict = {}
        self.reqst.headers.update({
            'Connection': "keep-alive",
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': get_user_agent()
        })
        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.gzgs.gov.cn/',
            'searchList': 'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
            'validateCode': 'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
        }
        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }
        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }

    def get_check_num(self):
        # print self.mydict['search']
        resp = None
        search_count = 0
        while search_count < 5:
            try:
                resp = self.reqst.get(self.mydict['search'])
            except:
                search_count += 1
                continue
            if resp.status_code == 200:
                break
            else:
                search_count += 1
                continue
        # Guard against resp staying None when every attempt raised
        if resp is None or resp.status_code != 200:
            return None
        validate_count = 0
        while validate_count < 5:
            try:
                resp = self.reqst.get(self.mydict['validateCode'] + self.cur_time)
            except:
                validate_count += 1
                continue
            if resp.status_code == 200:
                break
            else:
                validate_count += 1
                continue
        if resp is None or resp.status_code != 200:
            # print 'no validateCode'
            return None
        # print self.ckcode_image_path
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        ck_code = self.code_cracker.predict_result(self.ckcode_image_path)
        # return ck_code[1]
        if ck_code is not None:
            return ck_code[1]
        else:
            return None

    def send_post_for_enter(self, host, nbxh, c, t, lsh):
        # Falls through to an implicit None after 10 failed attempts
        count = 0
        while count < 10:
            data = {'nbxh': nbxh, 'c': c, 't': t, 'lsh': lsh}
            try:
                resp = self.reqst.post(host, data=data)
            except:
                count += 1
                continue
            if resp.status_code == 200:
                return resp.content
            else:
                count += 1
                continue

    def get_dict_enter(self, allths, alltds, alltds_keys):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            return []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    if item[key] is False or item[key] == '' or item[key] == None:
                        temp_alltds.append(item[key])
                    else:
                        temp_alltds.append(str(item[key]).strip())
            return self.get_one_to_one_dict(allths, temp_alltds)

    def help_get_dict_form_enter(self, lsh):
        needdict = {}
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml',
            self.nbxh, '0', '14', lsh)
        # print result_dict
        value = self.get_dict_enter(
            allths=[u'注册号/统一社会信用代码', u'企业名称', u'企业联系电话', u'邮政编码',
                    u'企业通信地址', u'企业电子邮箱', u'有限责任公司本年度是否发生股东股权转让',
                    u'企业经营状态', u'是否有网站或网店', u'是否有投资信息或购买其他公司股权',
                    u'从业人数'],
            alltds=result_dict,
            alltds_keys=[u'zch', u'qymc', u'lxdh', u'yzbm', u'dz', u'dzyx',
                         u'sfzr', u'jyzt', u'sfww', u'sfdw', u'cyrs'],
        )
        needdict[u'企业基本信息'] = value[0]
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml',
            self.nbxh, '0', '15', lsh)
        value = self.get_dict_enter(
            allths=[u'类型', u'名称', u'网址'],
            alltds=result_dict,
            alltds_keys=[],
        )
        needdict[u'网站或网店信息'] = value
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml',
            self.nbxh, '0', '19', lsh)
        value = self.get_dict_enter(
            allths=[u'注册号/股东', u'认缴出资额(万元)', u'认缴出资时间', u'认缴出资方式',
                    u'实缴出资额(万元)', u'出资时间', u'出资方式'],
            alltds=result_dict,
            alltds_keys=[u'tzr', u'rjcze', u'rjczrq', u'rjczfs', u'sjcze',
                         u'sjczrq', u'sjczfs'])
        needdict[u'股东及出资信息'] = value
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml',
            self.nbxh, '0', '16', lsh)
        # Note: u'lrze' appears twice in alltds_keys (for both 利润总额 and 净利润)
        value = self.get_dict_enter(
            allths=[u'资产总额', u'所有者权益合计', u'销售总额', u'利润总额',
                    u'销售总额中主营业务收入', u'净利润', u'纳税总额', u'负债总额'],
            alltds=result_dict,
            alltds_keys=[u'zcze', u'qyhj', u'xsze', u'lrze', u'zysr', u'lrze',
                         u'nsze', u'fzze'])
        needdict[u'企业资产状况信息'] = value[0]
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml',
            self.nbxh, '0', '41', lsh)
        value = self.get_dict_enter(
            allths=[u'序号', u'修改事项', u'修改前', u'修改后', u'修改日期'],
            alltds=result_dict,
            alltds_keys=[u'rownum', u'bgsxmc', u'bgq', u'bgh', u'bgrq'])
        needdict[u'修改记录'] = value
        needdict[u'对外投资信息'] = []
        needdict[u'对外提供保证担保信息'] = []
        needdict[u'股权变更信息'] = []
        return needdict

    def get_id_num(self, findCode):
        count = 0
        while count < 20:
            count += 1
            # print self.cur_time
            yzm = self.get_check_num()
            data = {'q': findCode, 'validCode': yzm}
            resp = self.reqst.post(self.mydict['searchList'], data=data)
            if resp.status_code != 200:
                logging.error("status code of searchList page is not 200.")
                time.sleep(random.uniform(1, 3))
                continue
            result_dict = json.loads(resp.content)
            # print result_dict
            if result_dict['successed'] == True:
                try:
                    datas = result_dict.get('data')
                    if not datas:
                        return False
                    # Keep only the first three results
                    self.ents = datas[0:3]
                    return True
                except:
                    return False
            else:
                logging.error('The count is %d.' % (count))
        return False

    def get_one_to_one_dict(self, allths, alltds):
        # if len(allths) == len(alltds):
        #     one_to_one_dict = {}
        #     for key, value in zip(allths, alltds):
        #         one_to_one_dict[key] = value
        #     return one_to_one_dict
        # else:
        templist = []
        x = 0
        y = x + len(allths)
        while y <= len(alltds):
            tempdict = {}
            for keys, values in zip(allths, alltds[x:y]):
                tempdict[keys] = values
            x = y
            y = x + len(allths)
            templist.append(tempdict)
        return templist

    def test_print_table(self, tables):
        for table in tables:
            print table

    def test_print_all_ths_tds(self, head, allths, alltds):
        print '--------------', head, '--------------'
        for th in allths:
            print th
        for td in alltds:
            print td

    def test_print_all_dict(self, mydict):
        for key, value in mydict.items():
            print key, ':', value

    def get_json_one(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
                if head == 'ind_comm_pub_reg_shareholder':
                    # Placeholder for the trailing 详情 column
                    temp_alltds.append(None)
            if head == u'ind_comm_pub_reg_basic' or head == u'ind_comm_pub_arch_liquidation':
                self.result_json_dict[head] = self.get_one_to_one_dict(
                    allths, temp_alltds)[0]
            else:
                self.result_json_dict[head] = self.get_one_to_one_dict(
                    allths, temp_alltds)

    def get_json_two(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    if head == u'ent_pub_ent_annual_report' and key == 'lsh':
                        if item[key] is False or item[key] == '' or item[key] == None:
                            temp_alltds.append(None)
                        else:
                            temp_alltds.append(
                                self.help_get_dict_form_enter(item[key]))
                    elif head == u'ent_pub_administration_license' and key == 'lsh':
                        if item[key] is False or item[key] == '' or item[key] == None:
                            temp_alltds.append(None)
                        else:
                            temp_alltds.append([])
                    else:
                        temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)
        # if not alltds

    def get_json_three(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)

    def get_json_four(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)

    def send_post(self, host, nbxh, c, t):
        # Falls through to an implicit None after 10 failed attempts
        count = 0
        while count < 10:
            data = {'nbxh': nbxh, 'c': c, 't': t}
            try:
                resp = self.reqst.post(host, data=data)
            except:
                count += 1
                continue
            if resp.status_code == 200:
                return resp.content
            else:
                count += 1
                continue

    def run(self, findCode):
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        self.ent_number = str(findCode)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        result = self.get_id_num(self.ent_number)
        if not result:
            return json.dumps([{self.ent_number: None}])
        json_list = []
        for item in self.ents:
            self.nbxh = item.get('nbxh')
            zch = item.get('zch')
            self.result_json_dict = {}
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '5')
            # print result_dict
            self.get_json_one(
                allths=[u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人',
                        u'注册资本', u'成立日期', u'住所', u'营业期限自', u'营业期限至',
                        u'经营范围', u'登记机关', u'核准日期', u'登记状态'],
                alltds=result_dict,
                alltds_keys=[u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb',
                             u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw',
                             u'djjgmc', u'hzrq', u'mclxmc'],
                head='ind_comm_pub_reg_basic')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '3')
            # print result_dict
            self.get_json_one(
                allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                alltds=result_dict,
                alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                head='ind_comm_pub_reg_modify')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '2', '3')
            # print result_dict
            self.get_json_one(
                allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
                alltds=result_dict,
                alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
                head='ind_comm_pub_reg_shareholder')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '8')
            # print result_dict
            self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                              alltds=result_dict,
                              alltds_keys=[u'rownum', u'xm', u'zwmc'],
                              head='ind_comm_pub_arch_key_persons')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '36')
            # print result_dict
            self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_arch_liquidation')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '9')
            # print result_dict
            self.get_json_one(
                allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
                alltds=result_dict,
                alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
                head='ind_comm_pub_arch_branch')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '25')
            # print result_dict
            self.get_json_one(
                allths=[u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额',
                        u'状态', u'公示日期', u'详情'],
                alltds=result_dict,
                alltds_keys=[],
                head='ind_comm_pub_movable_property_reg')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '4')
            # print result_dict
            self.get_json_one(
                allths=[u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额',
                        u'质权人', u'证照/证件号码', u'股权出质设立登记日期', u'状态',
                        u'公示日期', u'变化情况'],
                alltds=result_dict,
                alltds_keys=[],
                head='ind_comm_pub_equity_ownership_reg')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '1')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_administration_sanction')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '33')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_business_exception')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '34')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_serious_violate_law')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '35')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_spot_check')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '13')
            # print result_dict
            self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                              alltds=result_dict,
                              alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                              head='ent_pub_ent_annual_report')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '40')
            # print result_dict
            self.get_json_two(
                allths=[u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式',
                        u'认缴出资额(万元)', u'认缴出资日期', u'认缴公示日期', u'实缴出资方式',
                        u'实缴出资额(万元)', u'实缴出资日期', u'实缴公示日期'],
                alltds=result_dict,
                alltds_keys=[u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze',
                             u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze',
                             u'sjczrq', u'sjgsrq'],
                head='ent_pub_shareholder_capital_contribution')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '23')
            # print result_dict
            self.get_json_two(
                allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例',
                        u'股权变更日期', u'公示日期'],
                alltds=result_dict,
                alltds_keys=[],
                head='ent_pub_equity_change')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '20')
            # print result_dict
            self.get_json_two(
                allths=[u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至',
                        u'许可机关', u'许可内容', u'状态', u'公示日期', u'详情'],
                alltds=result_dict,
                alltds_keys=[u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                             u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq',
                             u'lsh'],
                head='ent_pub_administration_license')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '21')
            # print result_dict
            self.get_json_two(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ent_pub_knowledge_property')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '22')
            # print result_dict
            self.get_json_two(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ent_pub_shareholder_modify')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml',
                self.nbxh, '0', '37')
            # print result_dict
            # Note: u'zt' appears twice in alltds_keys here, as in the source
            self.get_json_three(
                allths=[u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至',
                        u'有效期', u'许可机关', u'许可内容', u'状态', u'详情'],
                alltds=result_dict,
                alltds_keys=[u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1', u'yxq2',
                             u'yxq', u'xkjg', u'xknr', u'zt', u'zt'],
                head='other_dept_pub_administration_license')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml',
                self.nbxh, '0', '38')
            # print result_dict
            self.get_json_two(
                allths=[u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容',
                        u'作出行政处罚决定机关名称', u'作出行政处罚决定日期'],
                alltds=result_dict,
                alltds_keys=[],
                head='other_dept_pub_administration_sanction')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '49')
            # print result_dict
            self.get_json_four(
                allths=[u'序号', u'被执行人', u'股权数额', u'执行法院',
                        u'协助公示通知书文号', u'状态', u'详情'],
                alltds=result_dict,
                alltds_keys=[],
                head='judical_assist_pub_equity_freeze')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '53')
            # print result_dict
            self.get_json_four(
                allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
                alltds=result_dict,
                alltds_keys=[],
                head='judical_assist_pub_shareholder_modify')
            json_list.append({zch: self.result_json_dict})
        return json.dumps(json_list)
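# get_one_to_one_dict() is the reshaping step shared by all get_json_*
# methods above: it slices the flat cell list into consecutive windows of
# len(allths) values and zips each window with the headers, one dict per
# table row. A standalone replica of that logic with made-up data:
def _demo_one_to_one_dict():
    allths = [u'序号', u'姓名', u'职务']
    alltds = ['1', u'张三', u'经理', '2', u'李四', u'监事']
    rows = []
    for x in range(0, len(alltds), len(allths)):
        # each window of three cells becomes one row dict
        rows.append(dict(zip(allths, alltds[x:x + len(allths)])))
    return rows  # -> two dicts, one per source table row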
def __init__(self, json_restore_path=None):
    self.pripid = None
    self.cur_time = str(int(time.time() * 1000))
    self.reqst = requests.Session()
    self.reqst.headers.update(headers)
    adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                            pool_maxsize=100)
    self.reqst.mount('http://', adapter)
    self.json_restore_path = json_restore_path
    self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg'
    # Path where the html data is stored
    self.html_restore_path = self.json_restore_path + '/sichuan/'
    self.code_cracker = CaptchaRecognition('sichuan')
    self.result_json_dict = {}
    self.json_list = []
    # Note: draws from the 'shaanxi' proxy pool rather than a Sichuan one
    proxies = get_proxy('shaanxi')
    if proxies:
        print proxies
        self.reqst.proxies = proxies
    self.timeout = (30, 20)
    self.ents = {}
    self.mydict = {
        'eareName': 'http://www.ahcredit.gov.cn',
        'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
        'searchList': 'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
        'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm'
    }
    self.one_dict = {
        u'基本信息': 'ind_comm_pub_reg_basic',
        u'股东信息': 'ind_comm_pub_reg_shareholder',
        u'发起人信息': 'ind_comm_pub_reg_shareholder',
        u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
        u'变更信息': 'ind_comm_pub_reg_modify',
        u'主要人员信息': 'ind_comm_pub_arch_key_persons',
        u'分支机构信息': 'ind_comm_pub_arch_branch',
        u'清算信息': 'ind_comm_pub_arch_liquidation',
        u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
        u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'行政处罚信息': 'ind_comm_pub_administration_sanction',
        u'经营异常信息': 'ind_comm_pub_business_exception',
        u'严重违法信息': 'ind_comm_pub_serious_violate_law',
        u'抽查检查信息': 'ind_comm_pub_spot_check'
    }
    self.two_dict = {
        u'企业年报': 'ent_pub_ent_annual_report',
        u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
        u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股权变更信息': 'ent_pub_equity_change',
        u'行政许可信息': 'ent_pub_administration_license',
        u'知识产权出资登记': 'ent_pub_knowledge_property',
        u'知识产权出质登记信息': 'ent_pub_knowledge_property',
        u'行政处罚信息': 'ent_pub_administration_sanction',
        u'变更信息': 'ent_pub_shareholder_modify'
    }
    self.three_dict = {
        u'行政许可信息': 'other_dept_pub_administration_license',
        u'行政处罚信息': 'other_dept_pub_administration_sanction'
    }
    self.four_dict = {
        u'股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'股东变更信息': 'judical_assist_pub_shareholder_modify',
        u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
    }
class YunnanCrawler(object): ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg' def __init__(self, json_restore_path): self.id = None self.reqst = requests.Session() self.json_restore_path = json_restore_path self.ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg' if not os.path.exists(os.path.dirname(self.ckcode_image_path)): os.makedirs(os.path.dirname(self.ckcode_image_path)) self.result_json_dict = {} self.code_cracker = CaptchaRecognition('yunnan') self.reqst.headers.update({ 'Accept': 'text/html, application/xhtml+xml, */*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0' }) useproxy = UseProxy() is_use_proxy = useproxy.get_province_is_use_proxy(province='guangxi') if not is_use_proxy: self.proxies = [] else: proxy = Proxy() self.proxies = { 'http': 'http://' + random.choice(proxy.get_proxy(num=5, province='guangxi')), 'https': 'https://' + random.choice(proxy.get_proxy(num=5, province='guangxi')) } print 'self.proxies:', self.proxies # self.proxies = [] self.mydict = { 'eareName': 'http://www.ahcredit.gov.cn', 'search': 'http://gsxt.ynaic.gov.cn/notice/', 'searchList': 'http://gsxt.ynaic.gov.cn/notice/search/ent_info_list', 'validateCode': 'http://gsxt.ynaic.gov.cn/notice/captcha?preset=&ra=0.06570781518790503' } self.one_dict = { u'基本信息': 'ind_comm_pub_reg_basic', u'股东信息': 'ind_comm_pub_reg_shareholder', u'发起人信息': 'ind_comm_pub_reg_shareholder', u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder', u'合伙人信息': 'ind_comm_pub_reg_shareholder', u'变更信息': 'ind_comm_pub_reg_modify', u'主要人员信息': 'ind_comm_pub_arch_key_persons', u'分支机构信息': 'ind_comm_pub_arch_branch', u'清算信息': 'ind_comm_pub_arch_liquidation', u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg', u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg', u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg', u'行政处罚信息': 'ind_comm_pub_administration_sanction', u'经营异常信息': 'ind_comm_pub_business_exception', u'严重违法信息': 'ind_comm_pub_serious_violate_law', u'抽查检查信息': 'ind_comm_pub_spot_check' } self.two_dict = { u'企业年报': 'ent_pub_ent_annual_report', u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution', u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution', u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution', u'股权变更信息': 'ent_pub_equity_change', u'行政许可信息': 'ent_pub_administration_license', u'知识产权出资登记': 'ent_pub_knowledge_property', u'知识产权出质登记信息': 'ent_pub_knowledge_property', u'行政处罚信息': 'ent_pub_administration_sanction', u'变更信息': 'ent_pub_shareholder_modify' } self.three_dict = { u'行政许可信息': 'other_dept_pub_administration_license', u'行政处罚信息': 'other_dept_pub_administration_sanction' } self.four_dict = { u'股权冻结信息': 'judical_assist_pub_equity_freeze', u'司法股权冻结信息': 'judical_assist_pub_equity_freeze', u'股东变更信息': 'judical_assist_pub_shareholder_modify', u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify' } self.result_json_dict = {} def get_check_num(self): resp = self.reqst.get(self.mydict['search'], proxies=self.proxies, timeout=120) if resp.status_code != 200: return None first = resp.content.find('session.token":') session_token = resp.content[first + 17:first + 53] resp = self.reqst.get(self.mydict['validateCode'], proxies=self.proxies, timeout=120) if resp.status_code != 200: # print 'no validateCode' return None with open(self.ckcode_image_path, 'wb') as f: f.write(resp.content) ck_code = self.code_cracker.predict_result(self.ckcode_image_path) if 
ck_code is None: return None, None else: return ck_code[1], session_token def get_id_num(self, findCode): count = 0 while count < 20: check_num, session_token = self.get_check_num() # print check_num if check_num is None: count += 1 continue data = { 'searchType': '1', 'captcha': check_num, "session.token": session_token, 'condition.keyword': findCode } resp = self.reqst.post(self.mydict['searchList'], data=data, proxies=self.proxies, timeout=120) if resp.status_code != 200: # print resp.status_code # print 'error...(get_id_num)' count += 1 continue else: try: soup = BeautifulSoup(resp.content, 'html.parser').find_all( 'div', attrs={'class': 'link'})[0] hrefa = soup.find('a', attrs={'target': '_blank'}) if hrefa: self.after_crack_checkcode_page = resp.content return True # return hrefa['href'].split('&')[0] else: count += 1 continue except: return None def get_re_list_from_content(self, content): m = re.search(r'investor\.invName = \"(.+)\"', content) one = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invt\.subConAm = \"(.+)\"', content) five = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invt\.conDate = [\"|\'](.+)[\"|\']', content) six = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invt\.conForm = [\"|\'](.+)[\"|\']', content) four = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invtActl\.acConAm = [\"|\'](.+)[\"|\']', content) eight = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invtActl\.conDate = [\"|\'](.+)[\"|\']', content) nigh = unicode(m.group(1), 'utf8') if m else None m = re.search(r'invtActl\.conForm = [\"|\'](.+)[\"|\']', content) seven = unicode(m.group(1), 'utf8') if m else None return [one, five, eight, four, five, six, seven, eight, nigh] pass def get_tables(self, url): resp = self.reqst.get(url, proxies=self.proxies, timeout=120) if resp.status_code == 200: return BeautifulSoup(resp.content, 'html.parser').find_all('table') #return [table for table in tables] #if (table.find_all('th') or table.find_all('a')) ] def get_head_ths_tds(self, table): head = table.find_all('th')[0].get_text().strip().split( '\n')[0].strip() allths = [ th.get_text().strip() for th in table.find_all('th')[1:] if th.get_text() ] if head == u'股东信息' or head == u'发起人信息' or head == u'股东(发起人)信息' or head == u'行政许可信息' or head == u'股权出质登记信息': tdlist = [] for td in table.find_all('td'): if td.find_all('a'): tddict = {} detail_head, detail_allths, detail_alltds = self.get_head_ths_tds( self.get_tables(td.a['href'])[0]) if detail_head == u'股东及出资信息': detail_content = self.reqst.get(td.a['href'], proxies=self.proxies, timeout=120).content detail_alltds = self.get_re_list_from_content( detail_content) # print '---------------------------', len(detail_allths[:3]+detail_allths[5:]), len(detail_alltds) # tddict = self.get_one_to_one_dict(detail_allths[:3]+detail_allths[5:], detail_alltds) detail_allths = detail_allths[:3] + detail_allths[5:] # self.test_print_all_ths_tds(detail_head, detail_allths, detail_alltds) son_need_dict = {} for key, value in zip(detail_allths[3:], detail_alltds[3:]): son_need_dict[key] = value need_dict = {} for key, value in zip(detail_allths[:3], detail_alltds[:3]): need_dict[key] = value need_dict['list'] = [son_need_dict] tdlist.append({detail_head: [need_dict]}) # tdlist.append(tddict) else: tddict = self.get_one_to_one_dict( detail_allths, detail_alltds) tdlist.append(tddict) elif td.get_text(): tdlist.append(td.get_text().strip()) else: tdlist.append(None) return head, allths, tdlist pass # elif head == 
u'股东及出资信息(币种与注册资本一致)' or head == u'股东及出资信息': # pass elif head == u'企业年报': tdlist = [] for td in table.find_all('td'): if td.find_all('a'): tddict = {} for i, table in enumerate(self.get_tables(td.a['href'])): enter_head, enter_allths, enter_alltds = self.get_head_ths_tds( table) #print enter_head if i == 0: enter_head = enter_allths[0] enter_allths = enter_allths[1:] #self.test_print_all_ths_tds(enter_head, enter_allths, enter_alltds) tddict[enter_head] = self.get_one_to_one_dict( enter_allths, enter_alltds) if enter_head == u'企业基本信息' or enter_head == u'企业资产状况信息': tddict[enter_head] = self.get_one_to_one_dict( enter_allths, enter_alltds)[0] tdlist.append(td.get_text().strip()) tdlist.append(tddict) elif td.get_text(): tdlist.append(td.get_text().strip()) else: tdlist.append(None) allths.insert(2, u'详情') # self.test_print_all_ths_tds(head, allths, tdlist) return head, allths, tdlist pass else: if table.find_all('td'): alltds = [ td.get_text().strip() if td.get_text() else None for td in table.find_all('td') ] else: alltds = [None for th in allths] # alltds = [] if head == u'主要人员信息': return head, allths[:int(len(allths) / 2)], alltds else: return head, allths, alltds #return (table.find_all('th')[0].get_text().strip().split('\n')[0].strip(), [th.get_text().strip() for th in table.find_all('th')[1:] if th.get_text()], [td.get_text().strip() if td.get_text() else None for td in table.find_all('td')]) def get_one_to_one_dict(self, allths, alltds): if len(allths) == len(alltds): if any(alltds): one_to_one_dict = {} for key, value in zip(allths, alltds): one_to_one_dict[key] = value return [one_to_one_dict] else: return [] else: templist = [] x = 0 y = x + len(allths) #print '---------------------%d-------------------------------%d' % (len(allth), len(alltd)) while y <= len(alltds): tempdict = {} for keys, values in zip(allths, alltds[x:y]): tempdict[keys] = values x = y y = x + len(allths) templist.append(tempdict) return templist def test_print_table(self, tables): for table in tables: print table def test_print_all_ths_tds(self, head, allths, alltds): print '--------------', head, '--------------' for th in allths: print th for td in alltds: print td def test_print_all_dict(self, mydict): for key, value in mydict.items(): print key, ':', value def get_json_one(self, mydict, tables): #self.test_print_table(tables) for table in tables: head, allths, alltds = self.get_head_ths_tds(table) #print head try: self.result_json_dict[mydict[head]] = self.get_one_to_one_dict( allths, alltds) except: pass if head == u'基本信息': self.result_json_dict[mydict[head]] = self.get_one_to_one_dict( allths, alltds)[0] if head == u'清算信息': if allths: self.result_json_dict[ mydict[head]] = self.get_one_to_one_dict( allths, alltds) else: self.result_json_dict[mydict[head]] = [] #self.test_print_all_ths_tds(head, allths, alltds) pass def get_json_two(self, mydict, tables): #self.test_print_table(tables) for table in tables: head, allths, alltds = self.get_head_ths_tds(table) #print head self.result_json_dict[mydict[head]] = self.get_one_to_one_dict( allths, alltds) pass def get_json_three(self, mydict, tables): #self.test_print_table(tables) for table in tables: head, allths, alltds = self.get_head_ths_tds(table) #print head self.result_json_dict[mydict[head]] = self.get_one_to_one_dict( allths, alltds) pass def get_json_four(self, mydict, tables): #self.test_print_table(tables) for table in tables: head, allths, alltds = self.get_head_ths_tds(table) #print head self.result_json_dict[mydict[head]] = 
self.get_one_to_one_dict( allths, alltds) pass def run(self, findCode): self.ent_number = findCode id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \ or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \ or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first() print id_args if id_args and id_args.download_args.get('uuid'): self.result_json_dict = {} self.uuid = id_args.download_args['uuid'] tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( 'yunnan.json', {self.ent_number: self.result_json_dict}) print json.dumps({self.ent_number: self.result_json_dict}) return [{self.ent_number: self.result_json_dict}] else: #创建目录 html_restore_path = self.json_restore_path + '/yunnan/' if not os.path.exists(html_restore_path): os.makedirs(html_restore_path) self.uuid = self.get_id_num(findCode) if self.uuid is None: return json.dumps({self.ent_number: {}}) self.result_json_dict_list = [] for div in BeautifulSoup(self.after_crack_checkcode_page, 'html.parser').find_all( 'div', attrs={'class': 'list-item'}): hrefa = div.find_all('a', attrs={'target': '_blank'})[0] if hrefa: self.uuid = hrefa['href'].split('&')[0] self.enterprise_name = div.find_all( 'div', attrs={'class': 'link'})[0].get_text().strip() self.ent_number = div.find_all( 'span')[0].get_text().strip() args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number)\ or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \ or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first() if args: args.delete() args = CrawlerDownloadArgs( province='yunnan', register_number=self.ent_number, unifield_number=self.ent_number, enterprise_name=self.enterprise_name, download_args={'uuid': self.uuid}) args.save() else: continue print self.uuid self.result_json_dict = {} tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( 'yunnan.json', {self.ent_number: self.result_json_dict}) print json.dumps({self.ent_number: self.result_json_dict}) self.result_json_dict_list.append( {self.ent_number: self.result_json_dict}) return self.result_json_dict_list
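# --- Illustrative sketch (not part of the crawler) ---------------------------
# The Yunnan methods above repeatedly zip a list of <th> headers against a flat
# list of <td> cells, chunking the cells into rows when one table holds several
# records. This standalone restatement of that chunking (the idea behind
# get_one_to_one_dict) runs on made-up data; all names here are invented.
def _rows_from_flat_cells(headers, cells):
    if len(cells) == len(headers):
        # single record: one dict, or nothing when every cell is empty
        return [dict(zip(headers, cells))] if any(cells) else []
    rows = []
    for start in range(0, len(cells) - len(headers) + 1, len(headers)):
        rows.append(dict(zip(headers, cells[start:start + len(headers)])))
    return rows

if __name__ == '__main__':
    _heads = [u'股东', u'出资额', u'出资日期']
    _cells = [u'甲', u'100', u'2015-01-01', u'乙', u'200', u'2015-06-01']
    print(_rows_from_flat_cells(_heads, _cells))  # -> two row dicts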
class JiangsuCrawler(Crawler): """江苏工商公示信息网页爬虫 """ #html数据的存储路径 html_restore_path = settings.json_restore_path + '/jiangsu/' #验证码图片的存储路径 ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg' code_cracker = CaptchaRecognition('jiangsu') #多线程爬取时往最后的json文件中写时的加锁保护 write_file_mutex = threading.Lock() urls = { 'host': 'www.jsgsj.gov.cn', 'official_site': 'http://www.jsgsj.gov.cn:58888/province/', 'get_checkcode': 'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7', 'post_checkcode': 'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true', 'ind_comm_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp', 'ent_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp', 'other_dept_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp', 'judical_assist_pub_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp', 'annual_report_skeleton': 'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp', 'ci_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true', 'common_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true', 'nb_enter': 'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true', 'ci_detail': 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true' } def __init__(self, json_restore_path=None): """ 初始化函数 Args: json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时, 需要在写入文件的时候加锁 Returns: """ self.proxies = Proxies().get_proxies() self.json_restore_path = json_restore_path self.parser = JiangsuParser(self) self.reqst = requests.Session() self.reqst.headers.update({ 'Accept': 'text/html, application/xhtml+xml, */*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0' }) self.corp_org = '' self.corp_id = '' self.corp_seq_id = '' self.common_enter_post_data = {} self.ci_enter_post_data = {} self.nb_enter_post_data = {} self.post_info = { 'ind_comm_pub_reg_basic': { 'url_type': 'ci_enter', 'post_type': 'ci_enter', 'specificQuery': 'basicInfo' }, 'ind_comm_pub_reg_shareholder': { 'url_type': 'ci_enter', 'post_type': 'ci_enter_with_recordline', 'specificQuery': 'investmentInfor' }, 'ind_comm_pub_reg_modify': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'biangeng' }, 'ind_comm_pub_arch_key_persons': { 'url_type': 'ci_enter', 'post_type': 'ci_enter_with_recordline', 'specificQuery': 'personnelInformation' }, 'ind_comm_pub_arch_branch': { 'url_type': 'ci_enter', 'post_type': 'ci_enter_with_recordline', 'specificQuery': 'branchOfficeInfor' }, #'ind_comm_pub_arch_liquadition': {'url_type': 'ci_enter', 'post_type': 'common_enter', 'specificQuery': 'qsfzr'}, 'ind_comm_pub_movable_property_reg': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'dongchan' }, 'ind_comm_pub_equity_ownership_reg': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'guquanchuzhi' }, 'ind_comm_pub_administration_sanction': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'chufa' }, 'ind_comm_pub_business_exception': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'abnormalInfor' }, 
#'ind_comm_pub_serious_violate_law': {'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xxx'}, 'ind_comm_pub_spot_check': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'checkup' }, 'ind_comm_pub_reg_shareholder_detail': { 'url_type': 'ci_detail', 'post_type': 'ci_detail', 'specificQuery': 'investorInfor' }, 'ent_pub_annual_report': { 'url_type': 'nb_enter', 'post_type': 'nb_enter', 'propertiesName': 'query_report_list' }, 'annual_report_detail': { 'url_type': 'nb_enter', 'post_type': 'nb_enter' }, 'ent_pub_shareholder_capital_contribution': { 'url_type': 'nb_enter', 'post_type': 'nb_enter', 'propertiesName': 'query_tzcz' }, 'ent_pub_administrative_license': { 'url_type': 'nb_enter', 'post_type': 'nb_enter', 'propertiesName': 'query_xzxk' }, 'ent_pub_knowledge_property': { 'url_type': 'nb_enter', 'post_type': 'nb_enter', 'propertiesName': 'query_zscq' }, 'ent_pub_administration_sanction': { 'url_type': 'nb_enter', 'post_type': 'nb_enter', 'propertiesName': 'query_xzcf' }, 'other_dept_pub_administration_license': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xingzheng' }, 'other_dept_pub_administration_sanction': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xingzhengchufa' }, 'judical_assist_pub_equity_freeze': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'gqdjList' }, 'judical_assist_pub_shareholder_modify': { 'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'gdbgList' } } def run(self, ent_number=0): if not os.path.exists(self.html_restore_path): os.makedirs(self.html_restore_path) return Crawler.run(self, ent_number) ''' self.ent_number = str(ent_number) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.json_dict = {} if not self.crawl_check_page(): settings.logger.error('crack check code failed, stop to crawl enterprise %s' % self.ent_number) return False self.crawl_ind_comm_pub_pages() self.crawl_ent_pub_pages() self.crawl_other_dept_pub_pages() self.crawl_judical_assist_pub_pub_pages() #采用多线程,在写入文件时需要注意加锁 self.write_file_mutex.acquire() CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.json_dict}) self.write_file_mutex.release() return True ''' def crawl_check_page(self): """爬取验证码页面,包括下载验证码图片以及破解验证码 :return true or false """ resp = self.crawl_page_by_url(self.urls['official_site']) if not resp: logging.error("crawl the first page page failed!\n") return False count = 0 while count < 15: count += 1 ckcode = self.crack_checkcode() if not ckcode[1]: logging.error("crawl checkcode failed! count number = %d\n" % (count)) continue data = {'name': self.ent_number, 'verifyCode': ckcode[1]} resp = self.crawl_page_by_url_post(self.urls['post_checkcode'], data=data) if resp.find("onclick") >= 0 and self.parse_post_check_page(resp): return True else: logging.error( "crawl post check page failed! 
count number = %d\n" % (count)) time.sleep(random.uniform(5, 8)) return False def get_page_data(self, page_name, real_post_data=None): """获取页面数据,通过页面名称,和post_data, 江苏的页面中几乎全部都是post方式来获取数据 """ url = self.urls[self.post_info[page_name].get('url_type')] logging.info('get %s, url:\n%s\n' % (page_name, url)) if real_post_data: return self.get_pages(url, real_post_data) if self.post_info[page_name].get('post_type') == 'ci_enter': self.ci_enter_post_data['specificQuery'] = self.post_info[ page_name].get('specificQuery') post_data = self.ci_enter_post_data elif self.post_info[page_name].get( 'post_type') == 'ci_enter_with_recordline': self.ci_enter_with_record_line_post_data[ 'specificQuery'] = self.post_info[page_name].get( 'specificQuery') post_data = self.ci_enter_with_record_line_post_data elif self.post_info[page_name].get('post_type') == 'common_enter': self.common_enter_post_data['propertiesName'] = self.post_info[ page_name].get('propertiesName') post_data = self.common_enter_post_data elif self.post_info[page_name].get('post_type') == 'ci_detail': self.ci_detail_post_data['specificQuery'] = self.post_info[ page_name].get('specificQuery') post_data = self.ci_detail_post_data elif self.post_info[page_name].get('post_type') == 'nb_enter': self.nb_enter_post_data['propertiesName'] = self.post_info[ page_name].get('propertiesName') post_data = self.nb_enter_post_data return self.get_pages(url, post_data) def crawl_ind_comm_pub_pages(self): """爬取工商公示信息 """ if not self.parser.ind_comm_pub_skeleton_built: page = self.crawl_skeleton_page('ind_comm_pub_skeleton') if not page: logging.error('crawl ind comm pub skeleton failed!') return False self.parser.parse_page('ind_comm_pub_skeleton', page) for item in ( 'ind_comm_pub_reg_basic', # 登记信息-基本信息 'ind_comm_pub_reg_shareholder', # 股东信息 'ind_comm_pub_reg_modify', 'ind_comm_pub_arch_key_persons', # 备案信息-主要人员信息 'ind_comm_pub_arch_branch', # 备案信息-分支机构信息 #'ind_comm_pub_arch_liquidation', # 备案信息-清算信息, 网页中没有 'ind_comm_pub_movable_property_reg', # 动产抵押登记信息 #'ind_comm_pub_equity_ownership_reg', # 股权出置登记信息 'ind_comm_pub_administration_sanction', # 行政处罚信息 #'ind_comm_pub_business_exception', # 经营异常信息 , 网页中不存在 #'ind_comm_pub_serious_violate_law', # 严重违法信息 'ind_comm_pub_spot_check'): # 抽查检查信息 page_data = self.get_page_data(item) self.json_dict[item] = self.parser.parse_page(item, page_data) def crawl_ent_pub_pages(self): """爬取企业公示信息 """ if not self.parser.ent_pub_skeleton_built: page = self.crawl_skeleton_page('ent_pub_skeleton') if not page: logging.error('crawl ent pub skeleton failed!') return False self.parser.parse_page('ent_pub_skeleton', page) if not self.parser.annual_report_skeleton_built: page = self.crawl_skeleton_page('annual_report_skeleton') if not page: logging.error('crawl annual report skeleton failed!') return False self.parser.parse_page('annual_report_skeleton', page) for item in ( 'ent_pub_annual_report', #'ent_pub_shareholder_capital_contribution', #企业投资人出资比例 #'ent_pub_equity_change', #股权变更信息 'ent_pub_administrative_license', #行政许可信息 'ent_pub_knowledge_property', #知识产权出资登记 #'ent_pub_administration_sanction' #行政许可信息 ): page_data = self.get_page_data(item) self.json_dict[item] = self.parser.parse_page(item, page_data) def crawl_other_dept_pub_pages(self): """爬取其他部门公示信息 """ if not self.parser.other_dept_pub_skeleton_built: page = self.crawl_skeleton_page('other_dept_pub_skeleton') if not page: logging.error('crawl other dept pub skeleton failed!') return False self.parser.parse_page('other_dept_pub_skeleton', page) for item in ( 
'other_dept_pub_administration_license', #行政许可信息 'other_dept_pub_administration_sanction' #行政处罚信息 ): page_data = self.get_page_data(item) self.json_dict[item] = self.parser.parse_page(item, page_data) def crawl_judical_assist_pub_pub_pages(self): """爬取司法协助信息 """ if not self.parser.judical_assist_pub_skeleton_built: page = self.crawl_skeleton_page('judical_assist_pub_skeleton') if not page: logging.error('crawl judical assist skeleton failed!') return False self.parser.parse_page('judical_assist_pub_skeleton', page) for item in ( 'judical_assist_pub_equity_freeze', #股权冻结信息 'judical_assist_pub_shareholder_modify' #股东变更信息 ): page_data = self.get_page_data(item) self.json_dict[item] = self.parser.parse_page(item, page_data) def get_pages(self, url, post_data): """获取网页数据 Args: url: url地址 post_data: post方式获取数据,返回的如果是一个列表,则将列表的所有元素都获得才返回 Returns: """ resp = self.crawl_page_by_url_post(url, data=post_data) if not resp: logging.error('get all pages of a section failed!') return else: json_obj = json.loads(resp) if type(json_obj) == dict and json_obj.get( 'total', None) and int(json_obj.get('total')) > 5: post_data['pageSize'] = json_obj.get('total') resp = self.crawl_page_by_url_post(url, data=post_data) if not resp: logging.error('get all pages of a section failed!') return return resp def crawl_skeleton_page(self, name): """爬取网页表格的框架页面,在江苏的网页中, 工商公示信息, 企业公示信息,其他部门公示信息,司法协助信息 所有的tab页面中的表格结构都在一个最开始的页面中给出 """ url = self.urls[name] post_data = { 'org': self.corp_org, 'id': self.corp_id, 'seq_id': self.corp_seq_id, 'reg_no': self.ent_number, 'name': self.ent_number, 'containContextPath': 'ecipplatform', 'corp_name': self.ent_number } resp = self.crawl_page_by_url_post(url, data=post_data) if not resp: logging.error('crawl %s page failed, error code.\n' % (name)) return False return resp def parse_post_check_page(self, page): """解析提交验证码之后的页面,提取所需要的信息,比如corp id等 Args: page: 提交验证码之后的页面 """ m = re.search( r'onclick=\\\"\w+\(\'([\w\./]+)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\'\)', page) if m: self.corp_org = m.group(2) self.corp_id = m.group(3) self.corp_seq_id = m.group(4) self.common_enter_post_data = { 'showRecordLine': '1', 'specificQuery': 'commonQuery', 'propertiesName': '', 'corp_org': self.corp_org, 'corp_id': self.corp_id, 'pageNo': '1', 'pageSize': '5' } self.ci_enter_post_data = { 'org': self.corp_org, 'id': self.corp_id, 'seq_id': self.corp_seq_id, 'specificQuery': '' } self.ci_enter_with_record_line_post_data = { 'CORP_ORG': self.corp_org, 'CORP_ID': self.corp_id, 'CORP_SEQ_ID': self.corp_seq_id, 'specificQuery': '', 'pageNo': '1', 'pageSize': '5', 'showRecordLine': '1' } self.ci_detail_post_data = { 'ORG': self.corp_org, 'ID': '', 'CORP_ORG': self.corp_org, 'CORP_ID': self.corp_id, 'SEQ_ID': '', 'REG_NO': self.ent_number, 'specificQuery': '' } self.nb_enter_post_data = { 'ID': '', 'REG_NO': self.ent_number, 'showRecordLine': '0', 'specificQuery': 'gs_pb', 'propertiesName': '', 'pageNo': '1', 'pageSize': '5', 'ADMIT_MAIN': '08' } return True return False def crack_checkcode(self): """破解验证码 :return 破解后的验证码 """ resp = self.crawl_page_by_url(self.urls['get_checkcode']) if not resp: logging.error('Failed, exception occured when getting checkcode') return ('', '') time.sleep(random.uniform(2, 4)) self.write_file_mutex.acquire() ckcode = ('', '') with open(self.ckcode_image_path, 'wb') as f: f.write(resp) try: ckcode = self.code_cracker.predict_result(self.ckcode_image_path) except Exception as e: logging.error('exception occured when crack checkcode') ckcode = ('', '') 
finally: pass self.write_file_mutex.release() return ckcode def crawl_page_by_url(self, url): """根据url直接爬取页面 """ try: resp = self.reqst.get(url, proxies=self.proxies) if resp.status_code != 200: logging.error('crawl page by url failed! url = %s' % url) page = resp.content time.sleep(random.uniform(0.2, 1)) # if saveingtml: # CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page) return page except Exception as e: logging.error("crawl page by url exception %s" % (type(e))) return None def crawl_page_by_url_post(self, url, data): """ 根据url和post数据爬取页面 """ r = self.reqst.post(url, data, proxies=self.proxies) time.sleep(random.uniform(0.2, 1)) if r.status_code != 200: logging.error( u"Getting page by url with post:%s\n, return status %s\n" % (url, r.status_code)) return False return r.content def get_annual_report_detail(self, report_year, report_id): """获取企业年报的详细信息 """ annual_report_detail = {} post_data = self.nb_enter_post_data post_data['ID'] = report_id post_data['showRecordLine'] = '0' post_data['OPERATE_TYPE'] = '2' post_data['propertiesName'] = 'query_basicInfo' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'企业基本信息'] = self.parser.parse_page( 'annual_report_ent_basic_info', page_data) annual_report_detail[u'企业资产状况信息'] = self.parser.parse_page( 'annual_report_ent_property_info', page_data) post_data['showRecordLine'] = '1' post_data['propertiesName'] = 'query_websiteInfo' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'网站或网店信息'] = self.parser.parse_page( 'annual_report_web_info', page_data) post_data['propertiesName'] = 'query_investInfo' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'对外投资信息'] = self.parser.parse_page( 'annual_report_investment_abord_info', page_data) post_data['MAIN_ID'] = report_id post_data['OPERATE_TYPE'] = '1' post_data['TYPE'] = 'NZGS' post_data['ADMIT_MAIN'] = '08' post_data['propertiesName'] = 'query_stockInfo' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'股东及出资信息'] = self.parser.parse_page( 'annual_report_shareholder_info', page_data) post_data['propertiesName'] = 'query_InformationSecurity' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'对外提供保证担保信息'] = self.parser.parse_page( 'annual_report_external_guarantee_info', page_data) post_data['propertiesName'] = 'query_RevisionRecord' page_data = self.get_page_data('annual_report_detail', post_data) annual_report_detail[u'修改记录'] = self.parser.parse_page( 'annual_report_modify_record', page_data) return annual_report_detail
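# --- Illustrative sketch (not part of the crawler) ---------------------------
# get_pages above relies on the server's paging contract: the first POST comes
# back with at most 5 records plus a 'total' field, and re-POSTing with
# pageSize=total fetches everything in one response. The sketch below restates
# that two-phase fetch with a stub standing in for the real HTTP POST; every
# name here is hypothetical.
import json as _json

def _fetch_all(post, url, post_data):
    resp = post(url, post_data)
    if not resp:
        return None
    obj = _json.loads(resp)
    if isinstance(obj, dict) and obj.get('total') and int(obj['total']) > 5:
        post_data['pageSize'] = obj['total']  # ask for every record at once
        resp = post(url, post_data)
    return resp

if __name__ == '__main__':
    def _stub_post(url, data):  # stands in for crawl_page_by_url_post
        size = int(data.get('pageSize', 5))
        return _json.dumps({'total': 12, 'rows': list(range(min(size, 12)))})
    print(_fetch_all(_stub_post, 'http://example.invalid', {'pageNo': '1'}))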
class GuangdongClawer(object):
    # Lock that protects writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.html_search = None
        self.html_showInfo = None
        self.Captcha = None
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.main_host = ""
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.dir_restore_path = settings.json_restore_path + '/guangdong/'
        #self.json_restore_path = settings.json_restore_path + '/guangdong.json'
        # Path where the captcha image is stored
        self.path_captcha = settings.json_restore_path + '/guangdong/ckcode.jpg'

    # Fetch the search page
    def crawl_page_search(self, url):
        r = self.requests.get(url)
        if r.status_code != 200:
            logging.error(u"Something wrong when getting the url:%s , status_code=%d", url, r.status_code)
            return
        r.encoding = "utf-8"
        #logging.error("searchpage html :\n %s", r.text)
        self.html_search = r.text

    # Fetch the page that lists the search results
    def get_page_showInfo(self, url, datas):
        r = self.requests.post(url, data=datas)
        if r.status_code != 200:
            return False
        r.encoding = "utf-8"
        #logging.error("showInfo page html :\n %s", r.text)
        self.html_showInfo = r.text

    # Parse the result page and collect the list of matching enterprises
    def analyze_showInfo(self):
        if self.html_showInfo is None:
            logging.error(u"Getting Page ShowInfo failed\n")
            return
        # call Object Analyze's method
        Ent = []
        soup = BeautifulSoup(self.html_showInfo, "html5lib")
        divs = soup.find_all("div", {"class": "list"})
        if divs:
            for div in divs:
                logging.error(u"div.ul.li.a['href'] = %s\n", div.ul.li.a['href'])
                Ent.append(div.ul.li.a['href'])
        self.ents = Ent

    # Crack the captcha page
    def crawl_page_captcha(self, url_Captcha, url_CheckCode, url_showInfo, textfield='440301102739085'):
        count = 0
        while True:
            count += 1
            r = self.requests.get(url_Captcha)
            if r.status_code != 200:
                logging.error(u"Something wrong when getting the Captcha url:%s , status_code=%d", url_Captcha, r.status_code)
                return
            self.Captcha = r.content
            if self.save_captcha():
                result = self.crack_captcha()
                #print result
                datas = {
                    'textfield': textfield,
                    'code': result,
                }
                response = self.get_check_response(url_CheckCode, datas)
                # json returned in response: {u'flag': u'1', u'textfield': u'H+kiIP4DWBtMJPckUI3U3Q=='}
                if response['flag'] == '1':
                    datas_showInfo = {'textfield': response['textfield'], 'code': result}
                    self.get_page_showInfo(url_showInfo, datas_showInfo)
                    break
                else:
                    logging.error(u"crack ID: %s Captcha failed, the %d time(s)" % (self.ent_num, count))
            if count > 15:
                logging.error(u"ID: %s, crack Captcha failed after the %d times of trial" % (textfield, count))
                break
            time.sleep(random.uniform(1, 4))
        return

    # Get the captcha-verification result
    def get_check_response(self, url, datas):
        r = self.requests.post(url, data=datas)
        if r.status_code != 200:
            return False
        #print r.json()
        return r.json()

    # Crack the captcha image and return the recognized text
    def crack_captcha(self):
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]
        #print result

    # Save the captcha image
    def save_captcha(self):
        url_Captcha = self.path_captcha
        if self.Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        f = open(url_Captcha, 'w')
        try:
            f.write(self.Captcha)
        except IOError:
            logging.error("%s can not be written", url_Captcha)
        finally:
            f.close()  # must be called, not merely referenced
        self.write_file_mutex.release()
        return True

    """
    The following functions are for main page
    """
    """
    1. iterate enterprises in ents
    2. for each ent: decide host so that choose functions by pattern
    3. for each pattern, iterate urls
    4.
for each url, iterate item in tabs """ def crawl_page_main(self): sub_json_dict = {} if not self.ents: logging.error(u"Get no search result\n") try: for ent in self.ents: #http://www.szcredit.com.cn/web/GSZJGSPT/ QyxyDetail.aspx?rid=acc04ef9ac0145ecb8c87dd5710c2f86 #http://gsxt.gzaic.gov.cn/search/ search!entityShow?entityVo.pripid=440100100012003051400230 #http://gsxt.gdgs.gov.cn/aiccips /GSpublicity/GSpublicityList.html?service=entInfo_+8/Z3ukM3JcWEfZvXVt+QiLPiIqemiEqqq4l7n9oAh/FI+v6zW/DL40+AV4Hja1y-dA+Hj5oOjXjQTgAhKSP1lA== #HOSTS =["www.szcredit.com.cn", "121.8.227.200:7001", "gsxt.gdgs.gov.cn/aiccips"] m = re.match('http', ent) if m is None: ent = urls['host'] + ent[3:] logging.error(u"ent url:%s\n" % ent) for i, item in enumerate(HOSTS): if ent.find(item) != -1: #"www.szcredit.com.cn" if i == 0: logging.error(u"This %s enterprise is type 0" % (self.ent_num)) guangdong = Guangdong0(self.requests, self.ent_num) sub_json_dict = guangdong.run(ent) elif i == 1: logging.error(u"This %s enterprise is type 1" % (self.ent_num)) guangdong = Guangdong1(self.requests) sub_json_dict = guangdong.run(ent) # gsxt.gdgs.gov.cn/aiccips elif i == 2: logging.error(u"This %s enterprise is type 2" % (self.ent_num)) guangdong = Guangdong2(self.requests) sub_json_dict = guangdong.run(ent) break else: logging.error(u"There are no response hosts:%s\n" % self.ent_num) except Exception as e: logging.error( u"An error ocurred when getting the main page, error: %s" % type(e)) raise e finally: return sub_json_dict def crawl_page_by_url(self, url): r = self.requests.get(url) if r.status_code != 200: logging.error(u"Getting page by url:%s\n, return status %s\n" % (url, r.status_code)) return False # 为了防止页面间接跳转,获取最终目标url return {'page': r.text, 'url': r.url} def crawl_page_by_url_post(self, url, data, header={}): if header: r = self.requests.post(url, data, headers=header) else: r = self.requests.post(url, data) if r.status_code != 200: logging.error( u"Getting page by url with post:%s\n, return status %s\n" % (url, r.status_code)) return False return {'page': r.text, 'url': r.url} # main function def run(self, ent_num): if not os.path.exists(self.dir_restore_path): os.makedirs(self.dir_restore_path) json_dict = {} self.ent_num = str(ent_num) logging.error('crawl ID: %s\n' % ent_num) self.crawl_page_search(urls['page_search']) self.crawl_page_captcha(urls['page_Captcha'], urls['checkcode'], urls['page_showinfo'], ent_num) self.analyze_showInfo() data = self.crawl_page_main() json_dict[ent_num] = data return json.dumps(json_dict)
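# --- Illustrative sketch (not part of the crawler) ---------------------------
# crawl_page_main above dispatches on the host embedded in each result link:
# the three Guangdong page layouts (szcredit, the 121.8.227.200 site, and
# gsxt.gdgs.gov.cn/aiccips) each get their own handler class. A minimal
# restatement of that substring dispatch, with plain strings standing in for
# the Guangdong0/1/2 handler objects:
_HOSTS = ["www.szcredit.com.cn", "121.8.227.200:7001", "gsxt.gdgs.gov.cn/aiccips"]

def _pick_handler(ent_url, handlers):
    for host, handler in zip(_HOSTS, handlers):
        if host in ent_url:
            return handler
    return None  # no known host matched

if __name__ == '__main__':
    print(_pick_handler('http://gsxt.gdgs.gov.cn/aiccips/GSpublicity/x.html',
                        ['Guangdong0', 'Guangdong1', 'Guangdong2']))  # -> Guangdong2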
class HebeiCrawler(object):
    # Lock that protects writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path):
        self.CR = CaptchaRecognition("hebei")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.json_restore_path = json_restore_path
        self.csrf = ""
        # Path where the captcha image is stored
        self.path_captcha = settings.json_restore_path + '/hebei/ckcode.jpeg'
        # Path where the html data is stored
        self.html_restore_path = settings.json_restore_path + '/hebei/'

    # Fetch the search page
    def crawl_page_search(self, url):
        r = self.requests.get(url)
        if r.status_code != 200:
            logging.error(u"Something wrong when getting the url:%s , status_code=%d", url, r.status_code)
            return
        r.encoding = "utf-8"
        #logging.debug("searchpage html :\n %s", r.text)
        return r.text

    # Parse the result page and collect the list of matching enterprises
    def analyze_showInfo(self, page):
        Ent = []
        soup = BeautifulSoup(page, "html5lib")
        divs = soup.find_all("div", {"class": "list-item"})
        for div in divs:
            Ent.append(div.find('a')['href'])
        self.ents = Ent

    def crawl_page_captcha(self, url_search, url_Captcha, url_CheckCode, url_showInfo, textfield='130000000021709'):
        """Crack the captcha page"""
        html_search = self.crawl_page_search(url_search)
        if not html_search:
            logging.error(u"There is no search page")
            return
        soup = BeautifulSoup(html_search, 'html5lib')
        form = soup.find('form', {'id': 'formInfo'})
        datas = {
            #'searchType': 1,
            'captcha': None,
            'session.token': form.find('input', {'name': 'session.token'})['value'],
            #'condition.keyword': textfield,
        }
        count = 0
        while True:
            count += 1
            r = self.requests.get(url_Captcha + str(random.random()))
            if r.status_code != 200:
                logging.error(u"Something wrong when getting the Captcha url:%s , status_code=%d",
                              url_Captcha + str(random.random()), r.status_code)
                return
            #logging.debug("Captcha page html :\n %s", self.Captcha)
            if self.save_captcha(r.content):
                logging.info("Captcha is saved successfully \n")
                datas['captcha'] = self.crack_captcha()
                logging.info("cracked captcha is %s" % (datas['captcha']))
                res = self.crawl_page_by_url_post(url_CheckCode, datas)['page']
                # A correct captcha yields one kind of page; otherwise the main page comes back
                if str(res) != '0':
                    datas['searchType'] = 1
                    datas['condition.keyword'] = textfield
                    page = self.crawl_page_by_url_post(url_showInfo, datas)['page']
                    self.analyze_showInfo(page)
                    break
                else:
                    logging.debug(u"crack Captcha failed, the %d time(s)", count)
            if count > 15:
                break
        return

    def crack_captcha(self):
        """Crack the captcha image and return the recognized text"""
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]
        #print result

    def save_captcha(self, Captcha):
        """Save the captcha image"""
        url_Captcha = self.path_captcha
        if Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        f = open(url_Captcha, 'w')
        try:
            f.write(Captcha)
        except IOError:
            logging.debug("%s can not be written", url_Captcha)
        finally:
            f.close()  # must be called, not merely referenced
        self.write_file_mutex.release()
        return True

    """
    The following enterprises in ents
    1. for each ent: decide host so that choose the urls
    2.
for eah url, iterate item in tabs """ def crawl_page_main(self ): """ 爬取页面信息总函数 """ sub_json_dict= {} if not self.ents: logging.error(u"Get no search result\n") try: for ent in self.ents: m = re.match('http', ent) if m is None: ent = urls['host']+ ent logging.info(u"crawl main url:%s"% ent) #工商公示信息 url = ent sub_json_dict.update(self.crawl_ind_comm_pub_pages(url)) url = url[:-2]+"02" sub_json_dict.update(self.crawl_ent_pub_pages(url)) url = url[:-2] + "03" sub_json_dict.update(self.crawl_other_dept_pub_pages(url)) url = url[:-2] + "06" sub_json_dict.update(self.crawl_judical_assist_pub_pages(url)) except Exception as e: logging.error(u"An error ocurred when getting the main page, error: %s"% type(e)) raise e finally: return sub_json_dict #工商公式信息页面 def crawl_ind_comm_pub_pages(self, url): """ 爬取 工商公式 信息页面 """ sub_json_dict={} try: #page = html_from_file('next.html') logging.info( u"crawl the crawl_ind_comm_pub_pages page %s."%(url)) page = self.crawl_page_by_url(url)['page'] #html_to_file('next.html', page) dj = self.parse_page(page ) # class= result-table sub_json_dict['ind_comm_pub_reg_basic'] = dj[u'基本信息'] if dj.has_key(u'基本信息') else [] # 登记信息-基本信息 sub_json_dict['ind_comm_pub_reg_shareholder'] =dj[u'股东信息'] if dj.has_key(u'股东信息') else [] # 股东信息 sub_json_dict['ind_comm_pub_reg_modify'] = dj[u'变更信息'] if dj.has_key(u'变更信息') else [] # 变更信息 sub_json_dict['ind_comm_pub_arch_key_persons'] = dj[u'主要人员信息'] if dj.has_key(u'主要人员信息') else [] # 备案信息-主要人员信息 sub_json_dict['ind_comm_pub_arch_branch'] = dj[u'分支机构信息'] if dj.has_key(u'分支机构信息') else [] # 备案信息-分支机构信息 sub_json_dict['ind_comm_pub_arch_liquidation'] = dj[u'清算信息'] if dj.has_key(u'清算信息') else [] # 备案信息-清算信息 sub_json_dict['ind_comm_pub_movable_property_reg'] = dj[u'动产抵押登记信息'] if dj.has_key(u'动产抵押登记信息') else [] sub_json_dict['ind_comm_pub_equity_ownership_reg'] = dj[u'股权出质登记信息'] if dj.has_key(u'股权出质登记信息') else [] sub_json_dict['ind_comm_pub_administration_sanction'] = dj[u'行政处罚信息'] if dj.has_key(u'行政处罚信息') else [] sub_json_dict['ind_comm_pub_business_exception'] = dj[u'经营异常信息'] if dj.has_key(u'经营异常信息') else [] sub_json_dict['ind_comm_pub_serious_violate_law'] = dj[u'严重违法信息'] if dj.has_key(u'严重违法信息') else [] sub_json_dict['ind_comm_pub_spot_check'] = dj[u'抽查检查信息'] if dj.has_key(u'抽查检查信息') else [] except Exception as e: logging.debug(u"An error ocurred in crawl_ind_comm_pub_pages: %s"% type(e)) raise e finally: return sub_json_dict #爬取 企业公示信息 页面 def crawl_ent_pub_pages(self, url): """ 爬取 企业公示信息 信息页面 """ sub_json_dict = {} try: logging.info( u"crawl the crawl_ent_pub_pages page %s"%(url)) page = self.crawl_page_by_url(url)['page'] #html_to_file('next.html', page) #page = html_from_file('next.html') p = self.parse_page(page) sub_json_dict['ent_pub_ent_annual_report'] = p[u'企业年报'] if p.has_key(u'企业年报') else [] sub_json_dict['ent_pub_administration_license'] = p[u'行政许可信息'] if p.has_key(u'行政许可信息') else [] sub_json_dict['ent_pub_administration_sanction'] = p[u'行政处罚信息'] if p.has_key(u'行政处罚信息') else [] sub_json_dict['ent_pub_shareholder_capital_contribution'] = p[u'股东及出资信息(币种与注册资本一致)'] if p.has_key(u'股东及出资信息(币种与注册资本一致)') else [] sub_json_dict['ent_pub_reg_modify'] = p[u'变更信息'] if p.has_key(u'变更信息') else [] sub_json_dict['ent_pub_equity_change'] = p[u'股权变更信息'] if p.has_key(u'股权变更信息') else [] sub_json_dict['ent_pub_knowledge_property'] = p[u'知识产权出质登记信息'] if p.has_key(u'知识产权出质登记信息') else [] except Exception as e: logging.debug(u"An error ocurred in crawl_ent_pub_pages: %s"% type(e)) raise e finally: return sub_json_dict #爬取 其他部门公示 页面 def 
crawl_other_dept_pub_pages(self, url):
        """Crawl the pages published by other departments"""
        sub_json_dict = {}
        try:
            logging.info(u"crawl the crawl_other_dept_pub_pages page %s." % (url))
            page = self.crawl_page_by_url(url)['page']
            #html_to_file('next.html', page)
            #page = html_from_file('next.html')
            xk = self.parse_page(page)  # administrative licensing information
            sub_json_dict["other_dept_pub_administration_license"] = xk[u'行政许可信息'] if xk.has_key(u'行政许可信息') else []
            sub_json_dict["other_dept_pub_administration_sanction"] = xk[u'行政处罚信息'] if xk.has_key(u'行政处罚信息') else []  # administrative penalties
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_other_dept_pub_pages: %s" % (type(e)))
            raise e
        finally:
            return sub_json_dict

    def crawl_judical_assist_pub_pages(self, url):
        """Crawl the judicial-assistance information pages"""
        sub_json_dict = {}
        try:
            logging.info(u"crawl the crawl_judical_assist_pub_pages page %s." % (url))
            page = self.crawl_page_by_url(url)['page']
            #page = html_from_file('next.html')
            #html_to_file('next.html', page)
            xz = self.parse_page(page)
            sub_json_dict['judical_assist_pub_equity_freeze'] = xz[u'司法股权冻结信息'] if xz.has_key(u'司法股权冻结信息') else []
            sub_json_dict['judical_assist_pub_shareholder_modify'] = xz[u'司法股东变更登记信息'] if xz.has_key(u'司法股东变更登记信息') else []
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_judical_assist_pub_pages: %s" % (type(e)))
            raise e
        finally:
            return sub_json_dict

    # Mapping of contribution-type codes to their labels
    def dicInvtType(self, types):
        if types == "1":
            return "货币"
        if types == "2":
            return "实物"
        if types == "3":
            return "知识产权"
        if types == "4":
            return "债权"
        if types == "5":
            return "高新技术成果"
        if types == "6":
            return "土地使用权"
        if types == "7":
            return "股权"
        if types == "8":
            return "劳务"
        if types == "9":
            return "其他"

    def get_raw_text_by_tag(self, tag):
        return tag.get_text().strip()

    def get_table_title(self, table_tag):
        if table_tag.find('tr'):
            if table_tag.find('tr').find_all('th'):
                if len(table_tag.find('tr').find_all('th')) > 1:
                    return None
                # handle <th> aa<span> bb</span> </th>
                if table_tag.find('tr').th.string is None and len(table_tag.find('tr').th.contents) > 1:
                    # handle <th> <span> bb</span> </th> that contains whitespace
                    if (table_tag.find('tr').th.contents[0]).strip():
                        return (table_tag.find('tr').th.contents[0]).strip()
                # <th><span> bb</span> </th>
                return self.get_raw_text_by_tag(table_tag.find('tr').th)
        return None

    def sub_column_count(self, th_tag):
        if th_tag.has_attr('colspan') and th_tag.get('colspan') > 1:
            return int(th_tag.get('colspan'))
        return 0

    def get_sub_columns(self, tr_tag, index, count):
        columns = []
        for i in range(index, index + count):
            th = tr_tag.find_all('th')[i]
            if not self.sub_column_count(th):
                columns.append((self.get_raw_text_by_tag(th), self.get_raw_text_by_tag(th)))
            else:
                # if it has sub-sub columns
                columns.append((self.get_raw_text_by_tag(th),
                                self.get_sub_columns(tr_tag.nextSibling.nextSibling, 0, self.sub_column_count(th))))
        return columns

    # Get column data recursively; recursion is needed because tables may nest inside tables
    def get_column_data(self, columns, td_tag):
        if type(columns) == list:
            data = {}
            multi_col_tag = td_tag
            if td_tag.find('table'):
                multi_col_tag = td_tag.find('table').find('tr')
            if not multi_col_tag:
                logging.error('invalid multi_col_tag, multi_col_tag = %s', multi_col_tag)
                return data
            if len(columns) != len(multi_col_tag.find_all('td', recursive=False)):
                logging.error('column head size != column data size, columns head = %s, columns data = %s'
                              % (columns, multi_col_tag.contents))
                return data
            for id, col in enumerate(columns):
                data[col[0]] = self.get_column_data(col[1], multi_col_tag.find_all('td', recursive=False)[id])
            return data
        else:
            return self.get_raw_text_by_tag(td_tag)

    def
get_detail_link(self, bs4_tag): if bs4_tag.has_attr('href') and (bs4_tag['href'] != '#' and bs4_tag['href'] != 'javascript:void(0);'): pattern = re.compile(r'http') if pattern.search(bs4_tag['href']): return bs4_tag['href'] return urls['webroot'] + bs4_tag['href'] elif bs4_tag.has_attr('onclick'): #print 'onclick' logging.error(u"onclick attr was found in detail link") return None def get_columns_of_record_table(self, bs_table, page, table_name): tbody = None if len(bs_table.find_all('tbody')) > 1: tbody= bs_table.find_all('tbody')[0] else: tbody = bs_table.find('tbody') or BeautifulSoup(page, 'html5lib').find('tbody') tr = None if tbody: if len(tbody.find_all('tr')) <= 1: tr = tbody.find('tr') else: tr = tbody.find_all('tr')[1] if not tr.find('th'): tr = tbody.find_all('tr')[0] elif tr.find('td'): tr = None else: if len(bs_table.find_all('tr')) <= 1: return None elif bs_table.find_all('tr')[0].find('th') and not bs_table.find_all('tr')[0].find('td') and len(bs_table.find_all('tr')[0].find_all('th')) > 1: tr = bs_table.find_all('tr')[0] elif bs_table.find_all('tr')[1].find('th') and not bs_table.find_all('tr')[1].find('td') and len(bs_table.find_all('tr')[1].find_all('th')) > 1: tr = bs_table.find_all('tr')[1] ret_val= self.get_record_table_columns_by_tr(tr, table_name) #logging.debug(u"ret_val->%s\n", ret_val) return ret_val def get_record_table_columns_by_tr(self, tr_tag, table_name): columns = [] if not tr_tag: return columns try: sub_col_index = 0 if len(tr_tag.find_all('th'))==0 : logging.error(u"The table %s has no columns"% table_name) return columns count = 0 if len(tr_tag.find_all('th'))>0 : for th in tr_tag.find_all('th'): #logging.debug(u"th in get_record_table_columns_by_tr =\n %s", th) col_name = self.get_raw_text_by_tag(th) if col_name : if ((col_name, col_name) in columns) : col_name= col_name+'_' count+=1 if not self.sub_column_count(th): columns.append((col_name, col_name)) else: #has sub_columns columns.append((col_name, self.get_sub_columns(tr_tag.nextSibling.nextSibling, sub_col_index, self.sub_column_count(th)))) sub_col_index += self.sub_column_count(th) if count == len(tr_tag.find_all('th'))/2: columns= columns[: len(columns)/2] except Exception as e: logging.error(u'exception occured in get_table_columns, except_type = %s, table_name = %s' % (type(e), table_name)) finally: return columns # 分析企业年报详细页面 def parse_ent_pub_annual_report_page(self, page): sub_dict = {} try: soup = BeautifulSoup(page, 'html5lib') # 基本信息表包含两个表头, 需要单独处理 basic_table = soup.find('table') trs = basic_table.find_all('tr') title = self.get_raw_text_by_tag(trs[1].th) table_dict = {} for tr in trs[2:]: if tr.find('th') and tr.find('td'): ths = tr.find_all('th') tds = tr.find_all('td') if len(ths) != len(tds): logging.error(u'th size not equals td size in table %s, what\'s up??' 
% table_name)
                        return
                    else:
                        for i in range(len(ths)):
                            if self.get_raw_text_by_tag(ths[i]):
                                table_dict[self.get_raw_text_by_tag(ths[i])] = self.get_raw_text_by_tag(tds[i])
            sub_dict[title] = table_dict
            content_table = soup.find_all('table')[1:]
            for table in content_table:
                table_name = self.get_table_title(table)
                if table_name:
                    sub_dict[table_name] = self.parse_table(table, table_name, page)
        except Exception as e:
            logging.error(u'annual page: fail to get table data with exception %s' % e)
            raise e
        finally:
            return sub_dict

    # Shareholders and contribution information (currency consistent with registered capital)
    def parse_table_qygs_gudongchuzi(self, page):
        coms = re.findall(r'var investor.*?list.push\(investor\);', page, flags=re.DOTALL + re.MULTILINE)
        sub_item = {}
        item = {}
        Item = []
        for comstr in coms:
            m_invstr = re.compile(r'investor.inv.*?;').search(comstr)
            if m_invstr:
                invstr = m_invstr.group()
                inv = re.compile(r'\".*?\"').search(invstr).group().strip('\"')
                # subscribed contributions
                rjSubConAmlist = []
                count_rj = 0
                for itemstr in re.findall(r'invt.subConAm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    subConAm = eval(re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    count_rj += subConAm
                    rjSubConAmlist.append(subConAm)
                rjconDateList = []
                for itemstr in re.findall(r'invt.conDate.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    conDate = (re.compile(r"\'.*?\'").search(itemstr).group().strip("\'"))
                    rjconDateList.append(conDate)
                rjconFormList = []
                for itemstr in re.findall(r'invt.conForm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    conForm = (re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    rjconFormList.append(conForm)
                # paid-in contributions
                sjAcConAm = []
                count_sj = 0
                for itemstr in re.findall(r'invtActl.acConAm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    acConAm = eval(re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    count_sj += acConAm
                    sjAcConAm.append(acConAm)
                sjconDateList = []
                for itemstr in re.findall(r'invtActl.conDate.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    conDate = (re.compile(r"\'.*?\'").search(itemstr).group().strip("\'"))
                    sjconDateList.append(conDate)
                sjconFormList = []
                for itemstr in re.findall(r'invtActl.conForm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    conForm = (re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    sjconFormList.append(conForm)
                len_rj = len(rjSubConAmlist)
                len_sj = len(sjAcConAm)
                item = {}
                item_list = []
                item[u'股东'] = inv
                item[u'认缴额(万元)'] = count_rj
                item[u'实缴额(万元)'] = count_sj
                try:
                    maxRow = max(len_sj, len_rj)
                    for i in xrange(maxRow):
                        sub_item = {}
                        if i < len_rj:
                            sub_item[u'认缴出资方式'] = self.dicInvtType(rjconFormList[i])
                            sub_item[u'认缴出资额(万元)'] = rjSubConAmlist[i]
                            sub_item[u'认缴出资日期'] = rjconDateList[i]
                        else:
                            sub_item[u'认缴出资方式'] = ""
                            sub_item[u'认缴出资额(万元)'] = ""
                            sub_item[u'认缴出资日期'] = ""
                        #item[u'认缴明细'] = sub_item
                        if i < len_sj:
                            sub_item[u'实缴出资方式'] = self.dicInvtType(sjconFormList[i])
                            sub_item[u'实缴出资额(万元)'] = sjAcConAm[i]
                            sub_item[u'实缴出资日期'] = sjconDateList[i]
                        else:
                            sub_item[u'实缴出资方式'] = ""
                            sub_item[u'实缴出资额(万元)'] = ""
                            sub_item[u'实缴出资日期'] = ""
                        #item[u'实缴明细'] = sub_item
                        item_list.append(sub_item)
                except Exception as e:
                    logging.error(u"exception : %s" % (type(e)))
                item[u'详情'] = item_list
                Item.append(item)
            else:
                logging.error(u"There is no company, continue!")
        return Item

    def parse_page(self, page, div_id='cont-r-b'):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}
        try:
            div = soup.find('div', attrs={'id': div_id})
            if div:
                tables = div.find_all('table')
            else:
                tables = soup.find_all('table')
            #print table
            for table in tables:
                table_name = self.get_table_title(table)
                if table_name:
                    if table_name == u"股东及出资信息(币种与注册资本一致)":
page_data[table_name ] =self.parse_table_qygs_gudongchuzi(page) else: page_data[table_name] = self.parse_table(table, table_name, page) except Exception as e: logging.error(u'parse page failed, with exception %s' % e) raise e finally: return page_data def parse_table(self, bs_table, table_name, page): table_dict = None try: # tb_title = self.get_table_title(bs_table) #this is a f*****g dog case, we can't find tbody-tag in table-tag, but we can see tbody-tag in table-tag #in case of that, we use the whole html page to locate the tbody print table_name columns = self.get_columns_of_record_table(bs_table, page, table_name) #print columns tbody = None if len(bs_table.find_all('tbody'))>1: tbody = bs_table.find_all('tbody')[1] else: tbody = bs_table.find('tbody') or BeautifulSoup(page, 'html5lib').find('tbody') if columns: col_span = 0 single_col = 0 for col in columns: if type(col[1]) == list: col_span += len(col[1]) else: single_col+=1 col_span += 1 column_size = len(columns) item_array = [] if not tbody: records_tag = bs_table else: records_tag = tbody item = None for tr in records_tag.find_all('tr'): if tr.find_all('td') and len(tr.find_all('td', recursive=False)) % column_size == 0: col_count = 0 item = {} for td in tr.find_all('td',recursive=False): if td.find('a'): #try to retrieve detail link from page next_url = self.get_detail_link(td.find('a')) logging.info(u'crawl detail url: %s'% next_url) if next_url: detail_page = self.crawl_page_by_url(next_url) #html_to_file("test.html", detail_page['page']) #print "table_name : "+ table_name if table_name == u'企业年报': #logging.debug(u"next_url = %s, table_name= %s\n", detail_page['url'], table_name) page_data = self.parse_ent_pub_annual_report_page(detail_page['page']) item[columns[col_count][0]] = page_data #this may be a detail page data else: page_data = self.parse_page(detail_page['page']) item[columns[col_count][0]] = page_data #this may be a detail page data else: #item[columns[col_count]] = CrawlerUtils.get_raw_text_in_bstag(td) item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td) else: item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td) col_count += 1 if col_count == column_size: item_array.append(item.copy()) col_count = 0 #this case is for the ind-comm-pub-reg-shareholders----details'table #a f*****g dog case!!!!!! elif tr.find_all('td') and len(tr.find_all('td', recursive=False)) == col_span and col_span != column_size: col_count = 0 sub_col_index = 0 item = {} sub_item = {} for td in tr.find_all('td',recursive=False): if type(columns[col_count][1]) == list: sub_key = columns[col_count][1][sub_col_index][1] sub_item[sub_key] = self.get_raw_text_by_tag(td) sub_col_index += 1 if sub_col_index == len(columns[col_count][1]): item[columns[col_count][0]] = sub_item.copy() sub_item = {} col_count += 1 sub_col_index = 0 else: item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td) col_count += 1 if col_count == column_size: item_array.append(item.copy()) col_count = 0 table_dict = item_array else: table_dict = {} for tr in bs_table.find_all('tr'): if tr.find('th') and tr.find('td'): ths = tr.find_all('th') tds = tr.find_all('td') if len(ths) != len(tds): logging.error(u'th size not equals td size in table %s, what\'s up??' 
% table_name)
                        return
                    else:
                        for i in range(len(ths)):
                            if self.get_raw_text_by_tag(ths[i]):
                                table_dict[self.get_raw_text_by_tag(ths[i])] = self.get_raw_text_by_tag(tds[i])
        except Exception as e:
            logging.error(u'parse table %s failed with exception %s' % (table_name, type(e)))
            raise e
        finally:
            return table_dict

    def crawl_page_by_url(self, url):
        # initialize so the finally-clause cannot hit unbound names when the request itself fails
        text = None
        urls = url
        try:
            r = self.requests.get(url)
            if r.status_code != 200:
                logging.error(u"Getting page by url:%s, return status %s\n" % (url, r.status_code))
            text = r.text
            urls = r.url  # keep the final target url in case the page redirects
        except Exception as e:
            logging.error(u"Can't get page by url:%s, exception is %s" % (url, type(e)))
        finally:
            return {'page': text, 'url': urls}

    def crawl_page_by_url_post(self, url, data, headers={}):
        text = None
        urls = url
        try:
            if headers:
                self.requests.headers.update(headers)
                r = self.requests.post(url, data)
            else:
                r = self.requests.post(url, data)
            if r.status_code != 200:
                logging.error(u"Getting page by url with post:%s, return status %s\n" % (url, r.status_code))
            text = r.text
            urls = r.url
        except Exception as e:
            logging.error(u"Can't post page by url:%s, exception is %s" % (url, type(e)))
        finally:
            return {'page': text, 'url': urls}

    def run(self, ent_num):
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        json_dict = {}
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'],
                                urls['page_showinfo'], ent_num)
        data = self.crawl_page_main()
        json_dict[ent_num] = data
        #json_dump_to_file(self.json_restore_path , json_dict) #2016-2-16
        return json.dumps(json_dict)

    def work(self, ent_num=""):
        # if not os.path.exists(self.html_restore_path):
        #     os.makedirs(self.html_restore_path)
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'],
                                urls['page_showinfo'], ent_num)
        data = self.crawl_page_main()
        json_dump_to_file('hebei_json.json', data)
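# --- Illustrative sketch (not part of the crawler) ---------------------------
# crawl_page_main above reaches the four public-information sections by
# rewriting the trailing two-digit tab code of the entry url ("02", "03",
# "06"). The same suffix trick, restated standalone on a made-up url:
def _tab_urls(entry_url, tabs=('01', '02', '03', '06')):
    base = entry_url[:-2]  # strip the current two-digit tab code
    return [base + tab for tab in tabs]

if __name__ == '__main__':
    for _u in _tab_urls('http://example.invalid/notice/view?uuid=abc&tab=01'):
        print(_u)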
class ZongjuCrawler(Crawler): """总局工商爬虫 """ code_cracker = CaptchaRecognition('zongju') # 多线程爬取时往最后的json文件中写时的加锁保护 write_file_mutex = threading.Lock() urls = {'host': 'http://qyxy.saic.gov.cn', 'official_site': 'http://gsxt.saic.gov.cn/zjgs/', 'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=', 'post_checkcode': 'http://gsxt.saic.gov.cn/zjgs/security/verify_captcha', 'get_info_entry': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list', # 获得企业入口 'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?', # 获得企业信息页面的url,通过指定不同的tab=1-4来选择不同的内容(工商公示,企业公示...) } def __init__(self, json_restore_path=None): super(ZongjuCrawler, self).__init__() self.json_restore_path = json_restore_path # html数据的存储路径 self.html_restore_path = self.json_restore_path + '/zongju/' # 验证码图片的存储路径 self.ckcode_image_path = self.json_restore_path + '/zongju/ckcode.jpg' self.parser = ZongjuParser(self) self.proxies = get_proxy('beijing') self.timeout = (30, 20) def run(self, _ent): """爬取的主函数 """ # self.proxies = {'http':'http://123.121.30.123:8118'} if self.proxies: print self.proxies self.reqst.proxies = self.proxies if not os.path.exists(self.html_restore_path): os.makedirs(self.html_restore_path) return Crawler.run(self, _ent) def crawl_check_page(self): """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交 """ count = 0 next_url = self.urls['official_site'] resp = self.reqst.get(next_url, timeout=self.timeout, verify=False) if resp.status_code != 200: logging.error('failed to get official site') return False if not self.parse_pre_check_page(resp.content): logging.error('failed to parse pre check page') return False while count < 30: count += 1 ckcode = self.crack_checkcode() if not ckcode[1]: continue post_data = {'captcha': ckcode[1], 'session.token': self.session_token} next_url = self.urls['post_checkcode'] resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout, verify=False) if resp.status_code != 200: logging.error('failed to get crackcode image by url %s, fail count = %d' % (next_url, count)) continue logging.error('crack code = %s, %s, response = %s' % (ckcode[0], ckcode[1], resp.content)) if resp.content == '0': logging.error('crack checkcode failed!count = %d' % (count)) continue next_url = self.urls['get_info_entry'] post_data = { 'searchType': '1', 'captcha': ckcode[1], 'session.token': self.session_token, 'condition.keyword': self._ent } resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout) if resp.status_code != 200: logging.error('faild to crawl url %s' % next_url) return False if self.parse_post_check_page(resp.content): return True logging.error('crack checkcode failed, total fail count = %d' % count) print('crack checkcode failed!count = %d' % (count)) time.sleep(random.uniform(1, 3)) return False @exe_time def crawl_ind_comm_pub_pages(self, *args, **kwargs): """爬取工商公示信息页面 在总局的网站中,工商公示信息在一个页面中返回。页面中包含了多个表格,调用 Parser的 parse_ind_comm_page进行解析 在 Parser的ind_comm_pub_page 中,访问并设置 crawler中的 json_dict。 """ if not len(args): return url = args[0] m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url) if m: self.uuid = m.group(1) next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=01' resp = self.reqst.get(next_url, timeout=self.timeout, verify=False) if resp.status_code != 200: logging.error('get ind comm pub info failed!') return False self.parser.parse_ind_comm_pub_pages(resp.content) @exe_time def crawl_ent_pub_pages(self, *args, **kwargs): """爬取企业公示信息页面 """ if not len(args): return url = args[0] m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url) if m: self.uuid = 
m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=02'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get ent pub info failed!')
            return False
        self.parser.parse_ent_pub_pages(resp.content)

    @exe_time
    def crawl_other_dept_pub_pages(self, *args, **kwargs):
        """Crawl the pages published by other departments
        """
        if not len(args):
            return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=03'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get other dept pub info failed!')
            return False
        self.parser.parse_other_dept_pub_pages(resp.content)

    @exe_time
    def crawl_judical_assist_pub_pages(self, *args, **kwargs):
        """Crawl the judicial-assistance information pages
        """
        if not len(args):
            return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=06'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get judical assist info failed!')
            return False
        self.parser.parse_judical_assist_pub_pages(resp.content)

    def parse_post_check_page(self, page):
        """Parse the page returned after the captcha is submitted and pull out the needed information
        """
        soup = BeautifulSoup(page, 'html5lib')
        divs = soup.find_all('div', attrs={'class': 'list-item'})
        if divs:
            Ent = {}
            count = 0
            for div in divs:
                count += 1
                link = div.find('div', attrs={'class': 'link'})
                profile = div.find('div', attrs={'class': 'profile'})
                url = ""
                ent = ""
                if link and link.find('a') and link.find('a').has_attr('href'):
                    url = link.find('a')['href']
                if profile and profile.span:
                    ent = profile.span.get_text().strip()
                name = link.find('a').get_text().strip()
                if name == self._ent:
                    Ent.clear()
                    Ent[ent] = url
                    break
                if count == 3:
                    break
                Ent[ent] = url
            self.ents = Ent
            return True
        else:
            return False
        # div_tag = soup.find('div', attrs={'class': 'link'})
        # if not div_tag:
        #     return False
        # open_info_url = div_tag.find('a').get('href')
        # m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', open_info_url)
        # if m:
        #     self.uuid = m.group(1)
        #     return True
        # else:
        #     return False

    def parse_pre_check_page(self, page):
        """Parse the page shown before the captcha is submitted
        """
        soup = BeautifulSoup(page, 'html.parser')
        input_tag = soup.find('input', attrs={'type': 'hidden', 'name': 'session.token'})
        if input_tag:
            self.session_token = input_tag.get('value')
            return True
        return False

    def crawl_page_by_url(self, url):
        """Fetch a page directly by url
        """
        resp = self.reqst.get(url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to crawl page by url %s' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        # if saveingtml:
        #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
        return page

    def crack_checkcode(self):
        """Crack the captcha"""
        checkcode_url = self.urls['get_checkcode'] + '&ra=' + str(random.random())
        ckcode = ('', '')
        resp = self.reqst.get(checkcode_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to get checkcode img')
            return ckcode
        page = resp.content
        time.sleep(random.uniform(1, 2))
        # check the cracker before taking the lock, so an early return cannot leave the mutex held
        if not self.code_cracker:
            logging.error('invalid code cracker with ckcode= None')
            return ckcode
        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(page)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.error('exception occured when crack checkcode')
            ckcode = ('', '')
            os.remove(self.ckcode_image_path)
        finally:
            pass
        self.write_file_mutex.release()
        return ckcode
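# --- Illustrative sketch (not part of the crawler) ---------------------------
# The crawl_*_pages methods above all pull the enterprise uuid out of the entry
# link with the same regex before requesting '...view?uuid=<id>&tab=NN'. A
# standalone check of that regex against a made-up link of the same shape (the
# uuid value here is invented):
import re as _re

if __name__ == '__main__':
    _link = '/zjgs/notice/view?uuid=MDAwMDAx.5f=3&tab=01'
    _m = _re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', _link)
    print(_m.group(1) if _m else None)  # -> MDAwMDAx.5f=3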
class HainanCrawler(object):
    # Lock that protects writes to the final json file during multi-threaded crawling
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path):
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.main_host = ""
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.html_restore_path = settings.json_restore_path + '/hainan/'
        # Path where the captcha image is stored
        self.path_captcha = settings.json_restore_path + '/hainan/ckcode.png'

    # Parse the result page and collect the list of matching enterprises
    def analyze_showInfo(self, page):
        if page is None:
            logging.error(u"Getting Page ShowInfo failed\n")
            return
        Ent = []
        soup = BeautifulSoup(page, "html5lib")
        divs = soup.find_all("div", {"class": "list"})
        if divs:
            for div in divs:
                if div.find('a') and div.find('a').has_attr('href'):
                    Ent.append(div.find('a')['href'])
        else:
            return False
        self.ents = Ent
        return True

    # Crack the captcha page
    def crawl_page_captcha(self, url_search, url_Captcha, url_CheckCode, url_showInfo, textfield='460000000265072'):
        r = self.requests.get(url_search)
        if r.status_code != 200:
            logging.error(u"Something wrong when getting the url:%s , status_code=%d", url_search, r.status_code)
            return
        count = 0
        while True:
            count += 1
            r = self.requests.get(url_Captcha)
            if r.status_code != 200:
                logging.error(u"Something wrong when getting the Captcha url:%s , status_code=%d", url_Captcha, r.status_code)
                continue
            if self.save_captcha(r.content):
                result = self.crack_captcha()
                print result
                datas = {
                    'textfield': textfield,
                    'code': result,
                }
                response = json.loads(self.crawl_page_by_url_post(url_CheckCode, datas)['page'])
                # json returned in response: {u'flag': u'1', u'textfield': u'H+kiIP4DWBtMJPckUI3U3Q=='}
                if response['flag'] == "1":
                    datas_showInfo = {'textfield': response['textfield'], 'code': result}
                    page_showInfo = self.crawl_page_by_url_post(url_showInfo, datas_showInfo)['page']
                    if self.analyze_showInfo(page_showInfo):
                        break
                else:
                    logging.debug(u"crack Captcha failed, the %d time(s)", count)
            if count > 40:
                break
        return

    # Crack the captcha image and return the recognized text
    def crack_captcha(self):
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]

    # Save the captcha image
    def save_captcha(self, Captcha):
        url_Captcha = self.path_captcha
        if Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        f = open(url_Captcha, 'w')
        try:
            f.write(Captcha)
        except IOError:
            logging.debug("%s can not be written", url_Captcha)
        finally:
            f.close()  # must be called, not merely referenced
        self.write_file_mutex.release()
        return True

    def parse_page_data_2(self, page):
        data = {
            "aiccipsUrl": "",
            "entNo": "",
            "entType": "",
            "regOrg": "",
        }
        try:
            soup = BeautifulSoup(page, "html5lib")
            data['aiccipsUrl'] = soup.find("input", {"id": "aiccipsUrl"})['value']
            data['entNo'] = soup.find("input", {"id": "entNo"})['value']
            data['entType'] = soup.find("input", {"id": "entType"})['value'].strip()  #+"++"
            data['regOrg'] = soup.find("input", {"id": "regOrg"})['value']
        except Exception as e:
            logging.error(u"parse page failed in function parse_page_data_2\n")
            raise e
        finally:
            return data

    def crawl_page_main(self):
        sub_json_dict = {}
        if not self.ents:
            logging.error(u"Get no search result\n")
        try:
            for ent in self.ents:
                m = re.match('http', ent)
                if m is None:
                    ent = urls['host'] + ent[3:]
                logging.debug(u"ent url:%s\n" % ent)
                url = ent
                page_entInfo = self.crawl_page_by_url(url)['page']
                post_data = self.parse_page_data_2(page_entInfo)
                sub_json_dict.update(self.crawl_ind_comm_pub_pages(url, post_data))
                url =
"http://aic.hainan.gov.cn:1888/aiccips/BusinessAnnals/BusinessAnnalsList.html" sub_json_dict.update(self.crawl_ent_pub_pages(url, post_data)) url = "http://aic.hainan.gov.cn:1888/aiccips/OtherPublicity/environmentalProtection.html" sub_json_dict.update( self.crawl_other_dept_pub_pages(url, post_data)) url = "http://aic.hainan.gov.cn:1888/aiccips/judiciaryAssist/judiciaryAssistInit.html" sub_json_dict.update( self.crawl_judical_assist_pub_pages(url, post_data)) except Exception as e: logging.error( u"An error ocurred when getting the main page, error: %s" % type(e)) raise e finally: return sub_json_dict # 爬取 工商公示信息 页面 def crawl_ind_comm_pub_pages(self, url, post_data={}): sub_json_dict = {} try: tabs = ( 'entInfo', # 登记信息 'curStoPleInfo', #股权出质 'entCheckInfo', #备案信息 'pleInfo', #动产抵押登记信息 'cipPenaltyInfo', #行政处罚 'cipUnuDirInfo', #经营异常 'cipBlackInfo', #严重违法 'cipSpotCheInfo', #抽查检查 ) div_names = ( 'jibenxinxi', 'guquanchuzhi', 'beian', 'dongchandiya', 'xingzhengchufa', 'jingyingyichang', 'yanzhongweifa', 'chouchajiancha', ) for tab, div_name in zip(tabs, div_names): #url = "http://http://aic.hainan.gov.cn:1888/aiccips/GSpublicity/GSpublicityList.html?service=" + tab url = urls['prefix_GSpublicity'] + tab page = self.crawl_page_by_url_post(url, post_data)['page'] if div_name == 'jibenxinxi': dict_jiben = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_reg_modify'] = dict_jiben[ u'变更信息'] if dict_jiben.has_key(u"变更信息") else {} sub_json_dict['ind_comm_pub_reg_basic'] = dict_jiben[ u'基本信息'] if dict_jiben.has_key(u"基本信息") else [] sub_json_dict['ind_comm_pub_reg_shareholder'] = dict_jiben[ u'股东信息'] if dict_jiben.has_key(u"股东信息") else [] elif div_name == 'beian': dict_beian = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_arch_key_persons'] = dict_beian[ u'主要人员信息'] if dict_beian.has_key(u"主要人员信息") else [] sub_json_dict['ind_comm_pub_arch_branch'] = dict_beian[ u'分支机构信息'] if dict_beian.has_key(u"分支机构信息") else [] sub_json_dict[ 'ind_comm_pub_arch_liquidation'] = dict_beian[ u"清算信息"] if dict_beian.has_key(u'清算信息') else [] elif div_name == 'guquanchuzhi': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_equity_ownership_reg'] = dj[ u'股权出质登记信息'] if dj.has_key(u'股权出质登记信息') else [] elif div_name == 'dongchandiya': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_movable_property_reg'] = dj[ u'动产抵押信息'] if dj.has_key(u'动产抵押信息') else [] elif div_name == 'xingzhengchufa': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_administration_sanction'] = dj[ u'行政处罚信息'] if dj.has_key(u'行政处罚信息') else [] elif div_name == 'jingyingyichang': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_business_exception'] = dj[ u'经营异常信息'] if dj.has_key(u'经营异常信息') else [] elif div_name == 'yanzhongweifa': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_serious_violate_law'] = dj[ u'严重违法信息'] if dj.has_key(u'严重违法信息') else [] elif div_name == 'chouchajiancha': dj = self.parse_page_2(page, div_name, post_data) sub_json_dict['ind_comm_pub_spot_check'] = dj[ u'抽查检查信息'] if dj.has_key(u'抽查检查信息') else [] except Exception as e: logging.debug(u"An error ocurred in crawl_ind_comm_pub_pages: %s" % (type(e))) raise e finally: return sub_json_dict #爬取 企业公示信息 页面 def crawl_ent_pub_pages(self, url, post_data={}): sub_json_dict = {} try: page = self.crawl_page_by_url_post( urls['host'] + "/BusinessAnnals/BusinessAnnalsList.html", post_data)['page'] p = 
self.parse_page_2(page, 'qiyenianbao', post_data) sub_json_dict['ent_pub_ent_annual_report'] = p[ u'qiyenianbao'] if p.has_key(u'qiyenianbao') else [] page = self.crawl_page_by_url_post( urls['host'] + "/AppPerInformation.html", post_data)['page'] p = self.parse_page_2(page, 'appPer', post_data) sub_json_dict['ent_pub_administration_license'] = p[ u'行政许可情况'] if p.has_key(u'行政许可情况') else [] page = self.crawl_page_by_url_post( urls['host'] + "/XZPunishmentMsg.html", post_data)['page'] p = self.parse_page_2(page, 'xzpun', post_data) sub_json_dict['ent_pub_administration_sanction'] = p[ u'行政处罚情况'] if p.has_key(u'行政处罚情况') else [] page = self.crawl_page_by_url_post( urls['host'] + "/ContributionCapitalMsg.html", post_data)['page'] p = self.parse_page_2(page, 'sifapanding', post_data) sub_json_dict['ent_pub_shareholder_capital_contribution'] = p[ u'股东及出资信息'] if p.has_key(u'股东及出资信息') else [] sub_json_dict['ent_pub_reg_modify'] = p[u'变更信息'] if p.has_key( u'变更信息') else [] page = self.crawl_page_by_url_post( urls['host'] + "/GDGQTransferMsg/shareholderTransferMsg.html", post_data)['page'] p = self.parse_page_2(page, 'guquanbiangeng', post_data) sub_json_dict['ent_pub_equity_change'] = p[u'股权变更信息'] if p.has_key( u'股权变更信息') else [] page = self.crawl_page_by_url_post( urls['host'] + "/intPropertyMsg.html", post_data)['page'] p = self.parse_page_2(page, 'inproper', post_data) sub_json_dict['ent_pub_knowledge_property'] = p[ u'知识产权出质登记信息'] if p.has_key(u'知识产权出质登记信息') else [] except Exception as e: logging.debug(u"An error ocurred in crawl_ent_pub_pages: %s" % (type(e))) raise e finally: return sub_json_dict #json_dump_to_file("json_dict.json", self.json_dict) #爬取 其他部门公示信息 页面 def crawl_other_dept_pub_pages(self, url, post_data={}): sub_json_dict = {} try: page = self.crawl_page_by_url_post( urls['host'] + "/OtherPublicity/environmentalProtection.html", post_data)['page'] xk = self.parse_page_2(page, "xzxk", post_data) sub_json_dict["other_dept_pub_administration_license"] = xk[ u'行政许可信息'] if xk.has_key(u'行政许可信息') else [] page = self.crawl_page_by_url_post( urls['host'] + "/OtherPublicity/environmentalProtection.html", post_data)['page'] xk = self.parse_page_2(page, "czcf", post_data) sub_json_dict["other_dept_pub_administration_sanction"] = xk[ u'行政处罚信息'] if xk.has_key(u'行政处罚信息') else [] # 行政处罚信息 except Exception as e: logging.debug( u"An error ocurred in crawl_other_dept_pub_pages: %s" % (type(e))) raise e finally: return sub_json_dict #judical assist pub informations def crawl_judical_assist_pub_pages(self, url, post_data={}): sub_json_dict = {} try: page = self.crawl_page_by_url_post( urls['host'] + "/judiciaryAssist/judiciaryAssistInit.html", post_data)['page'] xz = self.parse_page_2(page, 'guquandongjie', post_data) sub_json_dict['judical_assist_pub_equity_freeze'] = xz[ u'司法股权冻结信息'] if xz.has_key(u'司法股权冻结信息') else [] page = self.crawl_page_by_url_post( urls['host'] + "/sfGuQuanChange/guQuanChange.html", post_data)['page'] xz = self.parse_page_2(page, 'gudongbiangeng', post_data) sub_json_dict['judical_assist_pub_shareholder_modify'] = xz[ u'司法股东变更登记信息'] if xz.has_key(u'司法股东变更登记信息') else [] except Exception as e: logging.debug( u"An error ocurred in crawl_other_dept_pub_pages: %s" % (type(e))) raise e finally: return sub_json_dict pass def get_raw_text_by_tag(self, tag): return tag.get_text().strip() #获得表头 def get_table_title(self, table_tag): if table_tag.find('tr'): if table_tag.find('tr').find_all('th'): if len(table_tag.find('tr').find_all('th')) > 1: return None # 处理 <th> aa<span> bb</span> </th> 
            if table_tag.find('tr').th.string == None and len(table_tag.find('tr').th.contents) > 1:  # fixed typo: was .stirng
                # 处理 <th> <span> bb</span> </th> 包含空格的
                if (table_tag.find('tr').th.contents[0]).strip():
                    return (table_tag.find('tr').th.contents[0])
            # <th><span> bb</span> </th>
            return self.get_raw_text_by_tag(table_tag.find('tr').th)
        elif table_tag.find('tr').find('td'):
            return self.get_raw_text_by_tag(table_tag.find('tr').td)
        return None

    def sub_column_count(self, th_tag):
        # colspan attribute values are strings, so convert before comparing
        if th_tag.has_attr('colspan') and int(th_tag.get('colspan')) > 1:
            return int(th_tag.get('colspan'))
        return 0

    def get_sub_columns(self, tr_tag, index, count):
        columns = []
        for i in range(index, index + count):
            th = tr_tag.find_all('th')[i]
            if not self.sub_column_count(th):
                columns.append((self.get_raw_text_by_tag(th), self.get_raw_text_by_tag(th)))
            else:
                #if has sub-sub columns
                columns.append((self.get_raw_text_by_tag(th),
                                self.get_sub_columns(tr_tag.nextSibling.nextSibling, 0, self.sub_column_count(th))))
        return columns

    #get column data recursively, use recursive because there may be table in table
    def get_column_data(self, columns, td_tag):
        if type(columns) == list:
            data = {}
            multi_col_tag = td_tag
            if td_tag.find('table'):
                multi_col_tag = td_tag.find('table').find('tr')
            if not multi_col_tag:
                logging.error('invalid multi_col_tag, multi_col_tag = %s', multi_col_tag)
                return data
            if len(columns) != len(multi_col_tag.find_all('td', recursive=False)):
                logging.error('column head size != column data size, columns head = %s, columns data = %s' % (columns, multi_col_tag.contents))
                return data
            for id, col in enumerate(columns):
                data[col[0]] = self.get_column_data(col[1], multi_col_tag.find_all('td', recursive=False)[id])
            return data
        else:
            return self.get_raw_text_by_tag(td_tag)

    def get_detail_link(self, bs4_tag):
        if bs4_tag['href'] and bs4_tag['href'] != '#':
            pattern = re.compile(r'http')
            if pattern.search(bs4_tag['href']):
                return bs4_tag['href']
            return urls['prefix_url'] + bs4_tag['href']
        elif bs4_tag['onclick']:
            return self.get_detail_link_onclick(bs4_tag)

    def get_detail_link_onclick(self, bs4_tag):
        re1 = '.*?'
# Non-greedy match on filler re2 = '(\\\'.*?\\\')' # Single Quote String 1 rg = re.compile(re1 + re2, re.IGNORECASE | re.DOTALL) m = rg.search(bs4_tag['onclick']) url = "" if m: strng1 = m.group(1) url = strng1.strip("\'") return url def get_columns_of_record_table(self, bs_table, page, table_name): tbody = None if len(bs_table.find_all('tbody')) > 1: tbody = bs_table.find_all('tbody')[1] else: tbody = bs_table.find('tbody') or BeautifulSoup( page, 'html5lib').find('tbody') tr = None if tbody: if len(tbody.find_all('tr')) <= 1: #tr = tbody.find('tr') tr = None else: tr = tbody.find_all('tr')[1] if not tr.find('th'): tr = tbody.find_all('tr')[0] elif tr.find('td'): tr = None else: if len(bs_table.find_all('tr')) <= 1: return None elif bs_table.find_all('tr')[0].find( 'th' ) and not bs_table.find_all('tr')[0].find('td') and len( bs_table.find_all('tr')[0].find_all('th')) > 1: tr = bs_table.find_all('tr')[0] elif bs_table.find_all('tr')[1].find( 'th' ) and not bs_table.find_all('tr')[1].find('td') and len( bs_table.find_all('tr')[1].find_all('th')) > 1: tr = bs_table.find_all('tr')[1] ret_val = self.get_record_table_columns_by_tr(tr, table_name) return ret_val def get_record_table_columns_by_tr(self, tr_tag, table_name): columns = [] if not tr_tag: return columns try: sub_col_index = 0 if len(tr_tag.find_all('th')) == 0: logging.error(u"The table %s has no columns" % table_name) return columns #排除仅仅出现一列重复的名字 count = 0 for i, th in enumerate(tr_tag.find_all('th')): col_name = self.get_raw_text_by_tag(th) #if col_name and ((col_name, col_name) not in columns) : if col_name: if ((col_name, col_name) in columns): col_name = col_name + '_' count += 1 if not self.sub_column_count(th): columns.append((col_name, col_name)) else: #has sub_columns columns.append((col_name, self.get_sub_columns( tr_tag.nextSibling.nextSibling, sub_col_index, self.sub_column_count(th)))) sub_col_index += self.sub_column_count(th) if count == len(tr_tag.find_all('th')) / 2: columns = columns[:len(columns) / 2] except Exception as e: logging.error( u'exception occured in get_table_columns, except_type = %s, table_name = %s' % (type(e), table_name)) finally: return columns # 如果是第二种: http://gsxt.gdgs.gov.cn/aiccips/ q情况 def parse_ent_pub_annual_report_page_2(self, base_page, page_type): page_data = {} soup = BeautifulSoup(base_page, 'html5lib') if soup.body.find('table'): try: base_table = soup.body.find('table') table_name = u'企业基本信息' #self.get_table_title(base_table) #这里需要连续两个nextSibling,一个nextSibling会返回空 detail_base_table = base_table.nextSibling.nextSibling if detail_base_table.name == 'table': page_data[table_name] = self.parse_table_2( detail_base_table) pass else: logging.error( u"Can't find details of base informations for annual report" ) except Exception as e: logging.error(u"fail to get table name with exception %s" % (type(e))) try: table = detail_base_table.nextSibling.nextSibling while table: if table.name == 'table': table_name = self.get_table_title(table) page_data[table_name] = [] columns = self.get_columns_of_record_table( table, base_page, table_name) page_data[table_name] = self.parse_table_2( table, columns, {}, table_name) table = table.nextSibling except Exception as e: logging.error( u"fail to parse the rest tables with exception %s" % (type(e))) else: pass return page_data def get_particular_table(self, table, page): """ 获取 股东及出资信息的表格,按照指定格式输出 """ table_dict = {} sub_dict = {} table_list = [] try: trs = table.find_all('tr') for tr in trs: if tr.find('td'): tds = tr.find_all('td') if len(tds) <= 1: 
                        continue
                    table_dict = {}  # fixed: create a fresh dict per row, otherwise table_list held references to one shared dict
                    table_dict[u'股东'] = self.get_raw_text_by_tag(tds[0])
                    table_dict[u'股东类型'] = self.get_raw_text_by_tag(tds[1])
                    sub_dict = {}
                    sub_dict[u'认缴出资额(万元)'] = self.get_raw_text_by_tag(tds[2])
                    sub_dict[u'认缴出资方式'] = self.get_raw_text_by_tag(tds[3])
                    sub_dict[u'认缴出资日期'] = self.get_raw_text_by_tag(tds[4])
                    table_dict['认缴明细'] = sub_dict
                    sub_dict = {}
                    sub_dict[u'实缴出资额(万元)'] = self.get_raw_text_by_tag(tds[5])
                    sub_dict[u'实缴出资方式'] = self.get_raw_text_by_tag(tds[6])
                    sub_dict[u'实缴出资时间'] = self.get_raw_text_by_tag(tds[7])
                    table_dict['实缴明细'] = sub_dict
                    table_dict['实缴额(万元)'] = self.get_raw_text_by_tag(tds[5])
                    table_dict['认缴额(万元)'] = self.get_raw_text_by_tag(tds[2])
                    table_list.append(table_dict)
        except Exception as e:
            logging.error(u'parse 股东及出资信息 table failed! : %s' % e)
        return table_list

    def parse_page(self, page, div_id='jibenxinxi'):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}
        try:
            div = soup.find('div', attrs={'id': div_id})
            if div:
                tables = div.find_all('table')
            else:
                tables = soup.find_all('table')
            #print table
            for table in tables:
                table_name = self.get_table_title(table)
                if table_name:
                    if table_name == u"股东及出资信息":
                        page_data[table_name] = self.get_particular_table(table, page)
                    else:
                        page_data[table_name] = self.parse_table(table, table_name, page)
        except Exception as e:
            logging.error(u'parse page failed, with exception %s' % e)
            raise e
        finally:
            return page_data

    def parse_page_2(self, page, div_id, post_data={}):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}
        if soup.body:
            if soup.body.table:
                try:
                    divs = soup.body.find('div', {"id": div_id})
                    table = None
                    if not divs:
                        table = soup.body.find('table')
                    else:
                        table = divs.find('table')
                    #print table
                    table_name = ""
                    columns = []
                    while table:
                        if table.name == 'table':
                            table_name = self.get_table_title(table)
                            if table_name is None:
                                table_name = div_id
                            page_data[table_name] = []
                            columns = self.get_columns_of_record_table(table, page, table_name)
                            result = self.parse_table_2(table, columns, post_data, table_name)
                            if not columns and not result:
                                del page_data[table_name]
                            else:
                                page_data[table_name] = result
                        elif table.name == 'div':
                            if not columns:
                                logging.error(u"Can not find columns when parsing page 2, table :%s" % div_id)
                                break
                            page_data[table_name] = self.parse_table_2(table, columns, post_data, table_name)
                            columns = []
                        table = table.nextSibling
                except Exception as e:
                    logging.error(u'parse failed, with exception %s' % e)
                    raise e
                finally:
                    pass
        return page_data

    def parse_table_2(self, bs_table, columns=[], post_data={}, table_name=""):
        table_dict = None
        try:
            tbody = bs_table.find('tbody')  # fixed: the old fallback BeautifulSoup(page, ...) referenced an undefined name `page`
            if columns:
                col_span = 0
                for col in columns:
                    if type(col[1]) == list:
                        col_span += len(col[1])
                    else:
                        col_span += 1
                column_size = len(columns)
                item_array = []
                # <div> <table>数据</table><table>下一页</table> </div>
                tables = bs_table.find_all('table')
                if len(tables) == 2 and tables[1].find('a'):
                    # 获取下一页的url
                    clickstr = tables[1].find('a')['onclick']
                    re1 = '.*?'  # Non-greedy match on filler
                    re2 = '\\\'.*?\\\''  # Uninteresting: strng
                    re3 = '.*?'  # Non-greedy match on filler
                    re4 = '(\\\'.*?\\\')'  # Single Quote String 1
                    re5 = '.*?'  # Non-greedy match on filler
                    re6 = '(\\\'.*?\\\')'  # Single Quote String 2
                    rg = re.compile(re1 + re2 + re3 + re4 + re5 + re6, re.IGNORECASE | re.DOTALL)
                    m = rg.search(clickstr)
                    url = ""
                    if m:
                        string1 = m.group(1)
                        string2 = m.group(2)
                        url = string1.strip('\'') + string2.strip('\'')
                    logging.debug(u"url = %s\n" % url)
                    data = {
                        "pageNo": 2,
                        "entNo": post_data["entNo"].encode('utf-8'),
                        "regOrg": post_data["regOrg"],
                        "entType": post_data["entType"].encode('utf-8'),
                    }
                    res = self.crawl_page_by_url_post(url, data)
                    #print res['page']
                    if table_name == u"变更信息":
                        # chaToPage
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [model['altFiledName'], model['altBe'], model['altAf'], model['altDate']]
                            item_array.append(dict(zip(titles, data)))
                    elif table_name == u"主要人员信息":
                        # vipToPage
                        d = json.loads(res['page'], encoding="utf-8")
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [i + 1, model['name'], model['position']]
                            item_array.append(dict(zip(titles, data)))
                    elif table_name == u"分支机构信息":
                        #braToPage
                        #print u"分支机构"
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [i + 1, model['regNO'], model['brName'].encode('utf8').decode('utf8'), model['regOrg'].encode('utf8')]
                            item_array.append(dict(zip(titles, data)))
                    elif table_name == u"股东信息":
                        #print "股东信息"
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [model['invType'], model['inv'], model['certName'], model['certNo']]  # fixed typo: was mode['certNo']
                            item_array.append(dict(zip(titles, data)))
                        pass
                    table_dict = item_array
                else:
                    if not tbody:
                        records_tag = tables[0]
                    else:
                        records_tag = tbody
                    for tr in records_tag.find_all('tr'):
                        if tr.find('td') and len(tr.find_all('td', recursive=False)) % column_size == 0:
                            col_count = 0
                            item = {}
                            print "table_name=%s" % table_name
                            for td in tr.find_all('td', recursive=False):
                                if td.find('a'):
                                    next_url = self.get_detail_link(td.find('a'))
                                    print next_url
                                    if re.match(r"http", next_url):
                                        detail_page = self.crawl_page_by_url(next_url)
                                        #html_to_file("next.html", detail_page['page'])
                                        if table_name == u'qiyenianbao':
                                            print "in table_name"
                                            page_data = self.parse_ent_pub_annual_report_page_2(detail_page['page'], table_name + '_detail')
                                            item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                                            item[u'详情'] = page_data
                                        else:
                                            page_data = self.parse_page(detail_page['page'], table_name + '_detail')
                                            item[columns[col_count][0]] = page_data  #this may be a detail page data
                                    else:
                                        item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                                else:
                                    item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                                col_count += 1
                                if col_count == column_size:
                                    item_array.append(item.copy())
                                    col_count = 0
                        #this case is for the ind-comm-pub-reg-shareholders----details'table
                        elif tr.find('td') and len(tr.find_all('td', recursive=False)) == col_span and col_span != column_size:
                            col_count = 0
                            sub_col_index = 0
                            item = {}
                            sub_item = {}
                            for td in tr.find_all('td', recursive=False):
                                if td.find('a'):
                                    #try to retrieve detail link from page
                                    next_url = self.get_detail_link(td.find('a'))
                                    #has detail link
                                    if next_url:
                                        detail_page = self.crawl_page_by_url(next_url)  # fixed: the extra ['page'] here made detail_page['page'] below fail
                                        if table_name == 'qiyenianbao':
                                            page_data = self.parse_ent_pub_annual_report_page_2(detail_page['page'], table_name + '_detail')
                                            item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                                            item[u'详情'] = 
page_data else: page_data = self.parse_page( detail_page['page'], table_name + '_detail') item[columns[col_count][ 0]] = page_data #this may be a detail page data else: item[columns[col_count] [0]] = self.get_column_data( columns[col_count][1], td) else: if type(columns[col_count][1]) == list: sub_key = columns[col_count][1][ sub_col_index][1] sub_item[ sub_key] = self.get_raw_text_by_tag( td) sub_col_index += 1 if sub_col_index == len( columns[col_count][1]): item[columns[col_count] [0]] = sub_item.copy() sub_item = {} col_count += 1 sub_col_index = 0 else: item[columns[col_count] [0]] = self.get_column_data( columns[col_count][1], td) col_count += 1 if col_count == column_size: item_array.append(item.copy()) col_count = 0 table_dict = item_array else: table_dict = {} for tr in bs_table.find_all('tr'): if tr.find('th') and tr.find('td'): ths = tr.find_all('th') tds = tr.find_all('td') if len(ths) != len(tds): logging.error( u'th size not equals td size in table %s, what\'s up??' % table_name) return else: for i in range(len(ths)): if self.get_raw_text_by_tag(ths[i]): table_dict[self.get_raw_text_by_tag( ths[i])] = self.get_raw_text_by_tag( tds[i]) except Exception as e: logging.error(u'parse table %s failed with exception %s' % (table_name, type(e))) raise e finally: return table_dict def crawl_page_by_url(self, url, header={}): self.requests.headers.update(header) r = self.requests.get(url) if r.status_code != 200: logging.error(u"Getting page by url:%s\n, return status %s\n" % (url, r.status_code)) # 为了防止页面间接跳转,获取最终目标url return {'page': r.text, 'url': r.url} def crawl_page_by_url_post(self, url, datas, header={}): self.requests.headers.update(header) r = self.requests.post(url, data=datas) if r.status_code != 200: logging.error( u"Getting page by url with post:%s\n, return status %s\n" % (url, r.status_code)) return {'page': r.text, 'url': r.url} # main function def run(self, ent_num): if not os.path.exists(self.html_restore_path): os.makedirs(self.html_restore_path) json_dict = {} self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'], urls['page_showinfo'], ent_num) data = self.crawl_page_main() json_dict[ent_num] = data #json_dump_to_file(self.json_restore_path , json_dict) # 2016-2-16 return json.dumps(json_dict) def work(self, ent_num): # if not os.path.exists(self.html_restore_path): # os.makedirs(self.html_restore_path) self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'], urls['page_showinfo'], ent_num) data = self.crawl_page_main() json_dump_to_file('hainan.json', data)
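# --- Illustrative usage sketch, not part of the original source ---
# HainanCrawler.run() drives the whole flow: crack the search captcha, walk
# the four publicity sections and return a JSON string keyed by the
# registration number. The restore path below is hypothetical; the sample
# number is the default `textfield` value used in crawl_page_captcha above.
if __name__ == '__main__':
    crawler = HainanCrawler(json_restore_path='./data')
    print crawler.run('460000000265072')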
class ChongqingClawer(Crawler): """重庆工商公示信息网页爬虫 """ # 多线程爬取时往最后的json文件中写时的加锁保护 urls = { 'host': 'http://gsxt.cqgs.gov.cn', 'get_checkcode': 'http://gsxt.cqgs.gov.cn/sc.action?width=130&height=40', 'repost_checkcode': 'http://gsxt.cqgs.gov.cn/search_research.action', # 获得查询页面 'post_checkcode': 'http://gsxt.cqgs.gov.cn/search.action', # 根据查询页面获得指定公司的数据 'search_ent': 'http://gsxt.cqgs.gov.cn/search_getEnt.action', # 年报 'year_report': 'http://gsxt.cqgs.gov.cn/search_getYearReport.action', # 年报详情 'year_report_detail': 'http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action', # 股权变更 'year_daily_transinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action', # 股东出资信息 'year_daily_invsub': 'http://gsxt.cqgs.gov.cn/search_getDaily.action', # 行政处罚 'year_daily_peninfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action', # 行政许可 'year_daily_licinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action', # 知识产权出质登记 'year_daily_pleinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action', # 其他行政许可信息 'other_qlicinfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action', # 其他行政处罚 'other_qpeninfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action', # 股权冻结信息 'sfxz_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZ.action', # 股东变更信息 'sfxzgdbg_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZGDBG.action', } write_file_mutex = threading.Lock() def __init__(self, json_restore_path): """ 初始化函数 Args: json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时, 需要在写入文件的时候加锁 Returns: """ # json 数据集 # POST self.json_restore_path = json_restore_path if os.path.exists(self.json_restore_path) is False: os.makedirs(self.json_restore_path, 0775) self.parser = ChongqingParser(self) self.credit_ticket = None #html数据的存储路径 self.html_restore_path = os.path.join(self.json_restore_path, "/chongqing/") if os.path.exists(self.html_restore_path) is False: os.makedirs(self.html_restore_path, 0775) #验证码图片的存储路径 self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg') self.code_cracker = CaptchaRecognition("chongqing") self.ent_number = None # GET self.ckcode = None self.json_ent_info = None self.json_sfxzgdbg = None self.json_sfxz = None self.json_other_qlicinfo = None self.json_other_qpeninfo = None self.json_year_report = None self.json_year_report_detail = None self.json_year_daily_transinfo = None self.json_year_daily_invsub = None self.json_year_daily_peninfo = None self.json_year_daily_licinfo = None self.json_year_daily_pleinfo = None self.json_dict = {} self.json_restore_path = json_restore_path self.parser = ChongqingParser(self) self.reqst = requests.Session() self.reqst.headers.update({ 'Accept': 'text/html, application/xhtml+xml, */*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'}) def run(self, ent_number=0): self.ent_number = str(ent_number) page = self.crawl_check_page() try: self.crawl_page_jsons(page) self.parser.parse_jsons() self.parser.merge_jsons() except Exception as e: # logging.error('error') return None return json.dumps({self.ent_number: self.json_dict}) def crawl_check_page(self): """爬取验证码页面,包括下载验证码图片以及破解验证码 :return true or false """ count = 0 while count < 30: ck_code = self.crack_check_code() data = {'key':self.ent_number,'code':ck_code} resp = self.reqst.post(ChongqingClawer.urls['post_checkcode'], data=data) if resp.status_code != 200: logging.error("crawl post check page failed!") count += 1 continue return resp.content return 
None def crack_check_code(self): """破解验证码 :return 破解后的验证码 """ resp = self.reqst.get(ChongqingClawer.urls['get_checkcode']) if resp.status_code != 200: logging.error('failed to get get_checkcode') return None time.sleep(random.uniform(0.1, 0.2)) self.write_file_mutex.acquire() with open(self.ckcode_image_path, 'wb') as f: f.write(resp.content) try: ckcode = self.code_cracker.predict_result(self.ckcode_image_path) # ckcode = self.code_cracker.predict_result(self.ckcode_image_dir_path + 'image' + str(i) + '.jpg') except Exception as e: logging.warn('exception occured when crack checkcode') ckcode = ('', '') finally: pass self.write_file_mutex.release() return ckcode[1] def crack_checkcode(self): """破解验证码 :return 破解后的验证码 """ resp = self.reqst.get(ChongqingClawer.urls['get_checkcode']) if resp.status_code != 200: logging.error('failed to get get_checkcode') print 'error' return None time.sleep(random.uniform(2, 4)) self.write_file_mutex.acquire() self.ckcode_image_path = settings.json_restore_path + '/chongqing/ckcode.jpg' with open(self.ckcode_image_path, 'wb') as f: f.write(resp.content) self.write_file_mutex.release() try: ckcode = self.code_cracker.predict_result(self.ckcode_image_path) except Exception as e: logging.warn('exception occured when crack checkcode') ckcode = ('', '') finally: pass return ckcode[1] def crawl_page_jsons(self,page): """获取所有界面的json数据""" data = self.parser.parse_search_results_pages(page) if data is not None: self.crawl_ent_info_json(data) self.crawl_year_report_json(data) self.crawl_year_report_detail_json(data) # print(self.json_year_report_detail) # time.sleep(0.1) self.crawl_sfxzgdbg_json(data) # print(self.json_sfxzgdbg) # time.sleep(0.1) self.crawl_sfxz_json(data) # print(self.json_sfxz) # time.sleep(0.1) self.crawl_year_daily_invsub_json(data) # print(self.json_year_daily_invsub) # time.sleep(0.1) self.crawl_year_daily_licinfo_json(data) # print(self.json_year_daily_licinfo) # time.sleep(0.1) self.crawl_year_daily_peninfo_json(data) # print(self.json_year_daily_peninfo) # time.sleep(0.1) self.crawl_year_daily_transinfo_json(data) # print(self.json_year_daily_transinfo) # time.sleep(0.1) self.crawl_year_daily_pleinfo_json(data) # print(self.json_year_daily_pleinfo) # time.sleep(0.1) self.crawl_other_qpeninfo_json(data) # print(self.json_other_qpeninfo) # time.sleep(0.1) self.crawl_other_qlicinfo_json(data) # print(self.json_other_qlicinfo) else: print('error') def crawl_ent_info_json(self, data, type=1): """企业详细信息""" params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': type} json_data = self.reqst.get(ChongqingClawer.urls['search_ent'], params=params) if json_data.status_code == 200: json_data = json_data.content json_data = str(json_data) self.json_ent_info = json_data[6:] # 去掉数据中的前六个字符保证数据为完整json格式数据 if self.json_ent_info is None or 'base' not in self.json_ent_info: self.crawl_ent_info_json(data, type=10) # 有些公司需要传过去的参数为 10 # print(self.json_ent_info) def crawl_year_report_json(self, data): """年报数据""" params = {'id': data.get('id'), 'type': 1} json_data = self.reqst.get(ChongqingClawer.urls['year_report'], params=params) while json_data.status_code != 200: json_data = self.reqst.get(ChongqingClawer.urls['year_report'], params=params) json_data = json_data.content json_data = str(json_data) self.json_year_report = json_data[6:] # 去掉数据中的前六个字符保证数据为完整json格式数据 # print(self.json_year_report) def crawl_year_report_detail_json(self, data): """详细年报""" # TO DO 需要获得 year_report 中的年份信息 while self.json_year_report is None: 
self.crawl_year_report_json(data) year_report = json.loads(self.json_year_report, encoding='utf-8') histories = year_report.get('history') for i in range(len(histories)): year = histories[i].get('year') params = {'id': data.get('id'), 'type': 1, 'year': str(year)} json_data = self.reqst.get(ChongqingClawer.urls['year_report_detail'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data self.json_year_report_detail = str(json_data.content) # print(self.json_year_report_detail) def crawl_year_daily_transinfo_json(self, data): """股权变更""" params = {'id': data.get('id'), 'jtype': 'transinfo'} json_data = self.reqst.get(ChongqingClawer.urls['year_daily_transinfo'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_year_daily_transinfo = json_data[6:] # print(self.json_year_daily_transinfo) def crawl_year_daily_pleinfo_json(self, data): """行政许可""" params = {'id': data.get('id'), 'jtype': 'pleinfo'} json_data = self.reqst.get(ChongqingClawer.urls['year_daily_pleinfo'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_year_daily_pleinfo = json_data[6:] # print(self.json_year_daily_pleinfo) def crawl_year_daily_invsub_json(self, data): """股东出资信息""" params = {'id': data.get('id'), 'jtype': 'invsub'} json_data = self.reqst.get(ChongqingClawer.urls['year_daily_invsub'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_year_daily_invsub = json_data[6:] # print(self.json_year_daily_invsub) def crawl_year_daily_licinfo_json(self, data): """行政许可""" params = {'id': data.get('id'), 'jtype': 'licinfo'} json_data = self.reqst.get(ChongqingClawer.urls['year_daily_licinfo'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_year_daily_licinfo = json_data[6:] # print(self.json_year_daily_licinfo) def crawl_year_daily_peninfo_json(self, data): """行政处罚""" params = {'id': data.get('id'), 'jtype': 'peninfo'} json_data = self.reqst.get(ChongqingClawer.urls['year_daily_peninfo'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_year_daily_peninfo = json_data[6:] # print(self.json_year_daily_peninfo) def crawl_sfxzgdbg_json(self, data): """股东变更信息""" params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1} json_data = self.reqst.get(ChongqingClawer.urls['sfxzgdbg_page'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_sfxzgdbg = json_data[6:] # print(self.json_sfxzgdbg) def crawl_sfxz_json(self, data): """股权冻结信息""" params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1} json_data = self.reqst.get(ChongqingClawer.urls['sfxz_page'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = str(json_data) self.json_sfxz = json_data[6:] # print(self.json_sfxz) def crawl_other_qlicinfo_json(self, data): """股东出资信息""" params = {'entId': data.get('entId'), 'id': data.get('id'), 'qtype': 'Qlicinfo', 'type': 1} json_data = self.reqst.get(ChongqingClawer.urls['other_qlicinfo'], params=params) if json_data.status_code == 200: # 此页面响应结果直接就是 json_data json_data = json_data.content json_data = 
str(json_data)
            self.json_other_qlicinfo = json_data[6:]
            # print(self.json_other_qlicinfo)

    def crawl_other_qpeninfo_json(self, data):
        """其他部门行政处罚信息"""  # fixed: docstring previously said 股东出资信息, but Qpeninfo is the penalty endpoint
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'qtype': 'Qpeninfo', 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['other_qpeninfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qpeninfo = json_data[6:]
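# --- Illustrative sketch, not part of the original source ---
# Every ChongqingClawer endpoint above strips the first six characters of the
# response body (`json_data[6:]`) before treating the remainder as JSON,
# since the server prepends a fixed-length junk prefix to the payload. The
# helper below just names that convention; the example prefix is made up.
import json

def strip_prefix_and_parse(raw, prefix_len=6):
    """Drop the fixed-length junk prefix and parse the rest as JSON."""
    return json.loads(raw[prefix_len:])

# e.g. strip_prefix_and_parse('XXXXXX{"base": {}}') returns {u'base': {}}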
class BeijingCrawler(Crawler): """北京工商爬虫 """ code_cracker = CaptchaRecognition('beijing') #多线程爬取时往最后的json文件中写时的加锁保护 write_file_mutex = threading.Lock() urls = { 'host': 'http://qyxy.baic.gov.cn', 'official_site': 'http://qyxy.baic.gov.cn/beijing', 'get_checkcode': 'http://qyxy.baic.gov.cn', 'post_checkcode': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!checkCode.dhtml', 'open_info_entry': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!getBjQyList.dhtml', 'ind_comm_pub_reg_basic': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!openEntInfo.dhtml?', 'ind_comm_pub_reg_shareholder': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!tzrFrame.dhtml?', 'ind_comm_pub_reg_modify': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!biangengFrame.dhtml?', 'ind_comm_pub_arch_key_persons': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!zyryFrame.dhtml?', 'ind_comm_pub_arch_branch': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!fzjgFrame.dhtml?', 'ind_comm_pub_arch_liquidation': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!qsxxFrame.dhtml?', 'ind_comm_pub_movable_property_reg': 'http://qyxy.baic.gov.cn/gjjbjTab/gjjTabQueryCreditAction!dcdyFrame.dhtml?', 'ind_comm_pub_equity_ownership_reg': 'http://qyxy.baic.gov.cn/gdczdj/gdczdjAction!gdczdjFrame.dhtml?', 'ind_comm_pub_administration_sanction': 'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list.dhtml?', 'ind_comm_pub_business_exception': 'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_jyycxx.dhtml?', 'ind_comm_pub_serious_violate_law': 'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_yzwfxx.dhtml?', 'ind_comm_pub_spot_check': 'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_ccjcxx.dhtml?', 'ent_pub_ent_annual_report': 'http://qyxy.baic.gov.cn/qynb/entinfoAction!qyxx.dhtml?', 'ent_pub_shareholder_capital_contribution': 'http://qyxy.baic.gov.cn/gdcz/gdczAction!list_index.dhtml?', 'ent_pub_equity_change': 'http://qyxy.baic.gov.cn/gdgq/gdgqAction!gdgqzrxxFrame.dhtml?', 'ent_pub_administration_license': 'http://qyxy.baic.gov.cn/xzxk/xzxkAction!list_index.dhtml?', 'ent_pub_knowledge_property': 'http://qyxy.baic.gov.cn/zscqczdj/zscqczdjAction!list_index.dhtml?', 'ent_pub_administration_sanction': 'http://qyxy.baic.gov.cn/gdgq/gdgqAction!qyxzcfFrame.dhtml?', 'other_dept_pub_administration_license': 'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzxk.dhtml?', 'other_dept_pub_administration_sanction': 'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzcf.dhtml?', 'shareholder_detail': 'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!touzirenInfo.dhtml?' 
} def __init__(self, json_restore_path=None): self.json_restore_path = json_restore_path #html数据的存储路径 html_restore_path = self.json_restore_path + '/beijing/' #验证码图片的存储路径 ckcode_image_path = self.json_restore_path + '/beijing/ckcode.jpg' self.parser = BeijingParser(self) self.credit_ticket = None if not os.path.exists(self.html_restore_path): os.makedirs(self.html_restore_path) self.timeout = 20 def run(self, ent_number): """爬取的主函数 """ self.ent_id = '' return Crawler.run(self, ent_number) def crawl_page_by_url(self, url): resp = None try: resp = self.reqst.get(url, timeout=self.timeout, proxies=self.proxies) except requests.exceptions.ConnectionError: self.proxies = Proxies().get_proxies() logging.error("get method self.proxies changed proxies = %s\n" % (self.proxies)) return self.crawl_page_by_url(url) except requests.exceptions.Timeout: self.timeout += 5 logging.error( "get method self.timeout plus timeout = %d, proxies= %s\n" % (self.timeout, self.proxies)) return self.crawl_page_by_url(url) except Exception as e: logging.error("Other exception occured!type e = %s, proxies=%s\n" % (type(e), self.proxies)) return resp def crawl_page_by_url_post(self, url, data): resp = None try: resp = self.reqst.post(url, data, timeout=self.timeout, proxies=self.proxies) except requests.exceptions.ConnectionError: self.proxies = Proxies().get_proxies() logging.error("post method self.proxies changed. proxies = %s\n" % (self.proxies)) return self.crawl_page_by_url_post(url, data) except requests.exceptions.Timeout: self.timeout += 5 logging.error( "post method self.timeout plus, timeout= %d, proxies= %s\n" % (self.timeout, self.proxies)) return self.crawl_page_by_url_post(url, data) except Exception as e: logging.error( "Other exception occured!type e = %s, proxies=%s \n" % (type(e), self.proxies)) return resp def crawl_check_page(self): """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交 """ resp = self.crawl_page_by_url(self.urls['official_site']) if resp.status_code != 200: logging.error('failed to get official site page!') return False count = 0 while count < 15: count += 1 ckcode = self.crack_checkcode() if not ckcode[1]: logging.error( 'failed to get crackcode result, fail count = %d' % (count)) continue post_data = { 'currentTimeMillis': self.time_stamp, 'credit_ticket': self.credit_ticket, 'checkcode': ckcode[1], 'keyword': self.ent_number } next_url = self.urls['post_checkcode'] resp = self.crawl_page_by_url_post(next_url, data=post_data) if resp.status_code != 200: logging.error( 'failed to get crackcode image by url %s, fail count = %d' % (next_url, count)) continue logging.error('crack code = %s, %s, response = %s' % (ckcode[0], ckcode[1], resp.content)) if resp.content == 'fail': logging.error( 'crack checkcode failed, response content = failed, total fail count = %d' % count) time.sleep(random.uniform(0.1, 2)) continue next_url = self.urls['open_info_entry'] resp = self.crawl_page_by_url_post(next_url, data=post_data) if resp.status_code != 200: logging.error( 'failed to open info entry by url %s, fail count = %d' % (next_url, count)) continue crack_result = self.parse_post_check_page(resp.content) if crack_result: return True else: logging.error('crack checkcode failed, total fail count = %d' % count) time.sleep(random.uniform(3, 5)) return False def crawl_ind_comm_pub_pages(self): """爬取工商公示信息页面 """ for item in ( 'ind_comm_pub_reg_basic', # 登记信息-基本信息 'ind_comm_pub_reg_shareholder', # 股东信息 'ind_comm_pub_reg_modify', 'ind_comm_pub_arch_key_persons', # 备案信息-主要人员信息 'ind_comm_pub_arch_branch', # 备案信息-分支机构信息 
'ind_comm_pub_arch_liquidation', # 备案信息-清算信息 'ind_comm_pub_movable_property_reg', # 动产抵押登记信息 'ind_comm_pub_equity_ownership_reg', # 股权出置登记信息 'ind_comm_pub_administration_sanction', # 行政处罚信息 'ind_comm_pub_business_exception', # 经营异常信息 'ind_comm_pub_serious_violate_law', # 严重违法信息 'ind_comm_pub_spot_check' # 抽查检查信息 ): self.get_page_json_data(item, 1) time.sleep(random.uniform(0, 3)) def crawl_ent_pub_pages(self): """爬取企业公示信息页面 """ for item in ( 'ent_pub_ent_annual_report', 'ent_pub_shareholder_capital_contribution', #企业投资人出资比例 'ent_pub_equity_change', #股权变更信息 'ent_pub_administration_license', #行政许可信息 'ent_pub_knowledge_property', #知识产权出资登记 'ent_pub_administration_sanction' #行政许可信息 ): self.get_page_json_data(item, 2) time.sleep(random.uniform(0, 3)) def crawl_other_dept_pub_pages(self): """爬取其他部门公示信息页面 """ for item in ( 'other_dept_pub_administration_license', #行政许可信息 'other_dept_pub_administration_sanction' #行政处罚信息 ): self.get_page_json_data(item, 3) def crawl_judical_assist_pub_pages(self): """爬取司法协助信息页面 """ pass def get_page_json_data(self, page_name, page_type): """获得页面的解析后的json格式数据 Args: page_name: 页面名称 page_type: 页面类型, 1 工商公示页面, 2 企业公示页面, 3 其他部门公示页面 """ page = self.get_page(page_name, page_type) pages = self.get_all_pages_of_a_section(page, page_name) if len(pages) == 1: self.json_dict[page_name] = {} json_data = self.parser.parse_page(page, page_name) if json_data: self.json_dict[page_name] = json_data else: self.json_dict[page_name] = [] for p in pages: json_data = self.parser.parse_page(p, page_name) if json_data: self.json_dict[page_name] += json_data def get_checkcode_url(self): count = 0 while count < 5: count += 1 resp = self.crawl_page_by_url(self.urls['official_site']) time.sleep(random.uniform(1, 5)) if resp.status_code != 200: logging.error('failed to get crackcode url') continue response = resp.content soup = BeautifulSoup(response, 'html.parser') ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src') ckimg_src = str(ckimg_src) re_checkcode_captcha = re.compile(r'/([\s\S]*)\?currentTimeMillis') # re_currenttime_millis=re.compile(r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)') checkcode_type = re_checkcode_captcha.findall(ckimg_src)[0] if checkcode_type == 'CheckCodeCaptcha': #parse the pre check page, get useful information self.parse_pre_check_page(response) checkcode_url = self.urls['get_checkcode'] + ckimg_src return checkcode_url # elif checkcode_type == 'CheckCodeYunSuan': logging.error( 'can not get CheckCodeCaptcha type of checkcode img, count times = %d \n' % (count)) return None def parse_post_check_page(self, page): """解析提交验证码之后的页面,获取必要的信息 """ if page == 'fail': logging.error('checkcode error!') # if senting_open: # senting_client.captureMessage('checkcode error!') return False soup = BeautifulSoup(page, 'html.parser') r = soup.find_all('a', { 'href': "#", 'onclick': re.compile(r'openEntInfo') }) ent = '' if r: ent = r[0]['onclick'] else: logging.error('fail to find openEntInfo') return False m = re.search(r'\'([\w]*)\'[ ,]+\'([\w]*)\'[ ,]+\'([\w]*)\'', ent) if m: self.ent_id = m.group(1) self.credit_ticket = m.group(3) r = soup.find_all( 'input', { 'type': "hidden", 'name': "currentTimeMillis", 'id': "currentTimeMillis" }) if r: self.time_stamp = r[0]['value'] else: logging.error('fail to get time stamp') return True def parse_pre_check_page(self, page): """解析提交验证码之前的页面 """ soup = BeautifulSoup(page, 'html.parser') ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src') ckimg_src = str(ckimg_src) re_currenttime_millis = re.compile( 
r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)') self.credit_ticket = soup.find_all('input', id='credit_ticket')[0].get('value') self.time_stamp = re_currenttime_millis.findall(ckimg_src)[0] # self.time_stamp = self.generate_time_stamp() """ def crawl_page_by_url(self, url): resp = self.crawl_page_by_url(url) if resp.status_code != 200: logging.error('failed to crawl page by url' % url) return page = resp.content time.sleep(random.uniform(0.2, 1)) # if saveingtml: # CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page) return page """ def get_all_pages_of_a_section(self, page, type, url=None): """获取页面上含有 上一页、下一页跳转链接的区域的所有的数据 Args: page: 已经爬取的页面 type: 页面类型 url: 该页面的url,默认为None,因为一般可以通过 type 从 BeijingCrawler.urls 中找到 Returns: pages: 所有页面的列表 """ if not page: return page soup = BeautifulSoup(page, 'html.parser') page_count = 0 page_size = 0 pages_data = [] pages_data.append(page) r1 = soup.find_all('input', {'type': 'hidden', 'id': 'pagescount'}) r2 = soup.find_all('input', { 'type': 'hidden', 'id': 'pageSize', 'name': 'pageSize' }) if r1 and r2: page_count = int(r1[0].get('value')) page_size = int(r2[0].get('value')) else: #只有一页 return pages_data if page_count <= 1: return pages_data if not url: next_url = self.urls[type].rstrip('?') else: next_url = url for p in range(1, page_count): post_data = { 'pageNos': str(p + 1), 'clear': '', 'pageNo': str(p), 'pageSize': str(page_size), 'ent_id': self.ent_id } try: resp = self.crawl_page_by_url_post(next_url, data=post_data) if resp.status_code != 200: logging.error('failed to get all page of a section') return pages_data page = resp.content time.sleep(random.uniform(0.2, 1)) except Exception as e: logging.error( 'open new tab page failed, url = %s, page_num = %d' % (next_url, p + 1)) page = None raise e finally: if page: pages_data.append(page) return pages_data def get_page(self, type, tab): """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响 Args: tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息 """ url = CrawlerUtils.add_params_to_url( self.urls[type], { 'entId': self.ent_id, 'ent_id': self.ent_id, 'entid': self.ent_id, 'credit_ticket': self.credit_ticket, 'entNo': self.ent_number, 'entName': '', 'timeStamp': self.generate_time_stamp(), 'clear': 'true', 'str': tab }) logging.error('get %s, url:\n%s\n' % (type, url)) resp = self.crawl_page_by_url(url) if resp.status_code != 200: logging.error('get page failed by url %s' % url) return page = resp.content time.sleep(random.uniform(0.2, 1)) return page def crack_checkcode(self): """破解验证码""" ckcode = ('', '') checkcode_url = self.get_checkcode_url() if checkcode_url == None: return ckcode resp = self.crawl_page_by_url(checkcode_url) if resp.status_code != 200: logging.error('failed to get checkcode img') return ckcode page = resp.content time.sleep(random.uniform(1, 2)) self.write_file_mutex.acquire() with open(self.ckcode_image_path, 'wb') as f: f.write(page) if not self.code_cracker: logging.error('invalid code cracker\n') return ckcode try: ckcode = self.code_cracker.predict_result(self.ckcode_image_path) except Exception as e: logging.error('exception occured when crack checkcode') ckcode = ('', '') finally: pass self.write_file_mutex.release() return ckcode def generate_time_stamp(self): """生成时间戳 """ return int(time.time())
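# --- Illustrative sketch, not part of the original source ---
# get_all_pages_of_a_section() above paginates by reading the hidden
# `pagescount` / `pageSize` inputs from the first page and re-POSTing the
# same form with an incremented `pageNos`. The same idea in isolation;
# `fetch` is a hypothetical callable standing in for crawl_page_by_url_post.
from bs4 import BeautifulSoup

def iter_section_pages(first_page, fetch):
    yield first_page
    soup = BeautifulSoup(first_page, 'html.parser')
    count_tag = soup.find('input', {'type': 'hidden', 'id': 'pagescount'})
    size_tag = soup.find('input', {'type': 'hidden', 'id': 'pageSize'})
    if not (count_tag and size_tag):
        return  # no pagination inputs: the section fits on one page
    for p in range(1, int(count_tag.get('value'))):
        yield fetch({'pageNos': str(p + 1),
                     'pageNo': str(p),
                     'pageSize': size_tag.get('value'),
                     'clear': ''})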
class SichuanCrawler(object): """ 四川爬虫, 继承object, 验证码与陕西一致。""" write_file_mutex = threading.Lock() def __init__(self, json_restore_path=None): self.pripid = None self.cur_time = str(int(time.time() * 1000)) self.reqst = requests.Session() self.reqst.headers.update(headers) adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100) self.reqst.mount('http://', adapter) self.json_restore_path = json_restore_path self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg' #html数据的存储路径 self.html_restore_path = self.json_restore_path + '/sichuan/' self.code_cracker = CaptchaRecognition('sichuan') self.result_json_dict = {} self.json_list = [] proxies = get_proxy('shaanxi') if proxies: print proxies self.reqst.proxies = proxies self.timeout = (30, 20) self.ents = {} self.mydict = { 'eareName': 'http://www.ahcredit.gov.cn', 'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=', 'searchList': 'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=', 'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm' } self.one_dict = {u'基本信息': 'ind_comm_pub_reg_basic', u'股东信息': 'ind_comm_pub_reg_shareholder', u'发起人信息': 'ind_comm_pub_reg_shareholder', u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder', u'变更信息': 'ind_comm_pub_reg_modify', u'主要人员信息': 'ind_comm_pub_arch_key_persons', u'分支机构信息': 'ind_comm_pub_arch_branch', u'清算信息': 'ind_comm_pub_arch_liquidation', u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg', u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg', u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg', u'行政处罚信息': 'ind_comm_pub_administration_sanction', u'经营异常信息': 'ind_comm_pub_business_exception', u'严重违法信息': 'ind_comm_pub_serious_violate_law', u'抽查检查信息': 'ind_comm_pub_spot_check'} self.two_dict = { u'企业年报': 'ent_pub_ent_annual_report', u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution', u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution', u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution', u'股东及出资信息': 'ent_pub_shareholder_capital_contribution', u'股权变更信息': 'ent_pub_equity_change', u'行政许可信息': 'ent_pub_administration_license', u'知识产权出资登记': 'ent_pub_knowledge_property', u'知识产权出质登记信息': 'ent_pub_knowledge_property', u'行政处罚信息': 'ent_pub_administration_sanction', u'变更信息': 'ent_pub_shareholder_modify' } self.three_dict = {u'行政许可信息': 'other_dept_pub_administration_license', u'行政处罚信息': 'other_dept_pub_administration_sanction'} self.four_dict = {u'股权冻结信息': 'judical_assist_pub_equity_freeze', u'司法股权冻结信息': 'judical_assist_pub_equity_freeze', u'股东变更信息': 'judical_assist_pub_shareholder_modify', u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'} def get_check_num(self): # print self.mydict['search']+self.cur_time resp = self.reqst.get(self.mydict['search'] + self.cur_time, timeout=self.timeout) if resp.status_code != 200: # print resp.status_code return None # print BeautifulSoup(resp.content).prettify resp = self.reqst.get(self.mydict['validateCode'] + '&dt=%s&random=%s' % (self.cur_time, self.cur_time), timeout=self.timeout) if resp.status_code != 200: # print 'no validateCode' return None with open(self.ckcode_image_path, 'wb') as f: f.write(resp.content) ck_code = self.code_cracker.predict_result(self.ckcode_image_path) if ck_code is None: return None else: return ck_code[1] def analyze_showInfo(self, page): soup = BeautifulSoup(page, 'html.parser') divs = soup.find_all('div', attrs={ "style": "width:950px; padding:25px 20px 0px; overflow: hidden;float: left;" }) if divs: try: Ent = {} count = 0 for div in divs: count += 1 link = 
div.find('li') url = "" if link and link.find('a') and link.find('a').has_attr('onclick'): url = link.find('a')['onclick'] ent = "" profile = link.find_next_sibling() if profile and profile.span: ent = profile.span.get_text().strip() name = link.find('a').get_text().strip() if self.ent_num == name or self.ent_num == ent: Ent.clear() Ent[ent] = url break if count == 3: break if not Ent: return False self.ents = Ent return True except: logging.error(u"%s" % (traceback.format_exc(10))) return False def get_id_num(self, findCode): count = 0 while count < 20: yzm = self.get_check_num() print yzm count += 1 if yzm is None: continue data = {'currentPageNo': '1', 'yzm': yzm, 'cxym': "cxlist", 'maent.entname': findCode} resp = self.reqst.post(self.mydict['searchList'] + self.cur_time, data=data, timeout=self.timeout) if self.analyze_showInfo(resp.content): return True print "crawl %s times:%d" % (findCode, count) time.sleep(random.uniform(1, 4)) return False def help_dcdy_get_dict(self, method, maent_pripid, maent_xh, random): data = {'method': method, 'maent.pripid': maent_pripid, 'maent.xh': maent_xh, 'random': random} resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout) needdict = {} for table in BeautifulSoup(resp.content, 'html.parser').find_all('table'): dcdy_head, dcdy_allths, dcdy_alltds = self.get_head_ths_tds(table) needdict[dcdy_head] = self.get_one_to_one_dict(dcdy_allths, dcdy_alltds) return needdict def help_enter_get_dict(self, method, maent_pripid, year, random): data = {'method': method, 'maent.pripid': maent_pripid, 'maent.nd': year, 'random': random} resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout) #print resp.status_code #print BeautifulSoup(resp.content).prettify needdict = {} for i, table in enumerate(BeautifulSoup(resp.content, 'html.parser').find_all('table')): enter_head, enter_allths, enter_alltds = self.get_head_ths_tds(table) if i == 0: try: enter_head = enter_allths[0] enter_allths = enter_allths[1:] except: enter_head = u'企业基本信息' enter_allths = [u'注册号/统一社会信用代码', u'企业名称', u'企业联系电话', u'邮政编码', \ u'企业通信地址', u'企业电子邮箱', u'有限责任公司本年度是否发生股东股权转让', u'企业经营状态', \ u'是否有网站或网店', u'是否有投资信息或购买其他公司股权', u'从业人数'] if enter_head == u'股东及出资信息': enter_allths = [u'股东', u'认缴出资额(万元)', u'认缴出资时间', u'认缴出资方式', u'实缴出资额(万元)', u'出资时间', u'出资方式'] #self.test_print_all_ths_tds(enter_head, enter_allths, enter_alltds) needdict[enter_head] = self.get_one_to_one_dict(enter_allths, enter_alltds) if enter_head == u'企业基本信息' or enter_head == u'企业资产状况信息': needdict[enter_head] = self.get_one_to_one_dict(enter_allths, enter_alltds)[0] return needdict def help_detail_get_dict(self, method, maent_xh, maent_pripid, random): data = {'method': method, 'maent.xh': maent_xh, 'maent.pripid': maent_pripid, 'random': random} resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout) # print resp.status_code # print BeautifulSoup(resp.content).prettify for table in BeautifulSoup(resp.content, 'html.parser').find_all('table'): if table.find_all('th') and table.find_all('th')[0].get_text().strip() == u'股东及出资信息': #print table detail_head, detail_allths, detail_alltds = self.get_head_ths_tds(table) # self.test_print_all_ths_tds(detail_head, detail_allths, detail_alltds) tempdict = {} for key, value in zip(detail_allths[:3], detail_alltds[:3]): tempdict[key] = value onelist_dict = {} for key, value in zip(detail_allths[3:], detail_alltds[3:]): onelist_dict[key] = value.split('\n')[-1] if value else None tempdict['list'] 
= [onelist_dict] return {u'股东及出资信息': [tempdict]} break def get_head_ths_tds(self, table): # print table try: head = table.find_all('th')[0].get_text().strip().split('\n')[0].strip() except: head = None pass allths = [th.get_text().strip() for th in table.find_all('th')[1:] if th.get_text()] for i, th in enumerate(allths): if th[:2] == '<<' or th[-2:] == '>>': allths = allths[:i] break alltds = [td.get_text().strip() if td.get_text() else None for td in table.find_all('td')] if head == u'变更信息' or head == u'修改记录' or head == u'行政处罚信息': alltds = [] for td in table.find_all('td'): if td.get_text(): if len(td.find_all('span')) > 1: alltds.append(td.find_all('span')[1].get_text().strip().split('\n')[0].strip()) else: alltds.append(td.get_text().strip()) else: alltds.append(None) if head == u'主要人员信息': allths = allths[:int(len(allths) / 2)] if head == u'股东及出资信息': allths = allths[:3] + allths[5:] if head == u'股东信息': alltds = [] for td in table.find_all('td'): if td.find('a'): onclick = td.a['onclick'] m = re.search(r"showRyxx\(\'(\w+?)\',\'(\w+?)\'\)", onclick) if m: maent_xh = m.group(1) maent_pripid = m.group(2) #print 'maent_xh',':', maent_xh,'maent_pripid',':',maent_pripid #print self.help_detail_get_dict('tzrCzxxDetial',maent_xh, maent_pripid, self.cur_time) alltds.append(self.help_detail_get_dict('tzrCzxxDetial', maent_xh, maent_pripid, self.cur_time)) elif td.get_text(): alltds.append(td.get_text().strip()) else: alltds.append(None) if head == u'企业年报': alltds = [] for td in table.find_all('td'): if td.find('a'): onclick = td.a['onclick'] m = re.search(r'doNdbg\(\'(\w+)\'\)', onclick) if m: alltds.append(td.get_text().strip()) alltds.append(self.help_enter_get_dict('ndbgDetail', self.pripid, m.group(1), self.cur_time)) elif td.get_text(): alltds.append(td.get_text().strip()) else: alltds.append(None) allths.insert(2, u'详情') if head == u'动产抵押登记信息': alltds = [] for td in table.find_all('td'): if td.find('a'): onclick = td.a['onclick'] m = re.search(r'doDcdyDetail\(\'(\w+?)\'\)', onclick) if m: alltds.append(self.help_dcdy_get_dict('dcdyDetail', self.pripid, m.group(1), self.cur_time)) elif td.get_text(): alltds.append(td.get_text().strip()) else: alltds.append(None) # if len(alltds) == 0: # alltds = [None for th in allths] return head, allths, alltds def get_one_to_one_dict(self, allths, alltds): if len(allths) == len(alltds): one_to_one_dict = {} for key, value in zip(allths, alltds): one_to_one_dict[key] = value return [one_to_one_dict] else: templist = [] x = 0 y = x + len(allths) while y <= len(alltds): tempdict = {} for keys, values in zip(allths, alltds[x:y]): tempdict[keys] = values x = y y = x + len(allths) templist.append(tempdict) return templist def test_print_table(self, tables): for table in tables: print table def test_print_all_ths_tds(self, head, allths, alltds): print '--------------', head, '--------------' for th in allths: print th for td in alltds: print td def test_print_all_dict(self, mydict): for key, value in mydict.items(): print key, ':', value def get_table_by_head(self, tables, head_item): for table in tables: if table.find_all('th'): temp_head = table.find_all('th')[0].get_text().strip().split('\n')[0].strip() #print 'temp_head', temp_head, 'head_item', head_item if temp_head == head_item: return table # else: # print 'no'*10 pass def get_json_one(self, mydict, tables, *param): #self.test_print_table(tables) for head_item in param: #print '----'*10, head_item table = self.get_table_by_head(tables, head_item) if table: head, allths, alltds = 
    def get_json_two(self, mydict, tables, *param):
        # same logic as get_json_one, wired to the enterprise-publicity map
        for head_item in param:
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)

    def get_json_three(self, mydict, tables, *param):
        # same logic as get_json_one, wired to the other-departments map
        for head_item in param:
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)

    def get_json_four(self, mydict, tables, *param):
        # same logic as get_json_one, wired to the judicial-assistance map
        for head_item in param:
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)

    def main_page(self):
        # patching the socket module makes the POSTs below cooperative,
        # so the greenlets spawned per tab actually run concurrently
        gevent.monkey.patch_socket()
        sub_json_list = []
        for ent, url in self.ents.items():
            m = re.search(r"openView\(\'(\w+?)\'", url)
            if not m:
                continue
            self.pripid = m.group(1)
            self.result_json_dict = {}
            print self.pripid

            def fetch(method, czmk, heading_map, heads):
                # every tab is served by the same endpoint; method/czmk pick
                # the tab, heading_map maps its table headings to json keys
                data = {'method': method, 'maent.pripid': self.pripid,
                        'czmk': czmk, 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do',
                                       data=data, timeout=self.timeout)
                tables = BeautifulSoup(resp.content, 'html.parser').find_all('table')
                self.get_json_one(heading_map, tables, *heads)

            jobs = [
                ('qyInfo', 'czmk1', self.one_dict, (u'基本信息', u'股东信息', u'变更信息')),
                ('baInfo', 'czmk2', self.one_dict, (u'主要人员信息', u'分支机构信息', u'清算信息')),
                ('dcdyInfo', 'czmk4', self.one_dict, (u'动产抵押登记信息',)),
                ('gqczxxInfo', 'czmk4', self.one_dict, (u'股权出质登记信息',)),
                ('jyycInfo', 'czmk6', self.one_dict, (u'经营异常信息',)),
                ('yzwfInfo', 'czmk14', self.one_dict, (u'严重违法信息',)),
                ('cfInfo', 'czmk3', self.one_dict, (u'行政处罚信息',)),
                ('ccjcInfo', 'czmk7', self.one_dict, (u'抽查检查信息',)),
                ('qygsInfo', 'czmk8', self.two_dict, (u'企业年报',)),
                ('qygsForTzrxxInfo', 'czmk12', self.two_dict, (u'股东及出资信息', u'变更信息')),
                ('cqygsForTzrbgxxInfo', 'czmk15', self.two_dict, (u'股权变更信息',)),
                ('qygsForXzxkInfo', 'czmk10', self.two_dict, (u'行政许可信息',)),
                ('qygsForZzcqInfo', 'czmk11', self.two_dict, (u'知识产权出质登记信息',)),
                ('qygsForXzcfInfo', 'czmk13', self.two_dict, (u'行政处罚信息',)),
                ('qtgsInfo', 'czmk9', self.three_dict, (u'行政许可信息',)),
                ('qtgsForCfInfo', 'czmk16', self.three_dict, (u'行政处罚信息',)),
                ('sfgsInfo', 'czmk17', self.four_dict, (u'司法股权冻结信息',)),
                ('sfgsbgInfo', 'czmk18', self.four_dict, (u'司法股东变更登记信息',)),
            ]
            threads = [gevent.spawn(fetch, method, czmk, heading_map, heads)
                       for method, czmk, heading_map, heads in jobs]
            gevent.joinall(threads)

            # basic registration info is a single record, not a list
            self.result_json_dict['ind_comm_pub_reg_basic'] = \
                self.result_json_dict['ind_comm_pub_reg_basic'][0]
            if self.result_json_dict.get('ind_comm_pub_arch_liquidation'):
                self.result_json_dict['ind_comm_pub_arch_liquidation'] = \
                    self.result_json_dict['ind_comm_pub_arch_liquidation'][0]
            sub_json_list.append({ent: self.result_json_dict})
        return sub_json_list

    def run(self, findCode):
        print self.__class__.__name__
        logging.info('crawl %s.', self.__class__.__name__)
        self.ent_num = str(findCode)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        if not self.get_id_num(self.ent_num):
            return json.dumps([{self.ent_num: None}])
        data = self.main_page()
        return json.dumps(data)
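# ---- illustrative sketch (not part of the crawler) ------------------------
# A minimal, standalone rendering of the two tricks main_page relies on:
# chunking a flat <td> list into row dicts (get_one_to_one_dict in miniature)
# and fanning out one greenlet per section before joining. It assumes only
# that gevent is installed; every name below is a stand-in.
import gevent
import gevent.monkey

gevent.monkey.patch_socket()

def rows_from_flat_cells(allths, alltds):
    # <th> texts are the column names; the flat <td> list is cut into
    # consecutive rows of len(allths) cells each
    step = len(allths)
    return [dict(zip(allths, alltds[x:x + step]))
            for x in range(0, len(alltds) - step + 1, step)]

def fetch_section(name):
    # stand-in for the per-tab POST against http://gsxt.scaic.gov.cn/ztxy.do
    print 'fetching', name

if __name__ == '__main__':
    ths = [u'股东', u'出资额', u'出资比例']
    tds = [u'甲', u'50', u'50%', u'乙', u'50', u'50%']
    print rows_from_flat_cells(ths, tds)  # two row dicts
    gevent.joinall([gevent.spawn(fetch_section, s)
                    for s in ('qyInfo', 'baInfo', 'dcdyInfo')])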
class HeilongjiangClawer(Crawler):
    """Crawler for Heilongjiang's industrial & commercial publicity site."""

    # storage path for html data
    html_restore_path = settings.json_restore_path + '/heilongjiang/'
    # storage path for the captcha image
    ckcode_image_path = settings.json_restore_path + '/heilongjiang/ckcode.jpg'
    code_cracker = CaptchaRecognition('heilongjiang')
    # lock guarding writes to the final json file during multi-threaded crawls
    write_file_mutex = threading.Lock()

    urls = {
        'host': 'www.hljaic.gov.cn',
        'get_checkcode': 'http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0',
        'post_checkcode': 'http://gsxt.hljaic.gov.cn/checkCheckNo.jspx',
        'get_info_entry': 'http://gsxt.hljaic.gov.cn/searchList.jspx',
        'ind_comm_pub_skeleton': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=',
        'ent_pub_skeleton': 'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton': 'http://gsxt.hljaic.gov.cn/otherDepartment.jspx?id=',
        'judical_assist_skeleton': 'http://gsxt.hljaic.gov.cn/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder': 'http://gsxt.hljaic.gov.cn/QueryInvList.jspx?',        # shareholders
        'ind_comm_pub_reg_modify': 'http://gsxt.hljaic.gov.cn/QueryAltList.jspx?',             # modification records, paged
        'ind_comm_pub_arch_key_persons': 'http://gsxt.hljaic.gov.cn/QueryMemList.jspx?',       # key persons, paged
        'ind_comm_pub_spot_check': 'http://gsxt.hljaic.gov.cn/QuerySpotCheckList.jspx?',       # spot checks, paged
        'ind_comm_pub_movable_property_reg': 'http://gsxt.hljaic.gov.cn/QueryMortList.jspx?',  # movable-property mortgages, paged
        'ind_comm_pub_business_exception': 'http://gsxt.hljaic.gov.cn/QueryExcList.jspx?',     # business exceptions
        'shareholder_detail': 'http://gsxt.hljaic.gov.cn/queryInvDetailAction.jspx?id=',       # investor detail
        'movable_property_reg_detail': 'http://gsxt.hljaic.gov.cn/mortInfoDetail.jspx?id=',    # mortgage detail
        'annual_report': 'http://gsxt.hljaic.gov.cn/QueryYearExamineDetail.jspx?id=',          # annual report detail
    }

    def __init__(self, json_restore_path):
        self.json_restore_path = json_restore_path
        self.parser = HeilongjiangParser(self)
        self.img_count = 1
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

    def run(self, ent_number=0):
        """Main crawl entry point; delegates to the base crawler."""
        return Crawler.run(self, ent_number)
        # return super(HeilongjiangClawer, self).run(ent_number)

    def crawl_check_page(self):
        """Crawl the captcha page: download the image, crack it, post the
        answer, then resolve the company id from the search results.

        :return True on success, False otherwise
        """
        count = 0
        while count < 10:
            ck_code = self.crack_check_code()
            data = {'checkNo': ck_code}
            resp = self.reqst.post(self.urls['post_checkcode'], data=data)
            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                count += 1
                continue
            # crude success test: byte 10 of the response body is 't' when
            # the server answers "true"
            if resp.content[10] == 't':
                data = {'checkNo': ck_code, 'entName': self.ent_number}
                resp = self.reqst.post(self.urls['get_info_entry'], data=data)
                soup = BeautifulSoup(resp.text, "html5lib")
                div = soup.find("div", {"style": "height:500px;"})
                a = div.find("a")
                if a:
                    company_id = a["href"].split('?')[1]
                    self.company_id = company_id.split("=")[1]
                    return True
                return False
            logging.error("crawl post check page failed!")
            count += 1
        return False

    def crack_check_code(self):
        """Download the captcha image and run it through the recognizer.

        :return the recognized check code
        """
        resp = self.reqst.get(self.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None
        time.sleep(random.uniform(2, 4))
        with self.write_file_mutex:
            with open(self.ckcode_image_path, 'wb') as f:
                f.write(resp.content)
            try:
                ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
            except Exception:
                logging.warn('exception occurred when cracking checkcode')
                ckcode = ('', '')
        return ckcode[1]

    def crawl_ind_comm_pub_pages(self):
        """Crawl the industrial & commercial publicity pages."""
        url = "%s%s" % (self.urls['ind_comm_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ind_comm_pub_skeleton')
            return
        self.parser.parse_ind_comm_pub_pages(resp.content)

    def crawl_ent_pub_pages(self):
        """Crawl the enterprise publicity pages."""
        url = "%s%s" % (self.urls['ent_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ent_pub_skeleton')
            return
        self.parser.parse_ent_pub_pages(resp.content)

    def crawl_other_dept_pub_pages(self):
        """Crawl the other-departments publicity pages."""
        url = "%s%s" % (self.urls['other_dept_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get other_dept_pub_skeleton')
            return
        self.parser.crawl_other_dept_pub_pages(resp.content)

    def crawl_judical_assist_pub_pages(self):
        """Crawl the judicial-assistance pages."""
        url = "%s%s" % (self.urls['judical_assist_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get judical_assist_skeleton')
            return
        self.parser.parse_judical_assist_pub_pages(resp.content)
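# ---- illustrative sketch (not part of the crawler) ------------------------
# The check-code loop above in miniature: fetch the captcha image, run it
# through a recognizer, post the guess, retry on failure. solve_captcha is a
# stand-in for CaptchaRecognition('heilongjiang').predict_result; the URLs
# are the ones from HeilongjiangClawer.urls.
import requests

def solve_captcha(image_path):
    # placeholder OCR; the real recognizer returns (raw_text, answer)
    return ('', '1234')

def pass_check(session, image_path, tries=10):
    for _ in range(tries):
        img = session.get('http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0')
        if img.status_code != 200:
            continue
        with open(image_path, 'wb') as f:
            f.write(img.content)
        answer = solve_captcha(image_path)[1]
        resp = session.post('http://gsxt.hljaic.gov.cn/checkCheckNo.jspx',
                            data={'checkNo': answer})
        # success is signalled by "true" somewhere in the response body
        if resp.status_code == 200 and 'true' in resp.text:
            return answer
    return None

# usage: pass_check(requests.Session(), '/tmp/ckcode.jpg')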
    def __init__(self, json_restore_path):
        # self.cur_time = str(int(time.time()*1000))
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        # storage path for the captcha image
        self.ckcode_image_path = settings.json_restore_path + '/guangxi/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guangxi')
        self.result_json_dict = {}
        self.reqst.headers.update({
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })
        # entry urls; the id query parameter on validateCode is just a
        # client-generated cache buster
        self.mydict = {
            'eareName': 'http://gxqyxygs.gov.cn',
            'search': 'http://gxqyxygs.gov.cn/search.jspx',
            'searchList': 'http://gxqyxygs.gov.cn/searchList.jspx',
            'validateCode': 'http://gxqyxygs.gov.cn/validateCode.jspx?type=0&id=0.6145392225593206'
        }
        # the same endpoints on the site's bare-IP mirror
        self.search_dict = {
            'eareName': 'http://gxqyxygs.gov.cn',
            'search': 'http://222.143.24.157/search.jspx',
            'validateCode': 'http://222.143.24.157/validateCode.jspx?type=0&id=0.8720359673599201',
            'searchList': 'http://222.143.24.157/searchList.jspx',
            'businessPublicity': 'http://222.143.24.157/businessPublicity.jspx?',
            'enterprisePublicity': 'http://222.143.24.157/enterprisePublicity.jspx?',
            'otherDepartment': 'http://222.143.24.157/otherDepartment.jspx?',
            'justiceAssistance': 'http://222.143.24.157/justiceAssistance.jspx?',
            'next_head': 'http://gxqyxygs.gov.cn/Query'
        }
        # heading maps: scraped table headings -> normalized json keys; several
        # variant headings (e.g. 股权出置/股权出质登记信息) alias onto one key
        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }
        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
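# ---- illustrative sketch (not part of the crawler) ------------------------
# How the four heading maps above are meant to be consumed: the first <th>
# text of a scraped table picks the normalized json key, and headings that
# appear in no map are skipped. lookup_key is a stand-in for illustration,
# not a method of the crawler.
def lookup_key(heading, heading_maps):
    for m in heading_maps:
        if heading in m:
            return m[heading]
    return None

# e.g. lookup_key(u'企业年报', [one_dict, two_dict]) -> 'ent_pub_ent_annual_report'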