def __init__(self, json_restore_path=None):
    """Prepare the HTTP session, captcha recognizer and storage paths for Hebei.

    Args:
        json_restore_path: base directory for stored results; the
            '/hebei/' locations are derived from it by concatenation.
            NOTE(review): the default of None cannot be concatenated with
            the path suffixes below (TypeError), so callers effectively
            have to pass a string.
    """
    # Default headers attached to every request made through the session.
    request_headers = {
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        "User-Agent": get_user_agent(),
    }
    self.CR = CaptchaRecognition("hebei")

    session = self.requests = requests.Session()
    session.headers.update(request_headers)
    # Only 'http://' URLs are mounted on the 100-connection pool adapter.
    session.mount('http://',
                  requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100))

    self.ents = {}
    self.json_dict = {}
    self.json_restore_path = json_restore_path
    self.csrf = ""

    province_dir = self.json_restore_path + '/hebei/'
    # Where the downloaded captcha image is stored.
    self.path_captcha = province_dir + 'ckcode.jpeg'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir

    self.proxies = get_proxy('hebei')
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Xinjiang paths and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(XinjiangCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/xinjiang/'
    # Where the downloaded captcha image is stored.
    self.path_captcha = province_dir + 'ckcode.jpeg'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir

    self.proxies = get_proxy('xinjiang')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Yunnan paths and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(YunnanCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/yunnan/'
    # Where the downloaded captcha image is stored.
    self.path_captcha = province_dir + 'ckcode.jpeg'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir

    self.proxies = get_proxy('yunnan')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Hubei paths, parser and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(HubeiCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/hubei/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = HubeiParser(self)
    self.proxies = get_proxy('hubei')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Shanghai paths and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(ShanghaiCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    # The directory name is capitalised while the proxy key is lower-case.
    province_dir = self.json_restore_path + '/Shanghai/'
    # Where the downloaded captcha image is stored.
    self.path_captcha = province_dir + 'ckcode.jpeg'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir

    self.proxies = get_proxy('shanghai')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Xizang paths, parser and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(XizangCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/xizang/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = XizangParser(self)
    self.proxies = get_proxy('xizang')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then configure the Liaoning crawler.

    The parent initializer is called without arguments; the storage paths,
    parser, proxies and timeout are all set here.

    Args:
        json_restore_path: base directory for stored results.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(LiaoningCrawler, self).__init__()
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/liaoning/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = LiaoningParser(self)
    self.proxies = get_proxy('liaoning')
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then configure the Zongju crawler.

    The parent initializer is called without arguments; the storage paths,
    parser, proxies and timeout are all set here.

    Args:
        json_restore_path: base directory for stored results.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(ZongjuCrawler, self).__init__()
    self.json_restore_path = json_restore_path

    site_dir = self.json_restore_path + '/zongju/'
    # Where the fetched html data is stored.
    self.html_restore_path = site_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = site_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = ZongjuParser(self)
    # NOTE(review): proxies are looked up under 'beijing', not 'zongju' --
    # presumably deliberate; confirm.
    self.proxies = get_proxy('beijing')
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Hainan paths, parser and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(HainanCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/hainan/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = HainanParser(self)
    self.proxies = get_proxy('hainan')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then set the Shanxi paths, parser and proxies.

    Args:
        json_restore_path: base directory for stored results, also
            forwarded to the parent initializer.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(ShanxiCrawler, self).__init__(json_restore_path)
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/shanxi/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = ShanxiParser(self)
    self.proxies = get_proxy('shanxi')
def __init__(self, json_restore_path=None):
    """Run the parent initializer, then configure the Heilongjiang crawler.

    The parent initializer is called without arguments; the storage paths,
    parser, proxies and timeout are all set here.

    Args:
        json_restore_path: base directory for stored results.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(HeilongjiangClawer, self).__init__()
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/heilongjiang/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # The parser is handed this crawler instance.
    self.parser = HeilongjiangParser(self)
    self.proxies = get_proxy('heilongjiang')
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)
def __init__(self, json_restore_path=None):
    """Prepare the HTTP session, captcha recognizer and paths for Neimenggu.

    Args:
        json_restore_path: base directory for stored results; the
            '/neimenggu/' locations are derived from it by concatenation.
            NOTE(review): the default of None cannot be concatenated with
            the path suffixes below (TypeError).
    """
    self.html_showInfo = None
    self.Captcha = None
    # NOTE(review): the recognizer is created for "guangdong" although every
    # path and proxy key here is 'neimenggu' -- presumably intentional; confirm.
    self.CR = CaptchaRecognition("guangdong")

    session = self.requests = requests.Session()
    # 'headers' is not defined in this method; it comes from an outer scope.
    session.headers.update(headers)
    # Only 'http://' URLs are mounted on the 100-connection pool adapter.
    session.mount('http://',
                  requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100))

    self.ents = {}
    self.json_restore_path = json_restore_path
    province_dir = self.json_restore_path + '/neimenggu/'
    self.dir_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.path_captcha = province_dir + 'ckcode.jpg'
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)

    # Attach proxies to the session only when the lookup returns something truthy.
    proxy_map = get_proxy('neimenggu')
    if proxy_map:
        print(proxy_map)
        session.proxies = proxy_map
def __init__(self, json_restore_path=None):
    """Prepare the session, paths, captcha cracker and lookup tables for Sichuan.

    Args:
        json_restore_path: base directory for stored results; the
            '/sichuan/' locations are derived from it by concatenation.
            NOTE(review): the default of None cannot be concatenated with
            the path suffixes below (TypeError).
    """
    self.pripid = None
    # Current time in whole milliseconds, kept as a string.
    now_ms = int(time.time() * 1000)
    self.cur_time = str(now_ms)

    session = self.reqst = requests.Session()
    # 'headers' is not defined in this method; it comes from an outer scope.
    session.headers.update(headers)
    # Only 'http://' URLs are mounted on the 100-connection pool adapter.
    session.mount('http://',
                  requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100))

    self.json_restore_path = json_restore_path
    province_dir = self.json_restore_path + '/sichuan/'
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir

    self.code_cracker = CaptchaRecognition('sichuan')
    self.result_json_dict = {}
    self.json_list = []

    # NOTE(review): proxies are looked up under 'shaanxi' although everything
    # else in this initializer is keyed 'sichuan' -- verify this is intended.
    proxy_map = get_proxy('shaanxi')
    if proxy_map:
        print(proxy_map)
        session.proxies = proxy_map
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)
    self.ents = {}

    # Site URLs used by the crawler.
    # NOTE(review): 'eareName' points at a different host than the other
    # three entries -- verify.
    self.mydict = {
        'eareName': 'http://www.ahcredit.gov.cn',
        'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
        'searchList': 'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
        'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm',
    }

    # The four tables below map Chinese section titles to result keys; the
    # value prefixes are ind_comm_pub_*, ent_pub_*, other_dept_pub_* and
    # judical_assist_pub_* respectively.
    self.one_dict = {
        u'基本信息': 'ind_comm_pub_reg_basic',
        u'股东信息': 'ind_comm_pub_reg_shareholder',
        u'发起人信息': 'ind_comm_pub_reg_shareholder',
        u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
        u'变更信息': 'ind_comm_pub_reg_modify',
        u'主要人员信息': 'ind_comm_pub_arch_key_persons',
        u'分支机构信息': 'ind_comm_pub_arch_branch',
        u'清算信息': 'ind_comm_pub_arch_liquidation',
        u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
        u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'行政处罚信息': 'ind_comm_pub_administration_sanction',
        u'经营异常信息': 'ind_comm_pub_business_exception',
        u'严重违法信息': 'ind_comm_pub_serious_violate_law',
        u'抽查检查信息': 'ind_comm_pub_spot_check',
    }
    self.two_dict = {
        u'企业年报': 'ent_pub_ent_annual_report',
        u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
        u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股权变更信息': 'ent_pub_equity_change',
        u'行政许可信息': 'ent_pub_administration_license',
        u'知识产权出资登记': 'ent_pub_knowledge_property',
        u'知识产权出质登记信息': 'ent_pub_knowledge_property',
        u'行政处罚信息': 'ent_pub_administration_sanction',
        u'变更信息': 'ent_pub_shareholder_modify',
    }
    self.three_dict = {
        u'行政许可信息': 'other_dept_pub_administration_license',
        u'行政处罚信息': 'other_dept_pub_administration_sanction',
    }
    self.four_dict = {
        u'股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'股东变更信息': 'judical_assist_pub_shareholder_modify',
        u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify',
    }
def __init__(self, json_restore_path=None):
    """Initialise the Jiangsu crawler.

    Args:
        json_restore_path: storage path of the json file. All Jiangsu
            enterprises should be written to the same file, so use the
            same path when crawling with multiple threads; writes to the
            file then need to be guarded by a lock.
            NOTE(review): None (the default) fails on the string
            concatenation below.
    """
    super(JiangsuCrawler, self).__init__()
    self.json_restore_path = json_restore_path

    province_dir = self.json_restore_path + '/jiangsu/'
    # Where the fetched html data is stored.
    self.html_restore_path = province_dir
    # Where the downloaded captcha image is stored.
    self.ckcode_image_path = province_dir + 'ckcode.jpg'

    # Attach proxies to the session only when the lookup returns something
    # truthy. self.reqst is not created here -- presumably by the parent
    # initializer.
    proxy_map = get_proxy('jiangsu')
    if proxy_map:
        print(proxy_map)
        self.reqst.proxies = proxy_map
    # presumably (connect, read) seconds for requests -- confirm at call sites
    self.timeout = (30, 20)

    # The parser is handed this crawler instance.
    self.parser = JiangsuParser(self)

    self.corp_org = ''
    self.corp_id = ''
    self.corp_seq_id = ''
    self.common_enter_post_data = {}
    self.ci_enter_post_data = {}
    self.nb_enter_post_data = {}

    # Small factories for the three recurring shapes of post_info entries.
    def common_enter(properties_name):
        return {
            'url_type': 'common_enter',
            'post_type': 'common_enter',
            'propertiesName': properties_name
        }

    def nb_enter(properties_name):
        return {
            'url_type': 'nb_enter',
            'post_type': 'nb_enter',
            'propertiesName': properties_name
        }

    def ci_query(kind, post_type, specific_query):
        return {
            'url_type': kind,
            'post_type': post_type,
            'specificQuery': specific_query
        }

    # Per-table request description: which url/post flavour to use and the
    # query name to send.
    self.post_info = {
        'ind_comm_pub_reg_basic':
            ci_query('ci_enter', 'ci_enter', 'basicInfo'),
        'ind_comm_pub_reg_shareholder':
            ci_query('ci_enter', 'ci_enter_with_recordline', 'investmentInfor'),
        'ind_comm_pub_reg_modify':
            common_enter('biangeng'),
        'ind_comm_pub_arch_key_persons':
            ci_query('ci_enter', 'ci_enter_with_recordline', 'personnelInformation'),
        'ind_comm_pub_arch_branch':
            ci_query('ci_enter', 'ci_enter_with_recordline', 'branchOfficeInfor'),
        # Disabled entry: 'ind_comm_pub_arch_liquadition'
        # (url_type 'ci_enter', post_type 'common_enter', specificQuery 'qsfzr').
        'ind_comm_pub_movable_property_reg':
            common_enter('dongchan'),
        'ind_comm_pub_equity_ownership_reg':
            common_enter('guquanchuzhi'),
        'ind_comm_pub_administration_sanction':
            common_enter('chufa'),
        'ind_comm_pub_business_exception':
            common_enter('abnormalInfor'),
        # Disabled entry: 'ind_comm_pub_serious_violate_law'
        # (common_enter, propertiesName 'xxx').
        'ind_comm_pub_spot_check':
            common_enter('checkup'),
        'ind_comm_pub_reg_shareholder_detail':
            ci_query('ci_detail', 'ci_detail', 'investorInfor'),
        'ent_pub_annual_report':
            nb_enter('query_report_list'),
        # The detail entry carries no 'propertiesName'.
        'annual_report_detail': {
            'url_type': 'nb_enter',
            'post_type': 'nb_enter'
        },
        'ent_pub_shareholder_capital_contribution':
            nb_enter('query_tzcz'),
        'ent_pub_administrative_license':
            nb_enter('query_xzxk'),
        'ent_pub_knowledge_property':
            nb_enter('query_zscq'),
        'ent_pub_administration_sanction':
            nb_enter('query_xzcf'),
        'other_dept_pub_administration_license':
            common_enter('xingzheng'),
        'other_dept_pub_administration_sanction':
            common_enter('xingzhengchufa'),
        'judical_assist_pub_equity_freeze':
            common_enter('gqdjList'),
        'judical_assist_pub_shareholder_modify':
            common_enter('gdbgList')
    }