Example #1
    def __init__(self, json_restore_path=None):
        headers = {  #'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            "User-Agent": get_user_agent()
        }
        self.CR = CaptchaRecognition("hebei")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100)
        self.requests.mount('http://', adapter)

        self.ents = {}
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.csrf = ""
        # path to store the captcha image
        self.path_captcha = self.json_restore_path + '/hebei/ckcode.jpeg'
        # path to store the html data
        self.html_restore_path = self.json_restore_path + '/hebei/'

        self.proxies = get_proxy('hebei')

        self.timeout = (30, 20)
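
These constructors all follow the same pattern: a pooled requests.Session with browser-like headers, a per-province CaptchaRecognition instance, and captcha/html paths derived from json_restore_path. Below is a minimal sketch of that shared setup; the helper name build_session_and_cracker is hypothetical, it assumes the project's CaptchaRecognition module is importable (as in Example #5) and uses a fixed User-Agent in place of get_user_agent().

import os

import requests
from CaptchaRecognition import CaptchaRecognition  # project-local module, imported as in Example #5


def build_session_and_cracker(json_restore_path, province='hebei'):
    # Browser-like headers; the real constructors fill User-Agent via get_user_agent()
    headers = {
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0',
    }
    session = requests.Session()
    session.headers.update(headers)
    # Larger connection pool, as in Example #1
    adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
    session.mount('http://', adapter)

    # Per-province captcha recognizer and storage paths
    cracker = CaptchaRecognition(province)
    html_dir = os.path.join(json_restore_path, province)
    if not os.path.exists(html_dir):
        os.makedirs(html_dir)
    captcha_path = os.path.join(html_dir, 'ckcode.jpg')
    return session, cracker, captcha_path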
Example #2
class HunanCrawler(ZongjuCrawler):
    """湖南爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/hunan/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/hunan/ckcode.jpg'
    code_cracker = CaptchaRecognition('hunan')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {'host': 'http://www.hnaic.net.cn/visit/category/a/hnaicalllist',
            'official_site': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',
            'get_checkcode': 'http://gsxt.hnaic.gov.cn/notice/captcha?preset=',
            'post_checkcode': 'http://gsxt.hnaic.gov.cn/notice/search/popup_captcha',

            'get_info_entry': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',  # entry point for company info
            'open_info_entry': 'http://gsxt.hnaic.gov.cn/notice/notice/view?',
            # url of the company info page; tab=1-4 selects different sections (industrial & commercial publicity, enterprise publicity, ...)
            'open_detail_info_entry': '',
            }
    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)
Example #3
 def __init__(self, *args, **kwargs):
     self.ckcode_image_path = settings.json_restore_path + '/anhui/ckcode.jpg'
     self.code_cracker = CaptchaRecognition('qinghai')
     if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
         os.makedirs(os.path.dirname(self.ckcode_image_path))
     self.urls = {
         'eareName': 'http://www.ahcredit.gov.cn',
         'search': 'http://www.ahcredit.gov.cn/search.jspx',
         'checkCheckNo': 'http://www.ahcredit.gov.cn/checkCheckNo.jspx',
         'searchList': 'http://www.ahcredit.gov.cn/searchList.jspx',
         'validateCode':
         'http://www.ahcredit.gov.cn/validateCode.jspx?type=0&id=0.22788021906613765',
         'QueryInvList': 'http://www.ahcredit.gov.cn/QueryInvList.jspx?',
         'queryInvDetailAction':
         'http://www.ahcredit.gov.cn/queryInvDetailAction.jspx?',
         'businessPublicity':
         'http://www.ahcredit.gov.cn/businessPublicity.jspx?',
         'enterprisePublicity':
         'http://www.ahcredit.gov.cn/enterprisePublicity.jspx?',
         'otherDepartment':
         'http://www.ahcredit.gov.cn/otherDepartment.jspx?',
         'justiceAssistance':
         'http://www.ahcredit.gov.cn/justiceAssistance.jspx?'
     }
     self.timeout = 30
     self.result_json = {}
     self.result_json_list = []
Example #4
class FujianCrawler(ZongjuCrawler):
    """福建爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/fujian/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/fujian/ckcode.jpg'
    code_cracker = CaptchaRecognition('fujian')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host': 'http://www.fjaic.gov.cn/',
        'official_site': 'http://wsgs.fjaic.gov.cn/creditpub/home',
        'get_checkcode':
        'http://wsgs.fjaic.gov.cn/creditpub/captcha?preset=math-01',
        'post_checkcode':
        'http://wsgs.fjaic.gov.cn/creditpub/security/verify_captcha',
        'get_info_entry':
        'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list',
        'open_info_entry':
        'http://wsgs.fjaic.gov.cn/creditpub/notice/view?',  # url of the company info page; tab=1-4 selects different sections (industrial & commercial publicity, enterprise publicity, ...)
        'open_detail_info_entry': '',
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = FujianParser(self)
Example #5
 def setUp(self):
     unittest.TestCase.setUp(self)
     from CaptchaRecognition import CaptchaRecognition
     self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
     self.parser = self.crawler.parser
     ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
     self.crawler.json_dict = {}
     self.crawler.ent_number = '500232000003942'
Example #6
class HubeiCrawler(HeilongjiangClawer):
    """湖北爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/hubei/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/hubei/ckcode.jpg'
    code_cracker = CaptchaRecognition('hubei')

    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.hljaic.gov.cn',
        'get_checkcode':
        'http://xyjg.egs.gov.cn/ECPS_HB/validateCode.jspx?type=0',
        'post_checkcode':
        'http://xyjg.egs.gov.cn/ECPS_HB/checkCheckNo.jspx',
        'get_info_entry':
        'http://xyjg.egs.gov.cn/ECPS_HB/searchList.jspx',
        'ind_comm_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/businessPublicity.jspx?id=',
        'ent_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/otherDepartment.jspx?id=',
        'judical_assist_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryInvList.jspx?',  # shareholder info
        'ind_comm_pub_reg_modify':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryAltList.jspx?',  # change records, paginated
        'ind_comm_pub_arch_key_persons':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryMemList.jspx?',  # key personnel, paginated
        'ind_comm_pub_spot_check':
        'http://xyjg.egs.gov.cn/ECPS_HB/QuerySpotCheckList.jspx?',  # spot-check info, paginated
        'ind_comm_pub_movable_property_reg':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryMortList.jspx?',  # chattel mortgage registrations, paginated
        'ind_comm_pub_business_exception':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryExcList.jspx?',  # abnormal operation info
        'ind_comm_pub_equity_ownership_reg':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryPledgeList.jspx?',  # equity pledge registrations, paginated
        'ind_comm_pub_arch_branch':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryChildList.jspx?',  # branch info
        'shareholder_detail':
        'http://xyjg.egs.gov.cn/ECPS_HB/queryInvDetailAction.jspx?id=',  # investor detail
        'movable_property_reg_detail':
        'http://xyjg.egs.gov.cn/ECPS_HB/mortInfoDetail.jspx?id=',  # chattel mortgage registration detail
        'annual_report':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryYearExamineDetail.jspx?id=',  # annual report detail
    }

    def __init__(self, json_restore_path):
        HeilongjiangClawer.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HubeiParser(self)
Example #7
 def __init__(self, json_restore_path):
     self.CR = CaptchaRecognition("hebei")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.json_restore_path = json_restore_path
     self.csrf = ""
     # path to store the captcha image
     self.path_captcha = settings.json_restore_path + '/hebei/ckcode.jpeg'
     # path to store the html data
     self.html_restore_path = settings.json_restore_path + '/hebei/'
Example #8
 def __init__(self, json_restore_path):
     self.CR = CaptchaRecognition("guangdong")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.main_host = ""
     self.json_dict = {}
     self.json_restore_path = json_restore_path
     self.html_restore_path = settings.json_restore_path + '/hainan/'
     # path to store the captcha image
     self.path_captcha = settings.json_restore_path + '/hainan/ckcode.png'
Example #9
 def __init__(self, json_restore_path=None):
     self.html_search = None
     self.html_showInfo = None
     self.Captcha = None
     self.CR = CaptchaRecognition("guangdong")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.main_host = ""
     self.json_dict = {}
     self.json_restore_path = json_restore_path
     self.dir_restore_path = settings.json_restore_path + '/guangdong/'
     #self.json_restore_path = settings.json_restore_path + '/guangdong.json'
     # path to store the captcha image
     self.path_captcha = settings.json_restore_path + '/guangdong/ckcode.jpg'
Example #10
    def __init__(self, json_restore_path):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        # json data set
        # POST

        self.json_restore_path = json_restore_path
        if os.path.exists(self.json_restore_path) is False:
            os.makedirs(self.json_restore_path, 0775)
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        # path to store the html data
        self.html_restore_path = os.path.join(self.json_restore_path, "chongqing")
        if os.path.exists(self.html_restore_path) is False:
            os.makedirs(self.html_restore_path, 0775)
        # path to store the captcha image
        self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.ent_number = None
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None
        self.json_dict = {}
        self.reqst = requests.Session()
        self.reqst.headers.update({
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'})
Example #11
    def __init__(self, *args, **kwargs):
        """江苏工商公示信息网页爬虫初始化函数
		Args:
			json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
			 需要在写入文件的时候加锁
		Returns:
		"""
        self.ent_number = None
        # path to store the html data
        self.html_restore_path = settings.json_restore_path + '/jiangsu/'
        # path to store the captcha image
        self.ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg'
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        self.code_cracker = CaptchaRecognition('jiangsu')
        # lock to protect writes to the final json file when crawling with multiple threads
        self.write_file_mutex = threading.Lock()

        self.urls = {
            'host': 'www.jsgsj.gov.cn',
            'official_site': 'http://www.jsgsj.gov.cn:58888/province/',
            'get_checkcode':
            'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7',
            'post_checkcode':
            'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true',
            'ind_comm_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp',
            'ent_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp',
            'other_dept_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp',
            'judical_assist_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp',
            'annual_report_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp',
            'ci_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true',
            'common_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true',
            'nb_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true',
            'ci_detail':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true'
        }
        self.result_json = {}
        self.result_json_list = []
Example #12
    def __init__(self, json_restore_path=None):
        self.html_showInfo = None
        self.Captcha = None
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100)
        self.requests.mount('http://', adapter)

        self.ents = {}
        self.json_restore_path = json_restore_path
        self.dir_restore_path = self.json_restore_path + '/neimenggu/'
        # path to store the captcha image
        self.path_captcha = self.json_restore_path + '/neimenggu/ckcode.jpg'
        self.timeout = (30, 20)
        proxies = get_proxy('neimenggu')
        if proxies:
            print proxies
            self.requests.proxies = proxies
Example #13
    def __init__(self, *args, **kwargs):
        # path to store the captcha image
        self.ckcode_image_path = settings.json_restore_path + '/zongju/ckcode.jpg'

        self.code_cracker = CaptchaRecognition('zongju')
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        # lock to protect writes to the final json file when crawling with multiple threads
        self.write_file_mutex = threading.Lock()
        self.timeout = 40
        self.urls = {
            'host': 'http://qyxy.baic.gov.cn',
            'official_site': 'http://gsxt.saic.gov.cn/zjgs/',
            'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=',
            'post_checkcode':
            'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',
            # 'get_info_entry': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',  # entry point for company info
            'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?',
            # url of the company info page; tab=1-4 selects different sections (industrial & commercial publicity, enterprise publicity, ...)
            'open_detail_info_entry': ''
        }
Example #14
    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        super(ChongqingCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        # path to store the html data
        self.html_restore_path = os.path.join(self.json_restore_path,
                                              "chongqing")

        # path to store the captcha image
        self.ckcode_image_path = os.path.join(self.html_restore_path,
                                              'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        self.ent_number = None
        self.ents = {}
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None
Example #15
    def __init__(self, json_restore_path=None):
        self.cur_time = str(int(time.time() * 1000))
        self.nbxh = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.html_restore_path = self.json_restore_path + '/guizhou/'
        self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guizhou')
        self.result_json_dict = {}
        self.reqst.headers.update({
            'Connection': "keep-alive",
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': get_user_agent()
        })

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.gzgs.gov.cn/',
            'searchList':
            'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
            'validateCode':
            'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}
Example #16
    def __init__(self, json_restore_path=None):
        self.pripid = None
        self.cur_time = str(int(time.time() * 1000))
        self.reqst = requests.Session()
        self.reqst.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.reqst.mount('http://', adapter)
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg'
        # path to store the html data
        self.html_restore_path = self.json_restore_path + '/sichuan/'
        self.code_cracker = CaptchaRecognition('sichuan')
        self.result_json_dict = {}
        self.json_list = []

        proxies = get_proxy('shaanxi')
        if proxies:
            print proxies
            self.reqst.proxies = proxies
        self.timeout = (30, 20)
        self.ents = {}

        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
            'searchList':
            'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
            'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm'
        }

        self.one_dict = {u'基本信息': 'ind_comm_pub_reg_basic',
                         u'股东信息': 'ind_comm_pub_reg_shareholder',
                         u'发起人信息': 'ind_comm_pub_reg_shareholder',
                         u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
                         u'变更信息': 'ind_comm_pub_reg_modify',
                         u'主要人员信息': 'ind_comm_pub_arch_key_persons',
                         u'分支机构信息': 'ind_comm_pub_arch_branch',
                         u'清算信息': 'ind_comm_pub_arch_liquidation',
                         u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
                         u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'行政处罚信息': 'ind_comm_pub_administration_sanction',
                         u'经营异常信息': 'ind_comm_pub_business_exception',
                         u'严重违法信息': 'ind_comm_pub_serious_violate_law',
                         u'抽查检查信息': 'ind_comm_pub_spot_check'}

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {u'行政许可信息': 'other_dept_pub_administration_license',
                           u'行政处罚信息': 'other_dept_pub_administration_sanction'}
        self.four_dict = {u'股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'股东变更信息': 'judical_assist_pub_shareholder_modify',
                          u'司法股东变更登记信息':
                          'judical_assist_pub_shareholder_modify'}
Example #17
class BeijingCrawler(Crawler):
    """北京工商爬虫
    """
    code_cracker = CaptchaRecognition('beijing')
    # lock to protect writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'http://qyxy.baic.gov.cn',
        'official_site':
        'http://qyxy.baic.gov.cn/beijing',
        'get_checkcode':
        'http://qyxy.baic.gov.cn',
        'post_checkcode':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!checkCode.dhtml',
        'open_info_entry':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!getBjQyList.dhtml',
        'ind_comm_pub_reg_basic':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!openEntInfo.dhtml?',
        'ind_comm_pub_reg_shareholder':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!tzrFrame.dhtml?',
        'ind_comm_pub_reg_modify':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!biangengFrame.dhtml?',
        'ind_comm_pub_arch_key_persons':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!zyryFrame.dhtml?',
        'ind_comm_pub_arch_branch':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!fzjgFrame.dhtml?',
        'ind_comm_pub_arch_liquidation':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!qsxxFrame.dhtml?',
        'ind_comm_pub_movable_property_reg':
        'http://qyxy.baic.gov.cn/gjjbjTab/gjjTabQueryCreditAction!dcdyFrame.dhtml?',
        'ind_comm_pub_equity_ownership_reg':
        'http://qyxy.baic.gov.cn/gdczdj/gdczdjAction!gdczdjFrame.dhtml?',
        'ind_comm_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list.dhtml?',
        'ind_comm_pub_business_exception':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_jyycxx.dhtml?',
        'ind_comm_pub_serious_violate_law':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_yzwfxx.dhtml?',
        'ind_comm_pub_spot_check':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_ccjcxx.dhtml?',
        'ent_pub_ent_annual_report':
        'http://qyxy.baic.gov.cn/qynb/entinfoAction!qyxx.dhtml?',
        'ent_pub_shareholder_capital_contribution':
        'http://qyxy.baic.gov.cn/gdcz/gdczAction!list_index.dhtml?',
        'ent_pub_equity_change':
        'http://qyxy.baic.gov.cn/gdgq/gdgqAction!gdgqzrxxFrame.dhtml?',
        'ent_pub_administration_license':
        'http://qyxy.baic.gov.cn/xzxk/xzxkAction!list_index.dhtml?',
        'ent_pub_knowledge_property':
        'http://qyxy.baic.gov.cn/zscqczdj/zscqczdjAction!list_index.dhtml?',
        'ent_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/gdgq/gdgqAction!qyxzcfFrame.dhtml?',
        'other_dept_pub_administration_license':
        'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzxk.dhtml?',
        'other_dept_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzcf.dhtml?',
        'shareholder_detail':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!touzirenInfo.dhtml?'
    }

    def __init__(self, json_restore_path=None):
        self.json_restore_path = json_restore_path
        # path to store the html data
        self.html_restore_path = self.json_restore_path + '/beijing/'

        # path to store the captcha image
        self.ckcode_image_path = self.json_restore_path + '/beijing/ckcode.jpg'
        self.parser = BeijingParser(self)
        self.credit_ticket = None
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        self.timeout = 20

    def run(self, ent_number):
        """爬取的主函数
        """
        self.ent_id = ''
        return Crawler.run(self, ent_number)

    def crawl_page_by_url(self, url):
        resp = None
        try:
            resp = self.reqst.get(url,
                                  timeout=self.timeout,
                                  proxies=self.proxies)
        except requests.exceptions.ConnectionError:
            self.proxies = Proxies().get_proxies()
            logging.error("get method self.proxies changed proxies = %s\n" %
                          (self.proxies))
            return self.crawl_page_by_url(url)
        except requests.exceptions.Timeout:
            self.timeout += 5
            logging.error(
                "get method self.timeout plus timeout = %d, proxies= %s\n" %
                (self.timeout, self.proxies))
            return self.crawl_page_by_url(url)
        except Exception as e:
            logging.error("Other exception occured!type e = %s, proxies=%s\n" %
                          (type(e), self.proxies))
        return resp

    def crawl_page_by_url_post(self, url, data):
        resp = None
        try:
            resp = self.reqst.post(url,
                                   data,
                                   timeout=self.timeout,
                                   proxies=self.proxies)
        except requests.exceptions.ConnectionError:
            self.proxies = Proxies().get_proxies()
            logging.error("post method self.proxies changed. proxies =  %s\n" %
                          (self.proxies))
            return self.crawl_page_by_url_post(url, data)
        except requests.exceptions.Timeout:
            self.timeout += 5
            logging.error(
                "post method self.timeout plus, timeout= %d, proxies= %s\n" %
                (self.timeout, self.proxies))
            return self.crawl_page_by_url_post(url, data)
        except Exception as e:
            logging.error(
                "Other exception occured!type e = %s, proxies=%s \n" %
                (type(e), self.proxies))
        return resp

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        resp = self.crawl_page_by_url(self.urls['official_site'])
        if resp.status_code != 200:
            logging.error('failed to get official site page!')
            return False
        count = 0
        while count < 15:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                logging.error(
                    'failed to get crackcode result, fail count = %d' %
                    (count))
                continue

            post_data = {
                'currentTimeMillis': self.time_stamp,
                'credit_ticket': self.credit_ticket,
                'checkcode': ckcode[1],
                'keyword': self.ent_number
            }
            next_url = self.urls['post_checkcode']
            resp = self.crawl_page_by_url_post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to get crackcode image by url %s, fail count = %d'
                    % (next_url, count))
                continue

            logging.error('crack code = %s, %s, response =  %s' %
                          (ckcode[0], ckcode[1], resp.content))

            if resp.content == 'fail':
                logging.error(
                    'crack checkcode failed, response content = failed, total fail count = %d'
                    % count)
                time.sleep(random.uniform(0.1, 2))
                continue

            next_url = self.urls['open_info_entry']
            resp = self.crawl_page_by_url_post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to open info entry by url %s, fail count = %d' %
                    (next_url, count))
                continue

            crack_result = self.parse_post_check_page(resp.content)
            if crack_result:
                return True
            else:
                logging.error('crack checkcode failed, total fail count = %d' %
                              count)
            time.sleep(random.uniform(3, 5))
        return False

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息页面
        """
        for item in (
                'ind_comm_pub_reg_basic',  # registration - basic info
                'ind_comm_pub_reg_shareholder',  # shareholder info
                'ind_comm_pub_reg_modify',
                'ind_comm_pub_arch_key_persons',  # filings - key personnel
                'ind_comm_pub_arch_branch',  # filings - branches
                'ind_comm_pub_arch_liquidation',  # filings - liquidation
                'ind_comm_pub_movable_property_reg',  # chattel mortgage registrations
                'ind_comm_pub_equity_ownership_reg',  # equity pledge registrations
                'ind_comm_pub_administration_sanction',  # administrative penalties
                'ind_comm_pub_business_exception',  # abnormal operations
                'ind_comm_pub_serious_violate_law',  # serious violations
                'ind_comm_pub_spot_check'  # spot checks
        ):
            self.get_page_json_data(item, 1)
        time.sleep(random.uniform(0, 3))

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息页面
        """
        for item in (
                'ent_pub_ent_annual_report',
                'ent_pub_shareholder_capital_contribution',  # investor capital contribution ratios
                'ent_pub_equity_change',  # equity changes
                'ent_pub_administration_license',  # administrative licenses
                'ent_pub_knowledge_property',  # intellectual property pledge registrations
                'ent_pub_administration_sanction'  # administrative penalties
        ):
            self.get_page_json_data(item, 2)
        time.sleep(random.uniform(0, 3))

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息页面
        """
        for item in (
                'other_dept_pub_administration_license',  # administrative licenses
                'other_dept_pub_administration_sanction'  # administrative penalties
        ):
            self.get_page_json_data(item, 3)

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息页面
        """
        pass

    def get_page_json_data(self, page_name, page_type):
        """获得页面的解析后的json格式数据
        Args:
            page_name: 页面名称
            page_type: 页面类型, 1 工商公示页面, 2 企业公示页面, 3 其他部门公示页面
        """
        page = self.get_page(page_name, page_type)
        pages = self.get_all_pages_of_a_section(page, page_name)
        if len(pages) == 1:
            self.json_dict[page_name] = {}
            json_data = self.parser.parse_page(page, page_name)
            if json_data:
                self.json_dict[page_name] = json_data
        else:
            self.json_dict[page_name] = []
            for p in pages:
                json_data = self.parser.parse_page(p, page_name)
                if json_data:
                    self.json_dict[page_name] += json_data

    def get_checkcode_url(self):
        count = 0
        while count < 5:
            count += 1
            resp = self.crawl_page_by_url(self.urls['official_site'])
            time.sleep(random.uniform(1, 5))
            if resp.status_code != 200:
                logging.error('failed to get crackcode url')
                continue
            response = resp.content
            soup = BeautifulSoup(response, 'html.parser')
            ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src')
            ckimg_src = str(ckimg_src)
            re_checkcode_captcha = re.compile(r'/([\s\S]*)\?currentTimeMillis')
            # re_currenttime_millis=re.compile(r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)')
            checkcode_type = re_checkcode_captcha.findall(ckimg_src)[0]

            if checkcode_type == 'CheckCodeCaptcha':
                #parse the pre check page, get useful information
                self.parse_pre_check_page(response)
                checkcode_url = self.urls['get_checkcode'] + ckimg_src
                return checkcode_url

            # elif checkcode_type == 'CheckCodeYunSuan':
            logging.error(
                'can not get CheckCodeCaptcha type of checkcode img, count times = %d \n'
                % (count))
        return None

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,获取必要的信息
        """
        if page == 'fail':
            logging.error('checkcode error!')
            # if senting_open:
            #     senting_client.captureMessage('checkcode error!')
            return False

        soup = BeautifulSoup(page, 'html.parser')
        r = soup.find_all('a', {
            'href': "#",
            'onclick': re.compile(r'openEntInfo')
        })

        ent = ''
        if r:
            ent = r[0]['onclick']
        else:
            logging.error('fail to find openEntInfo')
            return False

        m = re.search(r'\'([\w]*)\'[ ,]+\'([\w]*)\'[ ,]+\'([\w]*)\'', ent)
        if m:
            self.ent_id = m.group(1)
            self.credit_ticket = m.group(3)

        r = soup.find_all(
            'input', {
                'type': "hidden",
                'name': "currentTimeMillis",
                'id': "currentTimeMillis"
            })
        if r:
            self.time_stamp = r[0]['value']
        else:
            logging.error('fail to get time stamp')
        return True

    def parse_pre_check_page(self, page):
        """解析提交验证码之前的页面
        """
        soup = BeautifulSoup(page, 'html.parser')
        ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src')
        ckimg_src = str(ckimg_src)
        re_currenttime_millis = re.compile(
            r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)')
        self.credit_ticket = soup.find_all('input',
                                           id='credit_ticket')[0].get('value')
        self.time_stamp = re_currenttime_millis.findall(ckimg_src)[0]
        # self.time_stamp = self.generate_time_stamp()

    """
    def crawl_page_by_url(self, url):
        resp = self.crawl_page_by_url(url)
        if resp.status_code != 200:
            logging.error('failed to crawl page by url' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        # if saveingtml:
        #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
        return page
    """

    def get_all_pages_of_a_section(self, page, type, url=None):
        """获取页面上含有 上一页、下一页跳转链接的区域的所有的数据
        Args:
            page: 已经爬取的页面
            type: 页面类型
            url: 该页面的url,默认为None,因为一般可以通过 type 从 BeijingCrawler.urls 中找到
        Returns:
            pages: 所有页面的列表
        """
        if not page:
            return page
        soup = BeautifulSoup(page, 'html.parser')
        page_count = 0
        page_size = 0
        pages_data = []
        pages_data.append(page)
        r1 = soup.find_all('input', {'type': 'hidden', 'id': 'pagescount'})
        r2 = soup.find_all('input', {
            'type': 'hidden',
            'id': 'pageSize',
            'name': 'pageSize'
        })
        if r1 and r2:
            page_count = int(r1[0].get('value'))
            page_size = int(r2[0].get('value'))
        else:
            # only one page
            return pages_data

        if page_count <= 1:
            return pages_data

        if not url:
            next_url = self.urls[type].rstrip('?')
        else:
            next_url = url

        for p in range(1, page_count):
            post_data = {
                'pageNos': str(p + 1),
                'clear': '',
                'pageNo': str(p),
                'pageSize': str(page_size),
                'ent_id': self.ent_id
            }
            try:
                resp = self.crawl_page_by_url_post(next_url, data=post_data)
                if resp.status_code != 200:
                    logging.error('failed to get all page of a section')
                    return pages_data
                page = resp.content
                time.sleep(random.uniform(0.2, 1))
            except Exception as e:
                logging.error(
                    'open new tab page failed, url = %s, page_num = %d' %
                    (next_url, p + 1))
                page = None
                raise e
            finally:
                if page:
                    pages_data.append(page)
        return pages_data

    def get_page(self, type, tab):
        """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响
        Args:
            tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息
        """
        url = CrawlerUtils.add_params_to_url(
            self.urls[type], {
                'entId': self.ent_id,
                'ent_id': self.ent_id,
                'entid': self.ent_id,
                'credit_ticket': self.credit_ticket,
                'entNo': self.ent_number,
                'entName': '',
                'timeStamp': self.generate_time_stamp(),
                'clear': 'true',
                'str': tab
            })
        logging.error('get %s, url:\n%s\n' % (type, url))
        resp = self.crawl_page_by_url(url)
        if resp.status_code != 200:
            logging.error('get page failed by url %s' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        return page

    def crack_checkcode(self):
        """破解验证码"""
        ckcode = ('', '')
        checkcode_url = self.get_checkcode_url()
        if checkcode_url == None:
            return ckcode
        resp = self.crawl_page_by_url(checkcode_url)
        if resp.status_code != 200:
            logging.error('failed to get checkcode img')
            return ckcode
        page = resp.content
        time.sleep(random.uniform(1, 2))
        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(page)
        if not self.code_cracker:
            logging.error('invalid code cracker\n')
            return ckcode
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.error('exception occurred when cracking checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()
        return ckcode

    def generate_time_stamp(self):
        """生成时间戳
        """
        return int(time.time())
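
A minimal usage sketch for the class above, assuming Crawler.run drives one crawl per registration number (as the run methods in these examples suggest) and using a placeholder number; the driver itself is hypothetical.

# Hypothetical driver; BeijingCrawler is defined above, import it from wherever this project places it.
if __name__ == '__main__':
    crawler = BeijingCrawler('./enterprise_crawler')  # json restore path, as used by __init__ above
    ok = crawler.run('110000000000000')               # placeholder registration number
    print('crawl finished: %s' % ok)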
Example #18
class HeilongjiangClawer(Crawler):
    """黑龙江工商公示信息网页爬虫
    """
    # html数据的存储路径
    html_restore_path = settings.json_restore_path + '/heilongjiang/'

    # 验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/heilongjiang/ckcode.jpg'
    code_cracker = CaptchaRecognition('heilongjiang')
    # 多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.hljaic.gov.cn',
        'get_checkcode':
        'http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0',
        'post_checkcode':
        'http://gsxt.hljaic.gov.cn/checkCheckNo.jspx',
        'get_info_entry':
        'http://gsxt.hljaic.gov.cn/searchList.jspx',
        'ind_comm_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=',
        'ent_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/otherDepartment.jspx?id=',
        'judical_assist_skeleton':
        'http://gsxt.hljaic.gov.cn/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder':
        'http://gsxt.hljaic.gov.cn/QueryInvList.jspx?',  # shareholder info
        'ind_comm_pub_reg_modify':
        'http://gsxt.hljaic.gov.cn/QueryAltList.jspx?',  # change records, paginated
        'ind_comm_pub_arch_key_persons':
        'http://gsxt.hljaic.gov.cn/QueryMemList.jspx?',  # key personnel, paginated
        'ind_comm_pub_spot_check':
        'http://gsxt.hljaic.gov.cn/QuerySpotCheckList.jspx?',  # spot-check info, paginated
        'ind_comm_pub_movable_property_reg':
        'http://gsxt.hljaic.gov.cn/QueryMortList.jspx?',  # chattel mortgage registrations, paginated
        'ind_comm_pub_business_exception':
        'http://gsxt.hljaic.gov.cn/QueryExcList.jspx?',  # abnormal operation info
        'shareholder_detail':
        'http://gsxt.hljaic.gov.cn/queryInvDetailAction.jspx?id=',  # investor detail
        'movable_property_reg_detail':
        'http://gsxt.hljaic.gov.cn/mortInfoDetail.jspx?id=',  # chattel mortgage registration detail
        'annual_report':
        'http://gsxt.hljaic.gov.cn/QueryYearExamineDetail.jspx?id=',  # annual report detail
    }

    def __init__(self, json_restore_path):
        self.json_restore_path = json_restore_path
        self.parser = HeilongjiangParser(self)
        self.img_count = 1
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

    def run(self, ent_number=0):
        """爬取的主函数
        """
        return Crawler.run(self, ent_number)
        # return super(HeilongjiangClawer, self).run(ent_number)

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        count = 0
        while count < 10:
            ck_code = self.crack_check_code()

            data = {'checkNo': ck_code}
            resp = self.reqst.post(self.urls['post_checkcode'], data=data)

            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                count += 1
                continue
            if resp.content[10] == 't':
                data = {'checkNo': ck_code, 'entName': self.ent_number}
                resp = self.reqst.post(self.urls['get_info_entry'], data=data)
                soup = BeautifulSoup(resp.text, "html5lib")
                div = soup.find("div", {"style": "height:500px;"})
                a = div.find("a")
                if a:
                    company_id = a["href"].split('?')[1]
                    self.company_id = company_id.split("=")[1]
                    return True
                else:
                    return False
            else:
                logging.error("crawl post check page failed!")
                count += 1
                continue
        return False

    def crack_check_code(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(self.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None

        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when cracking checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()

        return ckcode[1]

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息
        """
        url = "%s%s" % (self.urls['ind_comm_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ind_comm_pub_skeleton')
        self.parser.parse_ind_comm_pub_pages(resp.content)

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息
        """
        url = "%s%s" % (self.urls['ent_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ent_pub_skeleton')
        self.parser.parse_ent_pub_pages(resp.content)

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息
        """
        url = "%s%s" % (self.urls['other_dept_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get other_dept_pub_skeleton')
        self.parser.crawl_other_dept_pub_pages(resp.content)

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息
        """
        url = "%s%s" % (self.urls['judical_assist_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get judical_assist_skeleton')
        self.parser.parse_judical_assist_pub_pages(resp.content)
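
The captcha handling in these crawlers is uniform: fetch the captcha image, write it to ckcode_image_path while holding write_file_mutex, then call CaptchaRecognition.predict_result, whose second element is the recognized text (see crack_check_code above). A standalone sketch of that step follows; the function name crack_captcha is hypothetical and it assumes the project's CaptchaRecognition module and a reachable captcha URL.

import threading

import requests
from CaptchaRecognition import CaptchaRecognition  # project-local module

write_file_mutex = threading.Lock()


def crack_captcha(session, image_url, image_path, province='heilongjiang'):
    # Download the captcha image with the caller's session
    resp = session.get(image_url)
    if resp.status_code != 200:
        return None
    cracker = CaptchaRecognition(province)
    write_file_mutex.acquire()
    try:
        with open(image_path, 'wb') as f:
            f.write(resp.content)
        # predict_result returns a pair; index 1 is the recognized text
        ckcode = cracker.predict_result(image_path)
    except Exception:
        ckcode = ('', '')
    finally:
        write_file_mutex.release()
    return ckcode[1]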
Example #19
class LiaoningCrawler(Crawler):
    """辽宁工商爬虫 , 集成Crawler基类 """
    code_cracker = CaptchaRecognition('liaoning')
    # lock to protect writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'http://www.lngs.gov.cn/ecdomain/framework/lngs/index.jsp',
        'get_checkcode':
        'http://gsxt.lngs.gov.cn/saicpub/commonsSC/loginDC/securityCode.action?',
        'post_checkcode':
        'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/lngsSearchFpc.action',
    }

    def __init__(self, json_restore_path=None):
        super(LiaoningCrawler, self).__init__()
        self.json_restore_path = json_restore_path
        # path to store the html data
        self.html_restore_path = self.json_restore_path + '/liaoning/'

        # path to store the captcha image
        self.ckcode_image_path = self.json_restore_path + '/liaoning/ckcode.jpg'
        self.parser = LiaoningParser(self)
        self.proxies = get_proxy('liaoning')
        self.timeout = (30, 20)

    def run(self, _ent):
        """爬取的主函数 """
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        if self.proxies:
            print self.proxies
            self.reqst.proxies = self.proxies
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        return Crawler.run(self, _ent)

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        count = 0
        while count < 30:
            count += 1

            ckcode = self.crack_checkcode()
            logging.debug('crack code = %s, %s' % (ckcode[0], ckcode[1]))
            if ckcode == "":
                continue
            post_data = {
                'authCode': ckcode[1],
                'solrCondition': self._ent,
            }

            next_url = self.urls['post_checkcode']
            resp = self.reqst.post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to get crackcode image by url %s, fail count = %d'
                    % (next_url, count))
                continue
            m = re.search(r'codevalidator= \"(.*)\"', resp.content)

            status = m.group(1)
            if status == "fail":
                continue
            else:
                # m = re.search(r'searchList_paging\[(.*)\]', resp.content)
                # if m.group(1) != "null":
                #     js ='[' + m.group(1) +']'
                #     results = json.loads(js)
                #     for item in results:
                #         if item['entname'] == self._ent or item['regno']== self._ent:
                #             self.ents[self._ent] =

                # self.type = js["enttype"]
                # self.pripid = js['pripid']
                # self.entname = js["entname"]
                # self.optstate = js["optstate"]
                # self.regno = js["regno"]
                if self.analyze_showInfo(resp.content):
                    return True
                else:
                    logging.error(
                        'crack checkcode failed, total fail count = %d' %
                        (count))

            time.sleep(random.uniform(1, 4))
        return False

    def analyze_showInfo(self, page):
        """ 判断是否成功搜索页面
            分析 展示页面, 获得搜索到的企业列表
        """
        m = re.search(r'searchList_paging.*\[(.*)\]', page)
        if m:
            if m.group(1) != "null":
                js = '[' + m.group(1) + ']'
                results = json.loads(js)
                onclick = "openDetail('%s', '%s', '%s', '%s', '%s', '%s')"
                Ent = {}
                count = 0
                for item in results:
                    count += 1
                    if item['entname'] == self._ent or item[
                            'regno'] == self._ent:
                        Ent.clear()
                        Ent[item['regno']] = onclick % (
                            item['regno'], item['enttype'], item['pripid'],
                            item['entname'], item['optstate'], 'undefined')
                        break
                    else:
                        Ent[item['regno']] = onclick % (
                            item['regno'], item['enttype'], item['pripid'],
                            item['entname'], item['optstate'], 'undefined')
                    if count == 3:
                        break
                self.ents = Ent
                return True
            return False
        else:
            return False

    def crack_checkcode(self):
        """破解验证码"""
        checkcode_url = self.urls['get_checkcode']

        resp = self.reqst.get(checkcode_url, verify=False)
        if resp.status_code != 200:
            logging.warn('failed to get checkcode img')
            return
        page = resp.content

        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(page)
        if not self.code_cracker:
            print 'invalid code cracker'
            return ''
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when cracking checkcode')
            ckcode = ('', '')
            os.remove(self.ckcode_image_path)
            time.sleep(10)
        finally:
            pass
        self.write_file_mutex.release()
        f.close()
        return ckcode

    @exe_time
    def crawl_ind_comm_pub_pages(self, *args, **kwargs):
        """爬取工商公示信息
        """
        if not len(args): return
        url = args[0]
        results = re.findall('\'(.*?)\'', url)
        self.regno = results[0]
        self.type = results[1]
        self.pripid = results[2]
        self.entname = results[3]
        self.optstate = results[4]
        self.revdate = results[5]

        self.parser.parse_ind_comm_pub_pages()

    @exe_time
    def crawl_ent_pub_pages(self, *args, **kwargs):
        """爬取企业公示信息
        """

        self.parser.parse_ent_pub_pages()

    @exe_time
    def crawl_other_dept_pub_pages(self, *args, **kwargs):
        """爬取其他部门公示信息
        """

        self.parser.crawl_other_dept_pub_pages()

    @exe_time
    def crawl_judical_assist_pub_pages(self, *args, **kwargs):
        """爬取司法协助信息
        """

        self.parser.parse_judical_assist_pub_pages()
Example #20
class LiaoningCrawler(Crawler):
    """辽宁工商爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/liaoning/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/liaoning/ckcode.jpg'
    code_cracker = CaptchaRecognition('liaoning')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'http://www.lngs.gov.cn/ecdomain/framework/lngs/index.jsp',
        'get_checkcode':
        'http://gsxt.lngs.gov.cn/saicpub/commonsSC/loginDC/securityCode.action?',
        'post_checkcode':
        'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/lngsSearchFpc.action',
    }

    def __init__(self, json_restore_path):
        self.json_restore_path = json_restore_path
        self.parser = LiaoningParser(self)
        self.img_count = 1
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

    def run(self, ent_number=0):
        """爬取的主函数
        """

        return Crawler.run(self, ent_number)

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        count = 0
        while count < 30:
            count += 1

            ckcode = self.crack_checkcode()
            logging.info('crack code = %s, %s' % (ckcode[0], ckcode[1]))
            if ckcode == "":
                continue
            post_data = {
                'authCode': ckcode[1],
                'solrCondition': self.ent_number,
            }

            next_url = self.urls['post_checkcode']
            resp = self.reqst.post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to get crackcode image by url %s, fail count = %d'
                    % (next_url, count))
                continue
            # logging.info('crack code = %s, %s, response =  %s' %(ckcode[0], ckcode[1], resp.content))
            m = re.search(r'codevalidator= \"(.*)\"', resp.content)

            status = m.group(1)
            if status == "fail":
                continue
            else:
                m = re.search(r'searchList_paging\(\[(.*)\]', resp.content)
                if m.group(1) != "null":

                    js = json.loads(m.group(1))
                    self.type = js["enttype"]
                    self.pripid = js['pripid']
                    self.entname = js["entname"]
                    self.optstate = js["optstate"]
                    self.regno = js["regno"]

                    return True
                else:
                    logging.error(
                        'crack checkcode failed, total fail count = %d' %
                        (count))

            time.sleep(random.uniform(2, 4))
        return False

    def crack_checkcode(self):
        """破解验证码"""
        checkcode_url = self.urls['get_checkcode']

        resp = self.reqst.get(checkcode_url, verify=False)
        if resp.status_code != 200:
            logging.warn('failed to get checkcode img')
            return ('', '')
        page = resp.content

        ckcode = ('', '')
        self.write_file_mutex.acquire()
        try:
            with open(self.ckcode_image_path, 'wb') as f:
                f.write(page)
            if not self.code_cracker:
                logging.warn('invalid code cracker')
                return ckcode
            try:
                ckcode = self.code_cracker.predict_result(
                    self.ckcode_image_path)
            except Exception:
                logging.warn('exception occurred when cracking checkcode')
                ckcode = ('', '')
                os.remove(self.ckcode_image_path)
                time.sleep(10)
        finally:
            # always release the lock, even on an early return
            self.write_file_mutex.release()
        return ckcode

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息
        """
        self.parser.parse_ind_comm_pub_pages()

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息
        """
        self.parser.parse_ent_pub_pages()

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息
        """
        self.parser.parse_other_dept_pub_pages()

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息
        """
        self.parser.parse_judical_assist_pub_pages()
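
# A minimal driver sketch, not part of the original example: it assumes the
# surrounding project provides settings.json_restore_path, CaptchaRecognition
# and a base Crawler whose run() calls the crawl_* hooks defined above. The
# restore path and registration number below are purely hypothetical values.
if __name__ == '__main__':
    crawler = LiaoningCrawler('./data')
    crawler.run(ent_number=u'210200000012345')  # hypothetical registration number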
Пример #21
0
class JiangsuCrawler(Crawler):
    """江苏工商公示信息网页爬虫
    """
    code_cracker = CaptchaRecognition('jiangsu')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.jsgsj.gov.cn',
        'official_site':
        'http://www.jsgsj.gov.cn:58888/province/',
        'get_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7',
        'post_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?query_info=true',
        'ind_comm_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp',
        'ent_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp',
        'other_dept_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp',
        'judical_assist_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp',
        'annual_report_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp',
        'ci_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true',
        'common_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true',
        'nb_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true',
        'ci_detail':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true'
    }

    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
             需要在写入文件的时候加锁
        Returns:
        """
        super(JiangsuCrawler, self).__init__()
        self.json_restore_path = json_restore_path
        #html数据的存储路径
        self.html_restore_path = self.json_restore_path + '/jiangsu/'

        #验证码图片的存储路径
        self.ckcode_image_path = self.json_restore_path + '/jiangsu/ckcode.jpg'

        # self.proxies = {}
        proxies = get_proxy('jiangsu')
        if proxies:
            print proxies
            self.reqst.proxies = proxies
        self.timeout = (30, 20)
        self.parser = JiangsuParser(self)

        self.corp_org = ''
        self.corp_id = ''
        self.corp_seq_id = ''
        self.common_enter_post_data = {}
        self.ci_enter_post_data = {}
        self.nb_enter_post_data = {}
        self.post_info = {
            'ind_comm_pub_reg_basic': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter',
                'specificQuery': 'basicInfo'
            },
            'ind_comm_pub_reg_shareholder': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'investmentInfor'
            },
            'ind_comm_pub_reg_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'biangeng'
            },
            'ind_comm_pub_arch_key_persons': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'personnelInformation'
            },
            'ind_comm_pub_arch_branch': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'branchOfficeInfor'
            },
            #'ind_comm_pub_arch_liquadition': {'url_type': 'ci_enter', 'post_type': 'common_enter', 'specificQuery': 'qsfzr'},
            'ind_comm_pub_movable_property_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'dongchan'
            },
            'ind_comm_pub_equity_ownership_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'guquanchuzhi'
            },
            'ind_comm_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'chufa'
            },
            'ind_comm_pub_business_exception': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'abnormalInfor'
            },
            #'ind_comm_pub_serious_violate_law': {'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xxx'},
            'ind_comm_pub_spot_check': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'checkup'
            },
            'ind_comm_pub_reg_shareholder_detail': {
                'url_type': 'ci_detail',
                'post_type': 'ci_detail',
                'specificQuery': 'investorInfor'
            },
            'ent_pub_annual_report': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_report_list'
            },
            'annual_report_detail': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter'
            },
            'ent_pub_shareholder_capital_contribution': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_tzcz'
            },
            'ent_pub_administrative_license': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzxk'
            },
            'ent_pub_knowledge_property': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_zscq'
            },
            'ent_pub_administration_sanction': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzcf'
            },
            'other_dept_pub_administration_license': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzheng'
            },
            'other_dept_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzhengchufa'
            },
            'judical_assist_pub_equity_freeze': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gqdjList'
            },
            'judical_assist_pub_shareholder_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gdbgList'
            }
        }

    def run(self, ent_number):
        # print self.__class__.__name__
        # logging.error('crawl %s.', self.__class__.__name__)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        self.ent_number = ent_number
        return Crawler.run(self, self.ent_number)

    #分析 展示页面, 获得搜索到的企业列表
    def analyze_showInfo(self, page):
        Ent = {}
        soup = BeautifulSoup(page, "html5lib")
        dts = soup.find_all("dt")
        dds = soup.find_all('dd')
        if len(dts) != len(dds):
            return False
        count = 0
        for i in xrange(len(dts)):
            dt = dts[i]
            profile = dds[i]
            count += 1
            url = ""
            ent = ""
            name = ""
            link = dt.find('a')
            if link and link.has_attr('onclick'):
                url = link['onclick']
                name = link.get_text().strip()

            if profile and profile.span:
                ent = profile.span.get_text().strip()
            if name == self.ent_number:
                Ent.clear()
                Ent[ent] = url
                break
            if count == 3:
                break
            Ent[ent] = url
        self.ents = Ent
        return True

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        resp = self.request_by_method('GET',
                                      self.urls['official_site'],
                                      timeout=self.timeout)
        if not resp:
            logging.error("crawl the first page page failed!\n")
            return False
        count = 0
        while count < 15:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                logging.error("crawl checkcode failed! count number = %d\n" %
                              (count))
                continue
            data = {'name': self.ent_number, 'verifyCode': ckcode[1]}
            resp = self.request_by_method('POST',
                                          self.urls['post_checkcode'],
                                          data=data,
                                          timeout=self.timeout)
            # print resp
            if not resp:
                logging.error(
                    "post checkcode request failed! count number = %d\n" %
                    (count))
                time.sleep(random.uniform(3, 5))
                continue
            results = json.loads(resp)[0]
            if results['INFO']:
                if self.analyze_showInfo(results['INFO']):
                    return True
            else:
                if results['COUNT'] == '>> 没有符合查询条件的结果 <<':
                    break
                logging.error(
                    "crawl post check page failed! count number = %d\n" %
                    (count))
            print "crawl post check page failed! count number = %d\n" % (count)
            time.sleep(random.uniform(3, 5))
        return False

    def get_page_data(self, page_name, real_post_data=None):
        """获取页面数据,通过页面名称,和post_data, 江苏的页面中几乎全部都是post方式来获取数据
        """
        url = self.urls[self.post_info[page_name].get('url_type')]
        logging.info('get %s, url:\n%s\n' % (page_name, url))
        if real_post_data:
            return self.get_pages(url, real_post_data)

        if self.post_info[page_name].get('post_type') == 'ci_enter':
            self.ci_enter_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_enter_post_data
        elif self.post_info[page_name].get(
                'post_type') == 'ci_enter_with_recordline':
            self.ci_enter_with_record_line_post_data[
                'specificQuery'] = self.post_info[page_name].get(
                    'specificQuery')
            post_data = self.ci_enter_with_record_line_post_data
        elif self.post_info[page_name].get('post_type') == 'common_enter':
            self.common_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.common_enter_post_data
        elif self.post_info[page_name].get('post_type') == 'ci_detail':
            self.ci_detail_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_detail_post_data
        elif self.post_info[page_name].get('post_type') == 'nb_enter':
            self.nb_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.nb_enter_post_data
        return self.get_pages(url, post_data)

    def crawl_ind_comm_pub_pages(self, url):
        """爬取工商公示信息
        """
        if not self.parse_post_check_page(url):
            return
        if not self.parser.ind_comm_pub_skeleton_built:
            page = self.crawl_skeleton_page('ind_comm_pub_skeleton')
            if not page:
                logging.error('crawl ind comm pub skeleton failed!')
                return False
            self.parser.parse_page('ind_comm_pub_skeleton', page)

        for item in (
                'ind_comm_pub_reg_basic',  # 登记信息-基本信息
                'ind_comm_pub_reg_shareholder',  # 股东信息
                'ind_comm_pub_reg_modify',
                'ind_comm_pub_arch_key_persons',  # 备案信息-主要人员信息
                'ind_comm_pub_arch_branch',  # 备案信息-分支机构信息
                #'ind_comm_pub_arch_liquidation', # 备案信息-清算信息, 网页中没有
                'ind_comm_pub_movable_property_reg',  # 动产抵押登记信息
                #'ind_comm_pub_equity_ownership_reg', # 股权出置登记信息
                'ind_comm_pub_administration_sanction',  # 行政处罚信息
                #'ind_comm_pub_business_exception',  # 经营异常信息 , 网页中不存在
                #'ind_comm_pub_serious_violate_law',  # 严重违法信息
                'ind_comm_pub_spot_check'):  # 抽查检查信息

            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_ent_pub_pages(self, url):
        """爬取企业公示信息
        """
        if not self.parser.ent_pub_skeleton_built:
            page = self.crawl_skeleton_page('ent_pub_skeleton')
            if not page:
                logging.error('crawl ent pub skeleton failed!')
                return False
            self.parser.parse_page('ent_pub_skeleton', page)

        if not self.parser.annual_report_skeleton_built:
            page = self.crawl_skeleton_page('annual_report_skeleton')
            if not page:
                logging.error('crawl annual report skeleton failed!')
                return False
            self.parser.parse_page('annual_report_skeleton', page)

        for item in (
                'ent_pub_annual_report',
                #'ent_pub_shareholder_capital_contribution', #企业投资人出资比例
                #'ent_pub_equity_change', #股权变更信息
                'ent_pub_administrative_license',  #行政许可信息
                'ent_pub_knowledge_property',  #知识产权出资登记
                #'ent_pub_administration_sanction' #行政许可信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_other_dept_pub_pages(self, url):
        """爬取其他部门公示信息
        """
        if not self.parser.other_dept_pub_skeleton_built:
            page = self.crawl_skeleton_page('other_dept_pub_skeleton')
            if not page:
                logging.error('crawl other dept pub skeleton failed!')
                return False
            self.parser.parse_page('other_dept_pub_skeleton', page)

        for item in (
                'other_dept_pub_administration_license',  #行政许可信息
                'other_dept_pub_administration_sanction'  #行政处罚信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_judical_assist_pub_pages(self, url):
        """爬取司法协助信息
        """
        if not self.parser.judical_assist_pub_skeleton_built:
            page = self.crawl_skeleton_page('judical_assist_pub_skeleton')
            if not page:
                logging.error('crawl judical assist skeleton failed!')
                return False
            self.parser.parse_page('judical_assist_pub_skeleton', page)

        for item in (
                'judical_assist_pub_equity_freeze',  #股权冻结信息
                'judical_assist_pub_shareholder_modify'  #股东变更信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def get_pages(self, url, post_data):
        """获取网页数据
        Args:
            url: url地址
            post_data: post方式获取数据,返回的如果是一个列表,则将列表的所有元素都获得才返回
        Returns:
        """
        resp = self.request_by_method('POST',
                                      url,
                                      data=post_data,
                                      timeout=self.timeout)
        if not resp:
            logging.error('get all pages of a section failed!')
            return
        else:
            json_obj = json.loads(resp)
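            # the first POST only returns the default 5 records; when the
            # reported 'total' is larger, repeat the request with
            # pageSize=total so the whole section comes back in one response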
            if type(json_obj) == dict and json_obj.get(
                    'total', None) and int(json_obj.get('total')) > 5:
                post_data['pageSize'] = json_obj.get('total')
                resp = self.request_by_method('POST',
                                              url,
                                              data=post_data,
                                              timeout=self.timeout)
                if not resp:
                    logging.error('get all pages of a section failed!')
                    return
        return resp

    def crawl_skeleton_page(self, name):
        """爬取网页表格的框架页面,在江苏的网页中, 工商公示信息, 企业公示信息,其他部门公示信息,司法协助信息
        所有的tab页面中的表格结构都在一个最开始的页面中给出
        """
        url = self.urls[name]
        post_data = {
            'org': self.corp_org,
            'id': self.corp_id,
            'seq_id': self.corp_seq_id,
            'reg_no': self.ent_number,
            'name': self.ent_number,
            'containContextPath': 'ecipplatform',
            'corp_name': self.ent_number
        }
        resp = self.request_by_method('POST',
                                      url,
                                      data=post_data,
                                      timeout=self.timeout)
        if not resp:
            logging.error('crawl %s page failed, error code.\n' % (name))
            return False
        return resp

    def parse_post_check_page(self, link):
        """解析提交验证码之后的页面,提取所需要的信息,比如corp id等
        Args:
            page: 提交验证码之后的页面
        """
        m = re.findall(r"'(.*?)'", link)
        # queryInfor('/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp','876','126075','69','320000000000192','','ecipplatform')
        if m and len(m) >= 4:
            self.corp_org = m[1]
            self.corp_id = m[2]
            self.corp_seq_id = m[3]
            self.common_enter_post_data = {
                'showRecordLine': '1',
                'specificQuery': 'commonQuery',
                'propertiesName': '',
                'corp_org': self.corp_org,
                'corp_id': self.corp_id,
                'pageNo': '1',
                'pageSize': '5'
            }
            self.ci_enter_post_data = {
                'org': self.corp_org,
                'id': self.corp_id,
                'seq_id': self.corp_seq_id,
                'specificQuery': ''
            }
            self.ci_enter_with_record_line_post_data = {
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'CORP_SEQ_ID': self.corp_seq_id,
                'specificQuery': '',
                'pageNo': '1',
                'pageSize': '5',
                'showRecordLine': '1'
            }
            self.ci_detail_post_data = {
                'ORG': self.corp_org,
                'ID': '',
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'SEQ_ID': '',
                'REG_NO': self.ent_number,
                'specificQuery': ''
            }
            self.nb_enter_post_data = {
                'ID': '',
                'REG_NO': self.ent_number,
                'showRecordLine': '0',
                'specificQuery': 'gs_pb',
                'propertiesName': '',
                'pageNo': '1',
                'pageSize': '5',
                'ADMIT_MAIN': '08'
            }
            return True
        return False

    def crack_checkcode(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.request_by_method('GET',
                                      self.urls['get_checkcode'],
                                      timeout=self.timeout)
        if not resp:
            logging.error('Failed, exception occurred when getting checkcode')
            return ('', '')
        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        ckcode = ('', '')
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.error('exception occurred when cracking checkcode')
            ckcode = ('', '')
        self.write_file_mutex.release()
        return ckcode

    def request_by_method(self, method, url, *args, **kwargs):
        r = None
        try:
            r = self.reqst.request(method, url, *args, **kwargs)
        except requests.exceptions.Timeout as err:
            logging.error(u'Getting url: %s timeout. %s .' %
                          (url, err.message))
            return False
        except requests.exceptions.ConnectionError:
            logging.error(u"Getting url:%s connection error ." % (url))
            return False
        except Exception as err:
            logging.error(u'Getting url: %s exception:%s . %s .' %
                          (url, type(err), err.message))
            return False
        if r.status_code != 200:
            logging.error(
                u"Something wrong when getting url:%s , status_code=%d", url,
                r.status_code)
            return False
        return r.content

    def get_annual_report_detail(self, report_year, report_id):
        """获取企业年报的详细信息
        """
        annual_report_detail = {}
        post_data = self.nb_enter_post_data
        post_data['ID'] = report_id
        post_data['showRecordLine'] = '0'
        post_data['OPERATE_TYPE'] = '2'
        post_data['propertiesName'] = 'query_basicInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'企业基本信息'] = self.parser.parse_page(
            'annual_report_ent_basic_info', page_data)
        annual_report_detail[u'企业资产状况信息'] = self.parser.parse_page(
            'annual_report_ent_property_info', page_data)

        post_data['showRecordLine'] = '1'
        post_data['propertiesName'] = 'query_websiteInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'网站或网店信息'] = self.parser.parse_page(
            'annual_report_web_info', page_data)

        post_data['propertiesName'] = 'query_investInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外投资信息'] = self.parser.parse_page(
            'annual_report_investment_abord_info', page_data)

        post_data['MAIN_ID'] = report_id
        post_data['OPERATE_TYPE'] = '1'
        post_data['TYPE'] = 'NZGS'
        post_data['ADMIT_MAIN'] = '08'
        post_data['propertiesName'] = 'query_stockInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'股东及出资信息'] = self.parser.parse_page(
            'annual_report_shareholder_info', page_data)

        post_data['propertiesName'] = 'query_InformationSecurity'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外提供保证担保信息'] = self.parser.parse_page(
            'annual_report_external_guarantee_info', page_data)

        post_data['propertiesName'] = 'query_RevisionRecord'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'修改记录'] = self.parser.parse_page(
            'annual_report_modify_record', page_data)
        return annual_report_detail
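
# A hedged sketch of how the annual-report pieces above might be driven end to
# end. The structure returned by parse_page for 'ent_pub_annual_report' is an
# assumption here (a list of dicts with hypothetical 'year' and 'id' keys), so
# this only illustrates the intended flow, not the original author's code.
def collect_annual_reports(crawler):
    reports = {}
    page_data = crawler.get_page_data('ent_pub_annual_report')
    report_list = crawler.parser.parse_page('ent_pub_annual_report',
                                            page_data) or []
    for item in report_list:
        year = item.get('year')        # hypothetical key
        report_id = item.get('id')     # hypothetical key
        if year and report_id:
            reports[year] = crawler.get_annual_report_detail(year, report_id)
    return reports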
Пример #22
0
    def __init__(self, json_restore_path):
        # self.cur_time = str(int(time.time()*1000))
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = settings.json_restore_path + '/guangxi/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guangxi')

        self.result_json_dict = {}
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })

        self.mydict = {
            'eareName':
            'http://gxqyxygs.gov.cn',
            'search':
            'http://gxqyxygs.gov.cn/search.jspx',
            'searchList':
            'http://gxqyxygs.gov.cn/searchList.jspx',
            'validateCode':
            'http://gxqyxygs.gov.cn/validateCode.jspx?type=0&id=0.6145392225593206'
        }

        self.search_dict = {
            'eareName': 'http://gxqyxygs.gov.cn',
            'search': 'http://222.143.24.157/search.jspx',
            'validateCode':
            'http://222.143.24.157/validateCode.jspx?type=0&id=0.8720359673599201',
            'searchList': 'http://222.143.24.157/searchList.jspx',
            'businessPublicity':
            'http://222.143.24.157/businessPublicity.jspx?',
            'enterprisePublicity':
            'http://222.143.24.157/enterprisePublicity.jspx?',
            'otherDepartment': 'http://222.143.24.157/otherDepartment.jspx?',
            'justiceAssistance':
            'http://222.143.24.157/justiceAssistance.jspx?',
            'next_head': 'http://gxqyxygs.gov.cn/Query'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}
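
    # The four dictionaries above map the Chinese tab titles returned by the
    # site onto canonical result keys. A hypothetical helper, not in the
    # original example, sketching how that lookup could be applied:
    def _canonical_key(self, tab_title, tab_index):
        mapping = (self.one_dict, self.two_dict, self.three_dict,
                   self.four_dict)[tab_index - 1]
        # fall back to the raw title when a tab name has no known mapping
        return mapping.get(tab_title, tab_title)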
Пример #23
0
    def __init__(self, *args, **kwargs):
        #html数据的存储路径
        self.html_restore_path = settings.json_restore_path + '/beijing/'
        #验证码图片的存储路径
        self.ckcode_image_path = settings.json_restore_path + '/beijing/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('beijing')
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        #多线程爬取时往最后的json文件中写时的加锁保护
        self.write_file_mutex = threading.Lock()
        # self.json_restore_path = settings.json_restore_path
        self.credit_ticket = None
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        self.timeout = 20
        self.ent_id = ''

        self.urls = {
            'host':
            'http://qyxy.baic.gov.cn',
            'official_site':
            'http://qyxy.baic.gov.cn/beijing',
            'get_checkcode':
            'http://qyxy.baic.gov.cn',
            'post_checkcode':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!checkCode.dhtml',
            'open_info_entry':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!getBjQyList.dhtml',
            'ind_comm_pub_reg_basic':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!openEntInfo.dhtml?',
            'ind_comm_pub_reg_shareholder':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!tzrFrame.dhtml?',
            'ind_comm_pub_reg_modify':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!biangengFrame.dhtml?',
            'ind_comm_pub_arch_key_persons':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!zyryFrame.dhtml?',
            'ind_comm_pub_arch_branch':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!fzjgFrame.dhtml?',
            'ind_comm_pub_arch_liquidation':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!qsxxFrame.dhtml?',
            'ind_comm_pub_movable_property_reg':
            'http://qyxy.baic.gov.cn/gjjbjTab/gjjTabQueryCreditAction!dcdyFrame.dhtml?',
            'ind_comm_pub_equity_ownership_reg':
            'http://qyxy.baic.gov.cn/gdczdj/gdczdjAction!gdczdjFrame.dhtml?',
            'ind_comm_pub_administration_sanction':
            'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list.dhtml?',
            'ind_comm_pub_business_exception':
            'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_jyycxx.dhtml?',
            'ind_comm_pub_serious_violate_law':
            'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_yzwfxx.dhtml?',
            'ind_comm_pub_spot_check':
            'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_ccjcxx.dhtml?',
            'ent_pub_ent_annual_report':
            'http://qyxy.baic.gov.cn/qynb/entinfoAction!qyxx.dhtml?',
            'ent_pub_shareholder_capital_contribution':
            'http://qyxy.baic.gov.cn/gdcz/gdczAction!list_index.dhtml?',
            'ent_pub_equity_change':
            'http://qyxy.baic.gov.cn/gdgq/gdgqAction!gdgqzrxxFrame.dhtml?',
            'ent_pub_administration_license':
            'http://qyxy.baic.gov.cn/xzxk/xzxkAction!list_index.dhtml?',
            'ent_pub_knowledge_property':
            'http://qyxy.baic.gov.cn/zscqczdj/zscqczdjAction!list_index.dhtml?',
            'ent_pub_administration_sanction':
            'http://qyxy.baic.gov.cn/gdgq/gdgqAction!qyxzcfFrame.dhtml?',
            'other_dept_pub_administration_license':
            'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzxk.dhtml?',
            'other_dept_pub_administration_sanction':
            'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzcf.dhtml?',
            'shareholder_detail':
            'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!touzirenInfo.dhtml?',
            'annual_report_detail':
            'http://qyxy.baic.gov.cn/entPub/entPubAction!gdcz_bj.dhtml',
            'annual_report_detail_for_fro':
            'http://qyxy.baic.gov.cn/entPub/entPubAction!qydwdb_bj.dhtml',
            'annual_report_detail_change':
            'http://qyxy.baic.gov.cn/entPub/entPubAction!qybg_bj.dhtml',
        }
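
    # Most of the urls above end with '?' so that per-company query parameters
    # can be appended. A hypothetical helper, not in the original example,
    # sketching such a request; it assumes the class keeps a requests.Session
    # in self.reqst (as the other crawlers here do) and that the parameter
    # names 'entId' and 'credit_ticket' match what the site expects.
    def open_tab_sketch(self, url_key, ent_id):
        params = {'entId': ent_id, 'credit_ticket': self.credit_ticket}
        resp = self.reqst.get(self.urls[url_key],
                              params=params,
                              timeout=self.timeout)
        return resp.content if resp.status_code == 200 else None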
Пример #24
0
class ZongjuCrawler(Crawler):
    """总局工商爬虫
    """

    code_cracker = CaptchaRecognition('zongju')
    # 多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {'host': 'http://qyxy.saic.gov.cn',
            'official_site': 'http://gsxt.saic.gov.cn/zjgs/',
            'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=',
            'post_checkcode':
            'http://gsxt.saic.gov.cn/zjgs/security/verify_captcha',
            'get_info_entry':
            'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',    # 获得企业入口
            'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?',
    # 获得企业信息页面的url,通过指定不同的tab=1-4来选择不同的内容(工商公示,企业公示...)
            }

    def __init__(self, json_restore_path=None):
        super(ZongjuCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        # html数据的存储路径
        self.html_restore_path = self.json_restore_path + '/zongju/'
        # 验证码图片的存储路径
        self.ckcode_image_path = self.json_restore_path + '/zongju/ckcode.jpg'

        self.parser = ZongjuParser(self)
        self.proxies = get_proxy('beijing')
        self.timeout = (30, 20)

    def run(self, _ent):
        """爬取的主函数
        """

        # self.proxies = {'http':'http://123.121.30.123:8118'}
        if self.proxies:
            print self.proxies
            self.reqst.proxies = self.proxies
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        return Crawler.run(self, _ent)

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        count = 0
        next_url = self.urls['official_site']
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to get official site')
            return False
        if not self.parse_pre_check_page(resp.content):
            logging.error('failed to parse pre check page')
            return False

        while count < 30:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                continue
            post_data = {'captcha': ckcode[1], 'session.token': self.session_token}
            next_url = self.urls['post_checkcode']
            resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout, verify=False)
            if resp.status_code != 200:
                logging.error('failed to get crackcode image by url %s, fail count = %d' % (next_url, count))
                continue

            logging.error('crack code = %s, %s, response =  %s' % (ckcode[0], ckcode[1], resp.content))
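            # the verify endpoint answers '0' when the captcha text is
            # rejected, so any other response is treated as a pass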
            if resp.content == '0':
                logging.error('crack checkcode failed!count = %d' % (count))
                continue

            next_url = self.urls['get_info_entry']
            post_data = {
                'searchType': '1',
                'captcha': ckcode[1],
                'session.token': self.session_token,
                'condition.keyword': self._ent
            }

            resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout)

            if resp.status_code != 200:
                logging.error('failed to crawl url %s' % next_url)
                return False

            if self.parse_post_check_page(resp.content):
                return True
            logging.error('crack checkcode failed, total fail count = %d' % count)
            print('crack checkcode failed!count = %d' % (count))
            time.sleep(random.uniform(1, 3))

        return False

    @exe_time
    def crawl_ind_comm_pub_pages(self, *args, **kwargs):
        """爬取工商公示信息页面
        在总局的网站中,工商公示信息在一个页面中返回。页面中包含了多个表格,调用 Parser的 parse_ind_comm_page进行解析
        在 Parser的ind_comm_pub_page 中,访问并设置 crawler中的 json_dict。
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=01'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get ind comm pub info failed!')
            return False
        self.parser.parse_ind_comm_pub_pages(resp.content)

    @exe_time
    def crawl_ent_pub_pages(self, *args, **kwargs):
        """爬取企业公示信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=02'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get ent pub info failed!')
            return False
        self.parser.parse_ent_pub_pages(resp.content)

    @exe_time
    def crawl_other_dept_pub_pages(self, *args, **kwargs):
        """爬取其他部门公示信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=03'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get other dept pub info failed!')
            return False
        self.parser.parse_other_dept_pub_pages(resp.content)

    @exe_time
    def crawl_judical_assist_pub_pages(self, *args, **kwargs):
        """爬取司法协助信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=06'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get judical assist info failed!')
            return False
        self.parser.parse_judical_assist_pub_pages(resp.content)

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,获取必要的信息
        """
        soup = BeautifulSoup(page, 'html5lib')
        divs = soup.find_all('div', attrs={'class': 'list-item'})

        if divs:
            Ent = {}
            count = 0
            for div in divs:
                count += 1
                link = div.find('div', attrs={'class': 'link'})
                profile = div.find('div', attrs={'class': 'profile'})
                url = ""
                ent = ""
                anchor = link.find('a') if link else None
                if anchor and anchor.has_attr('href'):
                    url = anchor['href']
                if profile and profile.span:
                    ent = profile.span.get_text().strip()
                name = anchor.get_text().strip() if anchor else ""
                if name == self._ent:
                    Ent.clear()
                    Ent[ent] = url
                    break
                if count == 3:
                    break
                Ent[ent] = url
            self.ents = Ent
            return True
        else:
            return False

        # div_tag = soup.find('div', attrs={'class': 'link'})
        # if not div_tag:
        #     return False
        # open_info_url = div_tag.find('a').get('href')
        # m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', open_info_url)
        # if m:
        #     self.uuid = m.group(1)
        #     return True
        # else:
        #     return False

    def parse_pre_check_page(self, page):
        """解析提交验证码之前的页面
        """
        soup = BeautifulSoup(page, 'html.parser')
        input_tag = soup.find('input', attrs={'type': 'hidden', 'name': 'session.token'})
        if input_tag:
            self.session_token = input_tag.get('value')
            return True
        return False

    def crawl_page_by_url(self, url):
        """通过url直接获取页面
        """
        resp = self.reqst.get(url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to crawl page by url %s' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        # if saveingtml:
        #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
        return page

    def crack_checkcode(self):
        """破解验证码"""
        checkcode_url = self.urls['get_checkcode'] + '&ra=' + str(random.random())
        ckcode = ('', '')
        resp = self.reqst.get(checkcode_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to get checkcode img')
            return ckcode
        page = resp.content

        time.sleep(random.uniform(1, 2))

        self.write_file_mutex.acquire()
        try:
            with open(self.ckcode_image_path, 'wb') as f:
                f.write(page)
            if not self.code_cracker:
                logging.error('invalid code cracker with ckcode= None')
                return ckcode
            try:
                ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
            except Exception:
                logging.error('exception occurred when cracking checkcode')
                ckcode = ('', '')
                os.remove(self.ckcode_image_path)
        finally:
            # always release the lock, even on an early return
            self.write_file_mutex.release()
        return ckcode
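
# A minimal, hypothetical driver for ZongjuCrawler; it assumes the base
# Crawler.run() accepts the enterprise name/registration number and dispatches
# to the crawl_* hooks above, and that get_proxy/settings come from the
# surrounding project. The path and query string are placeholder values.
if __name__ == '__main__':
    crawler = ZongjuCrawler('./data')
    crawler.run(u'某示例企业名称')  # placeholder enterprise name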
Пример #25
0
class JiangsuCrawler(Crawler):
    """江苏工商公示信息网页爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/jiangsu/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg'
    code_cracker = CaptchaRecognition('jiangsu')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.jsgsj.gov.cn',
        'official_site':
        'http://www.jsgsj.gov.cn:58888/province/',
        'get_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7',
        'post_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true',
        'ind_comm_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp',
        'ent_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp',
        'other_dept_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp',
        'judical_assist_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp',
        'annual_report_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp',
        'ci_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true',
        'common_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true',
        'nb_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true',
        'ci_detail':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true'
    }

    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
             需要在写入文件的时候加锁
        Returns:
        """
        self.proxies = Proxies().get_proxies()
        self.json_restore_path = json_restore_path

        self.parser = JiangsuParser(self)
        self.reqst = requests.Session()
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })
        self.corp_org = ''
        self.corp_id = ''
        self.corp_seq_id = ''
        self.common_enter_post_data = {}
        self.ci_enter_post_data = {}
        self.nb_enter_post_data = {}
        self.post_info = {
            'ind_comm_pub_reg_basic': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter',
                'specificQuery': 'basicInfo'
            },
            'ind_comm_pub_reg_shareholder': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'investmentInfor'
            },
            'ind_comm_pub_reg_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'biangeng'
            },
            'ind_comm_pub_arch_key_persons': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'personnelInformation'
            },
            'ind_comm_pub_arch_branch': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'branchOfficeInfor'
            },
            #'ind_comm_pub_arch_liquadition': {'url_type': 'ci_enter', 'post_type': 'common_enter', 'specificQuery': 'qsfzr'},
            'ind_comm_pub_movable_property_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'dongchan'
            },
            'ind_comm_pub_equity_ownership_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'guquanchuzhi'
            },
            'ind_comm_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'chufa'
            },
            'ind_comm_pub_business_exception': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'abnormalInfor'
            },
            #'ind_comm_pub_serious_violate_law': {'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xxx'},
            'ind_comm_pub_spot_check': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'checkup'
            },
            'ind_comm_pub_reg_shareholder_detail': {
                'url_type': 'ci_detail',
                'post_type': 'ci_detail',
                'specificQuery': 'investorInfor'
            },
            'ent_pub_annual_report': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_report_list'
            },
            'annual_report_detail': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter'
            },
            'ent_pub_shareholder_capital_contribution': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_tzcz'
            },
            'ent_pub_administrative_license': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzxk'
            },
            'ent_pub_knowledge_property': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_zscq'
            },
            'ent_pub_administration_sanction': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzcf'
            },
            'other_dept_pub_administration_license': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzheng'
            },
            'other_dept_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzhengchufa'
            },
            'judical_assist_pub_equity_freeze': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gqdjList'
            },
            'judical_assist_pub_shareholder_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gdbgList'
            }
        }

    def run(self, ent_number=0):
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        return Crawler.run(self, ent_number)
        '''
        self.ent_number = str(ent_number)
        #对每个企业都指定一个html的存储目录
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.json_dict = {}

        if not self.crawl_check_page():
            settings.logger.error('crack check code failed, stop to crawl enterprise %s' % self.ent_number)
            return False

        self.crawl_ind_comm_pub_pages()
        self.crawl_ent_pub_pages()
        self.crawl_other_dept_pub_pages()
        self.crawl_judical_assist_pub_pages()

        #采用多线程,在写入文件时需要注意加锁
        self.write_file_mutex.acquire()
        CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.json_dict})
        self.write_file_mutex.release()
        return True
        '''

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        resp = self.crawl_page_by_url(self.urls['official_site'])
        if not resp:
            logging.error("crawl the first page page failed!\n")
            return False
        count = 0
        while count < 15:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                logging.error("crawl checkcode failed! count number = %d\n" %
                              (count))
                continue
            data = {'name': self.ent_number, 'verifyCode': ckcode[1]}
            resp = self.crawl_page_by_url_post(self.urls['post_checkcode'],
                                               data=data)

            if resp.find("onclick") >= 0 and self.parse_post_check_page(resp):
                return True
            else:
                logging.error(
                    "crawl post check page failed! count number = %d\n" %
                    (count))
            time.sleep(random.uniform(5, 8))
        return False

    def get_page_data(self, page_name, real_post_data=None):
        """获取页面数据,通过页面名称,和post_data, 江苏的页面中几乎全部都是post方式来获取数据
        """
        url = self.urls[self.post_info[page_name].get('url_type')]
        logging.info('get %s, url:\n%s\n' % (page_name, url))
        if real_post_data:
            return self.get_pages(url, real_post_data)

        if self.post_info[page_name].get('post_type') == 'ci_enter':
            self.ci_enter_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_enter_post_data
        elif self.post_info[page_name].get(
                'post_type') == 'ci_enter_with_recordline':
            self.ci_enter_with_record_line_post_data[
                'specificQuery'] = self.post_info[page_name].get(
                    'specificQuery')
            post_data = self.ci_enter_with_record_line_post_data
        elif self.post_info[page_name].get('post_type') == 'common_enter':
            self.common_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.common_enter_post_data
        elif self.post_info[page_name].get('post_type') == 'ci_detail':
            self.ci_detail_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_detail_post_data
        elif self.post_info[page_name].get('post_type') == 'nb_enter':
            self.nb_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.nb_enter_post_data
        return self.get_pages(url, post_data)

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息
        """
        if not self.parser.ind_comm_pub_skeleton_built:
            page = self.crawl_skeleton_page('ind_comm_pub_skeleton')
            if not page:
                logging.error('crawl ind comm pub skeleton failed!')
                return False
            self.parser.parse_page('ind_comm_pub_skeleton', page)

        for item in (
                'ind_comm_pub_reg_basic',  # 登记信息-基本信息
                'ind_comm_pub_reg_shareholder',  # 股东信息
                'ind_comm_pub_reg_modify',
                'ind_comm_pub_arch_key_persons',  # 备案信息-主要人员信息
                'ind_comm_pub_arch_branch',  # 备案信息-分支机构信息
                #'ind_comm_pub_arch_liquidation', # 备案信息-清算信息, 网页中没有
                'ind_comm_pub_movable_property_reg',  # 动产抵押登记信息
                #'ind_comm_pub_equity_ownership_reg', # 股权出置登记信息
                'ind_comm_pub_administration_sanction',  # 行政处罚信息
                #'ind_comm_pub_business_exception',  # 经营异常信息 , 网页中不存在
                #'ind_comm_pub_serious_violate_law',  # 严重违法信息
                'ind_comm_pub_spot_check'):  # 抽查检查信息

            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息
        """
        if not self.parser.ent_pub_skeleton_built:
            page = self.crawl_skeleton_page('ent_pub_skeleton')
            if not page:
                logging.error('crawl ent pub skeleton failed!')
                return False
            self.parser.parse_page('ent_pub_skeleton', page)

        if not self.parser.annual_report_skeleton_built:
            page = self.crawl_skeleton_page('annual_report_skeleton')
            if not page:
                logging.error('crawl annual report skeleton failed!')
                return False
            self.parser.parse_page('annual_report_skeleton', page)

        for item in (
                'ent_pub_annual_report',
                #'ent_pub_shareholder_capital_contribution', #企业投资人出资比例
                #'ent_pub_equity_change', #股权变更信息
                'ent_pub_administrative_license',  #行政许可信息
                'ent_pub_knowledge_property',  #知识产权出资登记
                #'ent_pub_administration_sanction' #行政许可信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息
        """
        if not self.parser.other_dept_pub_skeleton_built:
            page = self.crawl_skeleton_page('other_dept_pub_skeleton')
            if not page:
                logging.error('crawl other dept pub skeleton failed!')
                return False
            self.parser.parse_page('other_dept_pub_skeleton', page)

        for item in (
                'other_dept_pub_administration_license',  #行政许可信息
                'other_dept_pub_administration_sanction'  #行政处罚信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息
        """
        if not self.parser.judical_assist_pub_skeleton_built:
            page = self.crawl_skeleton_page('judical_assist_pub_skeleton')
            if not page:
                logging.error('crawl judical assist skeleton failed!')
                return False
            self.parser.parse_page('judical_assist_pub_skeleton', page)

        for item in (
                'judical_assist_pub_equity_freeze',  #股权冻结信息
                'judical_assist_pub_shareholder_modify'  #股东变更信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def get_pages(self, url, post_data):
        """获取网页数据
        Args:
            url: url地址
            post_data: post方式获取数据,返回的如果是一个列表,则将列表的所有元素都获得才返回
        Returns:
        """
        resp = self.crawl_page_by_url_post(url, data=post_data)
        if not resp:
            logging.error('get all pages of a section failed!')
            return
        else:
            json_obj = json.loads(resp)
            if isinstance(json_obj, dict) and json_obj.get(
                    'total') and int(json_obj.get('total')) > 5:
                post_data['pageSize'] = json_obj.get('total')
                resp = self.crawl_page_by_url_post(url, data=post_data)
                if not resp:
                    logging.error('get all pages of a section failed!')
                    return
        return resp
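
    def _demo_fetch_section(self, url):
        """Illustrative sketch only, not part of the original crawler: shows how
        get_pages is typically driven once parse_post_check_page has filled in
        the corp identifiers. The propertiesName value is a placeholder and the
        url is whatever section entry point the caller wants to page through.
        """
        post_data = dict(self.common_enter_post_data)
        post_data['propertiesName'] = 'query_stockInfo'
        resp = self.get_pages(url, post_data)
        return json.loads(resp) if resp else None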

    def crawl_skeleton_page(self, name):
        """爬取网页表格的框架页面,在江苏的网页中, 工商公示信息, 企业公示信息,其他部门公示信息,司法协助信息
        所有的tab页面中的表格结构都在一个最开始的页面中给出
        """
        url = self.urls[name]
        post_data = {
            'org': self.corp_org,
            'id': self.corp_id,
            'seq_id': self.corp_seq_id,
            'reg_no': self.ent_number,
            'name': self.ent_number,
            'containContextPath': 'ecipplatform',
            'corp_name': self.ent_number
        }
        resp = self.crawl_page_by_url_post(url, data=post_data)
        if not resp:
            logging.error('crawl %s page failed.\n' % (name))
            return False
        return resp

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,提取所需要的信息,比如corp id等
        Args:
            page: 提交验证码之后的页面
        """
        m = re.search(
            r'onclick=\\\"\w+\(\'([\w\./]+)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\'\)',
            page)
        if m:
            self.corp_org = m.group(2)
            self.corp_id = m.group(3)
            self.corp_seq_id = m.group(4)
            self.common_enter_post_data = {
                'showRecordLine': '1',
                'specificQuery': 'commonQuery',
                'propertiesName': '',
                'corp_org': self.corp_org,
                'corp_id': self.corp_id,
                'pageNo': '1',
                'pageSize': '5'
            }
            self.ci_enter_post_data = {
                'org': self.corp_org,
                'id': self.corp_id,
                'seq_id': self.corp_seq_id,
                'specificQuery': ''
            }
            self.ci_enter_with_record_line_post_data = {
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'CORP_SEQ_ID': self.corp_seq_id,
                'specificQuery': '',
                'pageNo': '1',
                'pageSize': '5',
                'showRecordLine': '1'
            }
            self.ci_detail_post_data = {
                'ORG': self.corp_org,
                'ID': '',
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'SEQ_ID': '',
                'REG_NO': self.ent_number,
                'specificQuery': ''
            }
            self.nb_enter_post_data = {
                'ID': '',
                'REG_NO': self.ent_number,
                'showRecordLine': '0',
                'specificQuery': 'gs_pb',
                'propertiesName': '',
                'pageNo': '1',
                'pageSize': '5',
                'ADMIT_MAIN': '08'
            }
            return True
        return False
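
    # For reference (hypothetical values, not from the original source): the
    # regex in parse_post_check_page targets escaped markup of this shape, where
    # groups 2-4 become corp_org, corp_id and corp_seq_id:
    #   onclick=\"openEntInfo('search/ent_info','320000','1234567','7654321','','','')\"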

    def crack_checkcode(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.crawl_page_by_url(self.urls['get_checkcode'])
        if not resp:
            logging.error('Failed, exception occurred when getting checkcode')
            return ('', '')
        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        ckcode = ('', '')
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.error('exception occurred when cracking checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()
        return ckcode

    def crawl_page_by_url(self, url):
        """根据url直接爬取页面
        """
        try:
            resp = self.reqst.get(url, proxies=self.proxies)
            if resp.status_code != 200:
                logging.error('crawl page by url failed! url = %s' % url)
            page = resp.content
            time.sleep(random.uniform(0.2, 1))
            # if saveingtml:
            #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
            return page
        except Exception as e:
            logging.error("crawl page by url exception %s" % (type(e)))

        return None

    def crawl_page_by_url_post(self, url, data):
        """ 根据url和post数据爬取页面
        """
        r = self.reqst.post(url, data, proxies=self.proxies)
        time.sleep(random.uniform(0.2, 1))
        if r.status_code != 200:
            logging.error(
                u"Getting page by url with post:%s\n, return status %s\n" %
                (url, r.status_code))
            return False
        return r.content

    def get_annual_report_detail(self, report_year, report_id):
        """获取企业年报的详细信息
        """
        annual_report_detail = {}
        post_data = self.nb_enter_post_data
        post_data['ID'] = report_id
        post_data['showRecordLine'] = '0'
        post_data['OPERATE_TYPE'] = '2'
        post_data['propertiesName'] = 'query_basicInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'企业基本信息'] = self.parser.parse_page(
            'annual_report_ent_basic_info', page_data)
        annual_report_detail[u'企业资产状况信息'] = self.parser.parse_page(
            'annual_report_ent_property_info', page_data)

        post_data['showRecordLine'] = '1'
        post_data['propertiesName'] = 'query_websiteInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'网站或网店信息'] = self.parser.parse_page(
            'annual_report_web_info', page_data)

        post_data['propertiesName'] = 'query_investInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外投资信息'] = self.parser.parse_page(
            'annual_report_investment_abord_info', page_data)

        post_data['MAIN_ID'] = report_id
        post_data['OPERATE_TYPE'] = '1'
        post_data['TYPE'] = 'NZGS'
        post_data['ADMIT_MAIN'] = '08'
        post_data['propertiesName'] = 'query_stockInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'股东及出资信息'] = self.parser.parse_page(
            'annual_report_shareholder_info', page_data)

        post_data['propertiesName'] = 'query_InformationSecurity'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外提供保证担保信息'] = self.parser.parse_page(
            'annual_report_external_guarantee_info', page_data)

        post_data['propertiesName'] = 'query_RevisionRecord'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'修改记录'] = self.parser.parse_page(
            'annual_report_modify_record', page_data)
        return annual_report_detail
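
    def _demo_annual_report_basic_info(self, report_id):
        """Illustrative sketch only, not part of the original crawler: fetch one
        annual report detail and return its basic-information table. The
        report_id would normally come from the parsed annual report list; the
        report year passed here is a placeholder.
        """
        detail = self.get_annual_report_detail('2014', report_id)
        return detail.get(u'企业基本信息')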
Пример #26
0
    def __init__(self, json_restore_path):
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg'
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        self.result_json_dict = {}
        self.code_cracker = CaptchaRecognition('yunnan')
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })

        useproxy = UseProxy()
        is_use_proxy = useproxy.get_province_is_use_proxy(province='yunnan')
        if not is_use_proxy:
            self.proxies = []
        else:
            proxy = Proxy()
            self.proxies = {
                'http':
                'http://' +
                random.choice(proxy.get_proxy(num=5, province='yunnan')),
                'https':
                'https://' +
                random.choice(proxy.get_proxy(num=5, province='yunnan'))
            }
        print 'self.proxies:', self.proxies
        # self.proxies = []

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.ynaic.gov.cn/notice/',
            'searchList':
            'http://gsxt.ynaic.gov.cn/notice/search/ent_info_list',
            'validateCode':
            'http://gsxt.ynaic.gov.cn/notice/captcha?preset=&ra=0.06570781518790503'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'合伙人信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}
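
        # Illustrative note, not in the original source: the four dicts above map
        # the Chinese tab titles scraped from the notice pages to internal json
        # keys, e.g. self.one_dict.get(u'股东信息') -> 'ind_comm_pub_reg_shareholder'.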
Пример #27
0
class HeilongjiangClawer(Crawler):
    """黑龙江工商公示信息网页爬虫 """

    code_cracker = CaptchaRecognition('heilongjiang')
    # lock protecting writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.hljaic.gov.cn',
        'get_checkcode':
        'http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0',
        'post_checkcode':
        'http://gsxt.hljaic.gov.cn/checkCheckNo.jspx',
        'get_info_entry':
        'http://gsxt.hljaic.gov.cn/searchList.jspx',
        'ind_comm_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=',
        'ent_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/otherDepartment.jspx?id=',
        'judical_assist_skeleton':
        'http://gsxt.hljaic.gov.cn/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder':
        'http://gsxt.hljaic.gov.cn/QueryInvList.jspx?',  # shareholder info
        'ind_comm_pub_reg_modify':
        'http://gsxt.hljaic.gov.cn/QueryAltList.jspx?',  # change info paging
        'ind_comm_pub_arch_key_persons':
        'http://gsxt.hljaic.gov.cn/QueryMemList.jspx?',  # key personnel paging
        'ind_comm_pub_spot_check':
        'http://gsxt.hljaic.gov.cn/QuerySpotCheckList.jspx?',  # spot check paging
        'ind_comm_pub_movable_property_reg':
        'http://gsxt.hljaic.gov.cn/QueryMortList.jspx?',  # chattel mortgage registration paging
        'ind_comm_pub_business_exception':
        'http://gsxt.hljaic.gov.cn/QueryExcList.jspx?',  # abnormal operation info
        'ent_pub_administration_license':
        'http://gsxt.hljaic.gov.cn/QueryLicenseRegList.jspx?',  # administrative license info
        'shareholder_detail':
        'http://gsxt.hljaic.gov.cn/queryInvDetailAction.jspx?id=',  # investor detail
        'movable_property_reg_detail':
        'http://gsxt.hljaic.gov.cn/mortInfoDetail.jspx?id=',  # chattel mortgage detail
        'annual_report':
        'http://gsxt.hljaic.gov.cn/QueryYearExamineDetail.jspx?id=',  # annual report detail
    }

    def __init__(self, json_restore_path=None):
        # Crawler.__init__(self)
        super(HeilongjiangClawer, self).__init__()

        self.json_restore_path = json_restore_path
        # storage path for html data
        self.html_restore_path = self.json_restore_path + '/heilongjiang/'

        # storage path for the captcha image
        self.ckcode_image_path = self.json_restore_path + '/heilongjiang/ckcode.jpg'

        self.parser = HeilongjiangParser(self)

        self.proxies = get_proxy('heilongjiang')

        self.timeout = (30, 20)

    def run(self, _ent):
        """爬取的主函数
        """
        if self.proxies:
            print self.proxies
            self.reqst.proxies = self.proxies
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        return Crawler.run(self, _ent)

    def request_by_method(self, method, url, *args, **kwargs):
        r = None
        try:
            r = self.requests.request(method, url, *args, **kwargs)
        except requests.exceptions.Timeout as err:
            logging.error(u'Getting url: %s timeout. %s .' %
                          (url, err.message))
            return False
        except requests.exceptions.ConnectionError:
            logging.error(u"Getting url:%s connection error ." % (url))
            return False
        except Exception as err:
            logging.error(u'Getting url: %s exception:%s . %s .' %
                          (url, type(err), err.message))
            return False
        if r.status_code == 307:
            logging.info(u"Got 307 redirect for url:%s, retrying." % (url))
            return self.request_by_method(method, url, *args, **kwargs)
        if r.status_code != 200:
            logging.error(
                u"Something wrong when getting url:%s , status_code=%d", url,
                r.status_code)
            return False
        return r.content

    def analyze_showInfo(self, page):
        """ 判断是否成功搜索页面
            分析 展示页面, 获得搜索到的企业列表
        """
        soup = BeautifulSoup(page, "html5lib")
        divs = soup.find_all("div", {"class": "list"})

        if divs:
            Ent = {}
            count = 0
            for div in divs:
                count += 1
                url = ""
                ent = ""
                link = div.find('li')
                if link and link.find('a') and link.find('a').has_attr('href'):
                    url = link.find('a')['href']
                else:
                    break
                profile = link.find_next_sibling()
                if profile and profile.span:
                    ent = profile.span.get_text().strip()
                name = link.find('a').get_text().strip()
                if name == self._ent:
                    Ent.clear()
                    Ent[ent] = url
                    break
                if count == 3:
                    break
                Ent[ent] = url
            self.ents = Ent
            return True
        else:
            return False
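
    # Example (hypothetical values, not from the original source) of what
    # analyze_showInfo leaves in self.ents: on an exact name match a single
    # entry is kept, otherwise up to three candidates, keyed by the text beside
    # the link and mapped to the detail url that carries the company id:
    #   {u'912301XXXXXXXXXXXX': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=abcd1234'}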

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        count = 0
        while count < 10:
            count += 1
            ck_code = self.crack_checkcode()

            data = {'checkNo': ck_code}
            resp = self.reqst.post(self.urls['post_checkcode'],
                                   data=data,
                                   timeout=self.timeout)

            if resp.status_code != 200:
                logging.error("crawl post check page failed! count=%d ." %
                              (count))
                continue
            if resp.content[10] == 't':
                data = {'checkNo': ck_code, 'entName': self._ent}
                resp = self.reqst.post(self.urls['get_info_entry'],
                                       data=data,
                                       timeout=self.timeout)
                if self.analyze_showInfo(resp.text):
                    return True
                else:
                    logging.error(
                        "crawl post check page failed! count = %d ." % (count))
            else:
                logging.error("crawl post check page failed! count = %d ." %
                              (count))
            time.sleep(random.uniform(1, 3))
            print "crawl post check page failed! count = %d ." % (count)
        return False

    def crack_checkcode(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(self.urls['get_checkcode'], timeout=self.timeout)
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None

        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when cracking checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()

        return ckcode[1]

    @exe_time
    def crawl_ind_comm_pub_pages(self, *args, **kwargs):
        """爬取工商公示信息
        """
        if not len(args): return
        url = args[0]
        company_id = url.split('?')[1]
        self.company_id = company_id.split("=")[1]
        url = "%s%s" % (self.urls['ind_comm_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url, timeout=self.timeout)
        if resp.status_code != 200:
            logging.error('failed to get ind_comm_pub_skeleton')
        self.parser.parse_ind_comm_pub_pages(resp.content)

    @exe_time
    def crawl_ent_pub_pages(self, *args, **kwargs):
        """爬取企业公示信息
        """
        if not len(args): return
        url = args[0]
        company_id = url.split('?')[1]
        self.company_id = company_id.split("=")[1]
        url = "%s%s" % (self.urls['ent_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url, timeout=self.timeout)
        if resp.status_code != 200:
            logging.error('failed to get ent_pub_skeleton')
        self.parser.parse_ent_pub_pages(resp.content)

    @exe_time
    def crawl_other_dept_pub_pages(self, *args, **kwargs):
        """爬取其他部门公示信息
        """
        if not len(args): return
        url = args[0]
        company_id = url.split('?')[1]
        self.company_id = company_id.split("=")[1]
        url = "%s%s" % (self.urls['other_dept_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url, timeout=self.timeout)
        if resp.status_code != 200:
            logging.error('failed to get other_dept_pub_skeleton')
        self.parser.parse_other_dept_pub_pages(resp.content)

    @exe_time
    def crawl_judical_assist_pub_pages(self, *args, **kwargs):
        """爬取司法协助信息
        """
        if not len(args): return
        url = args[0]
        company_id = url.split('?')[1]
        self.company_id = company_id.split("=")[1]
        url = "%s%s" % (self.urls['judical_assist_skeleton'], self.company_id)
        resp = self.reqst.get(url, timeout=self.timeout)
        if resp.status_code != 200:
            logging.error('failed to get judical_assist_skeleton')
        self.parser.parse_judical_assist_pub_pages(resp.content)
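
# Minimal usage sketch, not part of the original example. It assumes the base
# Crawler.run(_ent) takes the enterprise name (or registration number) to search
# for, and that the json restore path is a writable directory; the name below is
# a placeholder.
if __name__ == '__main__':
    crawler = HeilongjiangClawer(json_restore_path='./data')
    crawler.run(u'黑龙江测试企业有限公司')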