Example #1
    def __init__(self, json_restore_path=None):
        headers = {  # 'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            "User-Agent": get_user_agent()
        }
        self.CR = CaptchaRecognition("hebei")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100)
        self.requests.mount('http://', adapter)

        self.ents = {}
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.csrf = ""
        # Path where the captcha image is stored
        self.path_captcha = self.json_restore_path + '/hebei/ckcode.jpeg'
        # Path where the HTML data is stored
        self.html_restore_path = self.json_restore_path + '/hebei/'

        self.proxies = get_proxy('hebei')

        self.timeout = (30, 20)
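Example #1 keeps the mapping returned by get_proxy on the instance; Example #11 below instead installs the proxies directly on the session. A one-line sketch of that alternative for the session built above (only if get_proxy returned a usable mapping; this is an assumption, not part of the original snippet):

        if self.proxies:
            self.requests.proxies = self.proxies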
Example #2
 def __init__(self, json_restore_path):
     self.CR = CaptchaRecognition("hebei")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.json_restore_path = json_restore_path
     self.csrf = ""
     # Path where the captcha image is stored
     self.path_captcha = settings.json_restore_path + '/hebei/ckcode.jpeg'
     # Path where the HTML data is stored
     self.html_restore_path = settings.json_restore_path + '/hebei/'
Example #3
 def __init__(self, json_restore_path):
     self.CR = CaptchaRecognition("guangdong")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.main_host = ""
     self.json_dict = {}
     self.json_restore_path = json_restore_path
     self.html_restore_path = settings.json_restore_path + '/hainan/'
     # Path where the captcha image is stored
     self.path_captcha = settings.json_restore_path + '/hainan/ckcode.png'
Example #4
 def __init__(self, json_restore_path=None):
     self.html_search = None
     self.html_showInfo = None
     self.Captcha = None
     self.CR = CaptchaRecognition("guangdong")
     self.requests = requests.Session()
     self.requests.headers.update(headers)
     self.ents = []
     self.main_host = ""
     self.json_dict = {}
     self.json_restore_path = json_restore_path
     self.dir_restore_path = settings.json_restore_path + '/guangdong/'
     #self.json_restore_path = settings.json_restore_path + '/guangdong.json'
     # Path where the captcha image is stored
     self.path_captcha = settings.json_restore_path + '/guangdong/ckcode.jpg'
Example #5
class FujianCrawler(ZongjuCrawler):
    """福建爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/fujian/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/fujian/ckcode.jpg'
    code_cracker = CaptchaRecognition('fujian')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host': 'http://www.fjaic.gov.cn/',
        'official_site': 'http://wsgs.fjaic.gov.cn/creditpub/home',
        'get_checkcode':
        'http://wsgs.fjaic.gov.cn/creditpub/captcha?preset=math-01',
        'post_checkcode':
        'http://wsgs.fjaic.gov.cn/creditpub/security/verify_captcha',
        'get_info_entry':
        'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list',
        'open_info_entry':
        'http://wsgs.fjaic.gov.cn/creditpub/notice/view?',  # URL of the enterprise info page; tab=1-4 selects the section (industrial & commercial publicity, enterprise publicity, ...)
        'open_detail_info_entry': '',
    }

    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = FujianParser(self)
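Examples #6 and #10 below follow the same pattern: the subclass only overrides the class-level storage paths, the captcha recognizer, and the URL table, and leaves the crawl logic to its base class. A minimal usage sketch under that assumption (the restore path and registration number are placeholders, and the run(ent_number) entry point is assumed to exist on the base class, mirroring Example #15):

crawler = FujianCrawler('./enterprise_crawler')  # placeholder restore path
raw_json = crawler.run('350100000000000')        # placeholder registration number; run() is assumed from the base class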
Example #6
class HunanCrawler(ZongjuCrawler):
    """湖南爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/hunan/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/hunan/ckcode.jpg'
    code_cracker = CaptchaRecognition('hunan')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {'host': 'http://www.hnaic.net.cn/visit/category/a/hnaicalllist',
            'official_site': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',
            'get_checkcode': 'http://gsxt.hnaic.gov.cn/notice/captcha?preset=',
            'post_checkcode': 'http://gsxt.hnaic.gov.cn/notice/search/popup_captcha',

            'get_info_entry': 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list',  # enterprise info entry point
            'open_info_entry': 'http://gsxt.hnaic.gov.cn/notice/notice/view?',
            # URL of the enterprise info page; tab=1-4 selects the section (industrial & commercial publicity, enterprise publicity, ...)
            'open_detail_info_entry': '',
            }
    def __init__(self, json_restore_path):
        ZongjuCrawler.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HunanParser(self)
Example #7
 def __init__(self, *args, **kwargs):
     self.ckcode_image_path = settings.json_restore_path + '/anhui/ckcode.jpg'
     self.code_cracker = CaptchaRecognition('qinghai')
     if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
         os.makedirs(os.path.dirname(self.ckcode_image_path))
     self.urls = {
         'eareName': 'http://www.ahcredit.gov.cn',
         'search': 'http://www.ahcredit.gov.cn/search.jspx',
         'checkCheckNo': 'http://www.ahcredit.gov.cn/checkCheckNo.jspx',
         'searchList': 'http://www.ahcredit.gov.cn/searchList.jspx',
         'validateCode':
         'http://www.ahcredit.gov.cn/validateCode.jspx?type=0&id=0.22788021906613765',
         'QueryInvList': 'http://www.ahcredit.gov.cn/QueryInvList.jspx?',
         'queryInvDetailAction':
         'http://www.ahcredit.gov.cn/queryInvDetailAction.jspx?',
         'businessPublicity':
         'http://www.ahcredit.gov.cn/businessPublicity.jspx?',
         'enterprisePublicity':
         'http://www.ahcredit.gov.cn/enterprisePublicity.jspx?',
         'otherDepartment':
         'http://www.ahcredit.gov.cn/otherDepartment.jspx?',
         'justiceAssistance':
         'http://www.ahcredit.gov.cn/justiceAssistance.jspx?'
     }
     self.timeout = 30
     self.result_json = {}
     self.result_json_list = []
     pass
Example #8
    def __init__(self, json_restore_path):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        # json 数据集
        # POST

        self.json_restore_path = json_restore_path
        if os.path.exists(self.json_restore_path) is False:
            os.makedirs(self.json_restore_path, 0775)
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        # Path where the HTML data is stored ("chongqing" must not start with a slash,
        # otherwise os.path.join discards json_restore_path)
        self.html_restore_path = os.path.join(self.json_restore_path, "chongqing")
        if os.path.exists(self.html_restore_path) is False:
            os.makedirs(self.html_restore_path, 0775)
        # Path where the captcha image is stored
        self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.ent_number = None
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.parser = ChongqingParser(self)
        self.reqst = requests.Session()
        self.reqst.headers.update({
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'})
Example #9
 def setUp(self):
     unittest.TestCase.setUp(self)
     from CaptchaRecognition import CaptchaRecognition
     self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
     self.parser = self.crawler.parser
     ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
     self.crawler.json_dict = {}
     self.crawler.ent_number = '500232000003942'
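A test method that could follow this setUp (a sketch only: the method name and assertion are assumptions, json is assumed to be imported in the test module, and run() returning a JSON string mirrors Example #15 rather than anything shown for ChongqingClawer itself):

 def test_run_returns_json(self):
     raw = self.crawler.run(self.crawler.ent_number)  # assumed entry point, see Example #15
     self.assertIsInstance(json.loads(raw), list)     # run() is expected to return a JSON string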
Example #10
class HubeiCrawler(HeilongjiangClawer):
    """湖北爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/hubei/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/hubei/ckcode.jpg'
    code_cracker = CaptchaRecognition('hubei')

    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.hljaic.gov.cn',
        'get_checkcode':
        'http://xyjg.egs.gov.cn/ECPS_HB/validateCode.jspx?type=0',
        'post_checkcode':
        'http://xyjg.egs.gov.cn/ECPS_HB/checkCheckNo.jspx',
        'get_info_entry':
        'http://xyjg.egs.gov.cn/ECPS_HB/searchList.jspx',
        'ind_comm_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/businessPublicity.jspx?id=',
        'ent_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/otherDepartment.jspx?id=',
        'judical_assist_skeleton':
        'http://xyjg.egs.gov.cn/ECPS_HB/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryInvList.jspx?',  # shareholder information
        'ind_comm_pub_reg_modify':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryAltList.jspx?',  # change records (paginated)
        'ind_comm_pub_arch_key_persons':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryMemList.jspx?',  # key personnel (paginated)
        'ind_comm_pub_spot_check':
        'http://xyjg.egs.gov.cn/ECPS_HB/QuerySpotCheckList.jspx?',  # spot-check records (paginated)
        'ind_comm_pub_movable_property_reg':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryMortList.jspx?',  # chattel mortgage registrations (paginated)
        'ind_comm_pub_business_exception':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryExcList.jspx?',  # abnormal operation records
        'ind_comm_pub_equity_ownership_reg':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryPledgeList.jspx?',  # equity pledge registrations (paginated)
        'ind_comm_pub_arch_branch':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryChildList.jspx?',  # branch information
        'shareholder_detail':
        'http://xyjg.egs.gov.cn/ECPS_HB/queryInvDetailAction.jspx?id=',  # investor details
        'movable_property_reg_detail':
        'http://xyjg.egs.gov.cn/ECPS_HB/mortInfoDetail.jspx?id=',  # chattel mortgage details
        'annual_report':
        'http://xyjg.egs.gov.cn/ECPS_HB/QueryYearExamineDetail.jspx?id=',  # enterprise annual report details
    }

    def __init__(self, json_restore_path):
        HeilongjiangClawer.__init__(self, json_restore_path)
        self.json_restore_path = json_restore_path
        self.parser = HubeiParser(self)
Example #11
    def __init__(self, json_restore_path=None):
        self.html_showInfo = None
        self.Captcha = None
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                                pool_maxsize=100)
        self.requests.mount('http://', adapter)

        self.ents = {}
        self.json_restore_path = json_restore_path
        self.dir_restore_path = self.json_restore_path + '/neimenggu/'
        # Path where the captcha image is stored
        self.path_captcha = self.json_restore_path + '/neimenggu/ckcode.jpg'
        self.timeout = (30, 20)
        proxies = get_proxy('neimenggu')
        if proxies:
            print proxies
            self.requests.proxies = proxies
Example #12
    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        super(ChongqingCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        # Path where the HTML data is stored
        self.html_restore_path = os.path.join(self.json_restore_path,
                                              "chongqing")

        # Path where the captcha image is stored
        self.ckcode_image_path = os.path.join(self.html_restore_path,
                                              'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        self.ent_number = None
        self.ents = {}
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None
Example #13
    def __init__(self, *args, **kwargs):
        """江苏工商公示信息网页爬虫初始化函数
		Args:
			json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
			 需要在写入文件的时候加锁
		Returns:
		"""
        self.ent_number = None
        # Path where the HTML data is stored
        self.html_restore_path = settings.json_restore_path + '/jiangsu/'
        # Path where the captcha image is stored
        self.ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg'
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        self.code_cracker = CaptchaRecognition('jiangsu')
        # Lock protecting writes to the final JSON file when crawling with multiple threads
        self.write_file_mutex = threading.Lock()

        self.urls = {
            'host': 'www.jsgsj.gov.cn',
            'official_site': 'http://www.jsgsj.gov.cn:58888/province/',
            'get_checkcode':
            'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7',
            'post_checkcode':
            'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true',
            'ind_comm_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp',
            'ent_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp',
            'other_dept_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp',
            'judical_assist_pub_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp',
            'annual_report_skeleton':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp',
            'ci_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true',
            'common_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true',
            'nb_enter':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true',
            'ci_detail':
            'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true'
        }
        self.result_json = {}
        self.result_json_list = []
Example #14
    def __init__(self, *args, **kwargs):
        # Path where the captcha image is stored
        self.ckcode_image_path = settings.json_restore_path + '/zongju/ckcode.jpg'

        self.code_cracker = CaptchaRecognition('zongju')
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        # Lock protecting writes to the final JSON file when crawling with multiple threads
        self.write_file_mutex = threading.Lock()
        self.timeout = 40
        self.urls = {
            'host': 'http://qyxy.baic.gov.cn',
            'official_site': 'http://gsxt.saic.gov.cn/zjgs/',
            'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=',
            'post_checkcode':
            'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',
            # 'get_info_entry': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',  # enterprise info entry point
            'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?',
            # URL of the enterprise info page; tab=1-4 selects the section (industrial & commercial publicity, enterprise publicity, ...)
            'open_detail_info_entry': ''
        }
Example #15
class ChongqingCrawler(Crawler):
    """重庆工商公示信息网页爬虫,集成Crawler基类 """

    urls = {
        'host': 'http://gsxt.cqgs.gov.cn',
        'get_checkcode':
        'http://gsxt.cqgs.gov.cn/sc.action?width=130&height=40',
        'repost_checkcode': 'http://gsxt.cqgs.gov.cn/search_research.action',
        # fetch the search page
        'post_checkcode': 'http://gsxt.cqgs.gov.cn/search.action',
        # fetch the data of a given company from the search page
        'search_ent': 'http://gsxt.cqgs.gov.cn/search_getEnt.action',
        # annual reports
        'year_report': 'http://gsxt.cqgs.gov.cn/search_getYearReport.action',
        # annual report details
        'year_report_detail':
        'http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action',
        # equity changes
        'year_daily_transinfo':
        'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # shareholder capital contribution information
        'year_daily_invsub': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # administrative penalties
        'year_daily_peninfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # administrative licenses
        'year_daily_licinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # intellectual property pledge registrations
        'year_daily_pleinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # administrative licenses from other departments
        'other_qlicinfo':
        'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # administrative penalties from other departments
        'other_qpeninfo':
        'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # equity freeze information
        'sfxz_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZ.action',
        # shareholder change information
        'sfxzgdbg_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZGDBG.action',
    }
    # Lock protecting writes to the final JSON file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        super(ChongqingCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        # Path where the HTML data is stored
        self.html_restore_path = os.path.join(self.json_restore_path,
                                              "chongqing")

        # Path where the captcha image is stored
        self.ckcode_image_path = os.path.join(self.html_restore_path,
                                              'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        self.ent_number = None
        self.ents = {}
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None

    def run(self, ent_number):
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        self.ent_number = str(ent_number)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        self.crawl_check_page()
        if not self.ents:
            return json.dumps([{self.ent_number: None}])
        data = self.crawl_main_page()
        return json.dumps(data)

    def analyze_showInfo(self, page):
        # Parse the search result page and extract the POST values for the business information pages
        soup = BeautifulSoup(page, "html5lib")
        result = soup.find('div', {'id': 'result'})
        if result is None:
            return None
        items = result.find_all('div', {'class': 'item'})
        if items:
            count = 0
            Ent = {}
            for item in items:
                count += 1
                key_map = {}
                link = item.find('a')
                entId = link.get('data-entid')
                types = link.get('data-type')
                ids = link.get('data-id')
                name = link.get_text().strip()
                key_map['entId'] = entId
                key_map['type'] = types
                key_map['id'] = ids
                key_map['name'] = name
                profile = item.find('span', attrs={
                    'class': 'value'
                }).get_text().strip()
                if name == self.ent_number:
                    Ent.clear()
                    Ent[profile] = key_map
                    break
                if key_map is not None:
                    Ent[profile] = key_map
                if count == 3:
                    break
            self.ents = Ent
            return True
        return False

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        count = 0
        while count < 30:
            count += 1
            ck_code = self.crack_check_code()
            data = {'key': self.ent_number, 'code': ck_code}
            resp = self.reqst.post(ChongqingCrawler.urls['post_checkcode'],
                                   data=data)
            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                continue
            if self.analyze_showInfo(resp.content):
                return True
            time.sleep(random.uniform(1, 3))
        return False

    def crack_check_code(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(ChongqingCrawler.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None
        time.sleep(random.uniform(0.1, 0.2))
        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)

        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when cracking the checkcode')
            ckcode = ('', '')
        self.write_file_mutex.release()
        return ckcode[1]

    def crawl_main_page(self):
        """获取所有界面的json数据"""
        sub_json_list = []
        for ent, data in self.ents.items():
            self.json_dict = {}
            try:
                if data is not None:
                    self.json_ent_info = None
                    self.json_sfxzgdbg = None
                    self.json_sfxz = None
                    self.json_other_qlicinfo = None
                    self.json_other_qpeninfo = None
                    self.json_year_report = None
                    self.json_year_report_detail = []
                    self.json_year_daily_transinfo = None
                    self.json_year_daily_invsub = None
                    self.json_year_daily_peninfo = None
                    self.json_year_daily_licinfo = None
                    self.json_year_daily_pleinfo = None

                    self.crawl_ent_info_json(data)
                    self.crawl_year_report_json(data)
                    self.crawl_year_report_detail_json(data)
                    time.sleep(0.1)
                    self.crawl_sfxzgdbg_json(data)
                    time.sleep(0.1)
                    self.crawl_sfxz_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_invsub_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_licinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_peninfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_transinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_year_daily_pleinfo_json(data)
                    time.sleep(0.1)
                    self.crawl_other_qpeninfo_json(data)
                    time.sleep(0.1)
                    self.crawl_other_qlicinfo_json(data)
                else:
                    continue
                self.parser.parse_jsons()
                self.parser.merge_jsons()
            except Exception as e:
                logging.error('%s .' % (traceback.format_exc(10)))
            sub_json_list.append({ent: self.json_dict})
        return sub_json_list

    def crawl_ent_info_json(self, data, type=1):
        """企业详细信息"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'type': type
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['search_ent'],
                                   params=params)
        if json_data.status_code == 200:
            json_data = json_data.content
            json_data = str(json_data)
            self.json_ent_info = json_data[6:]  # strip the first six characters so the rest is valid JSON
            if self.json_ent_info is None or 'base' not in self.json_ent_info:
                self.crawl_ent_info_json(data, type=10)  # some companies require type=10
                # print(self.json_ent_info)

    def crawl_year_report_json(self, data):
        """年报数据"""
        params = {'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_report'],
                                   params=params)
        while json_data.status_code != 200:
            json_data = self.reqst.get(ChongqingCrawler.urls['year_report'],
                                       params=params)
        json_data = json_data.content
        json_data = str(json_data)
        self.json_year_report = json_data[6:]  # strip the first six characters so the rest is valid JSON
        # print(self.json_year_report)

    def crawl_year_report_detail_json(self, data):
        """详细年报"""
        # TO DO 需要获得 year_report 中的年份信息
        while self.json_year_report is None:
            self.crawl_year_report_json(data)
        year_report = json.loads(self.json_year_report, encoding='utf-8')

        histories = year_report.get('history')
        for i in range(len(histories)):
            sub_json_dict = {}
            sub_json_dict.update(histories[i])
            year = histories[i].get('year')
            params = {'id': data.get('id'), 'type': 1, 'year': str(year)}
            json_data = self.reqst.get(
                ChongqingCrawler.urls['year_report_detail'], params=params)
            if json_data.status_code == 200:
                # the response of this page is already the JSON data
                sub_json_dict['detail'] = json.loads(str(json_data.content))
            self.json_year_report_detail.append(sub_json_dict)
        # print(self.json_year_report_detail)

    def crawl_year_daily_transinfo_json(self, data):
        """股权变更"""
        params = {'id': data.get('id'), 'jtype': 'transinfo'}
        json_data = self.reqst.get(
            ChongqingCrawler.urls['year_daily_transinfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_transinfo = json_data[6:]
            # print(self.json_year_daily_transinfo)

    def crawl_year_daily_pleinfo_json(self, data):
        """行政许可"""
        params = {'id': data.get('id'), 'jtype': 'pleinfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_pleinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_pleinfo = json_data[6:]
            # print(self.json_year_daily_pleinfo)

    def crawl_year_daily_invsub_json(self, data):
        """股东出资信息"""
        params = {'id': data.get('id'), 'jtype': 'invsub'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_invsub'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_invsub = json_data[6:]
            # print(self.json_year_daily_invsub)

    def crawl_year_daily_licinfo_json(self, data):
        """行政许可"""
        params = {'id': data.get('id'), 'jtype': 'licinfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_licinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_licinfo = json_data[6:]
            # print(self.json_year_daily_licinfo)

    def crawl_year_daily_peninfo_json(self, data):
        """行政处罚"""
        params = {'id': data.get('id'), 'jtype': 'peninfo'}
        json_data = self.reqst.get(ChongqingCrawler.urls['year_daily_peninfo'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_peninfo = json_data[6:]
            # print(self.json_year_daily_peninfo)

    def crawl_sfxzgdbg_json(self, data):
        """股东变更信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['sfxzgdbg_page'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxzgdbg = json_data[6:]
            # print(self.json_sfxzgdbg)

    def crawl_sfxz_json(self, data):
        """股权冻结信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingCrawler.urls['sfxz_page'],
                                   params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxz = json_data[6:]
            # print(self.json_sfxz)

    def crawl_other_qlicinfo_json(self, data):
        """股东出资信息"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'qtype': 'Qlicinfo',
            'type': 1
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['other_qlicinfo'],
                                   params=params)
        if json_data.status_code == 200:
            # the response of this page is already the JSON data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qlicinfo = json_data[6:]
            # print(self.json_other_qlicinfo)

    def crawl_other_qpeninfo_json(self, data):
        """股东出资信息"""
        params = {
            'entId': data.get('entId'),
            'id': data.get('id'),
            'qtype': 'Qpeninfo',
            'type': 1
        }
        json_data = self.reqst.get(ChongqingCrawler.urls['other_qpeninfo'],
                                   params=params)
        if json_data.status_code == 200:
            # the response of this page is already the JSON data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qpeninfo = json_data[6:]
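The class above shows the whole flow: crawl_check_page() downloads and cracks the captcha, analyze_showInfo() keeps up to three matching companies in self.ents, and crawl_main_page() pulls the per-company JSON endpoints and hands them to the parser. A minimal driver sketch (the restore path is a placeholder; the registration number is borrowed from Example #9):

import json

crawler = ChongqingCrawler(json_restore_path='./enterprise_crawler')
raw = crawler.run('500232000003942')   # run() returns a JSON string: a list of {name: data} dicts
for record in json.loads(raw):
    for name, detail in record.items():
        print name, '->', 'no data' if detail is None else 'ok'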
Example #16
    def __init__(self, json_restore_path=None):
        self.cur_time = str(int(time.time() * 1000))
        self.nbxh = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.html_restore_path = self.json_restore_path + '/guizhou/'
        self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guizhou')
        self.result_json_dict = {}
        self.reqst.headers.update({
            'Connection': "keep-alive",
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': get_user_agent()
        })

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.gzgs.gov.cn/',
            'searchList':
            'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
            'validateCode':
            'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}
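The one_dict through four_dict tables map the Chinese section titles returned by the site onto the canonical keys used in result_json_dict (industrial & commercial publicity, enterprise publicity, other departments, judicial assistance). A small illustrative lookup, assuming an already constructed crawler instance (the title string is just a sample value):

title = u'股东信息'                    # sample section title as returned by the site
head = crawler.one_dict.get(title)     # -> 'ind_comm_pub_reg_shareholder'
if head is not None:
    crawler.result_json_dict.setdefault(head, [])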
Example #17
class GuizhouCrawler(object):
    """ 贵州省爬虫, 单独爬取 """
    #html数据的存储路径
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.cur_time = str(int(time.time() * 1000))
        self.nbxh = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.html_restore_path = self.json_restore_path + '/guizhou/'
        self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guizhou')
        self.result_json_dict = {}
        self.reqst.headers.update({
            'Connection': "keep-alive",
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': get_user_agent()
        })

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.gzgs.gov.cn/',
            'searchList':
            'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
            'validateCode':
            'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}

    def get_check_num(self):
        # print self.mydict['search']
        resp = None
        search_count = 0
        while search_count < 5:
            try:
                resp = self.reqst.get(self.mydict['search'])
            except:
                search_count += 1
                continue
            if resp.status_code == 200:
                break
            else:
                search_count += 1
                continue
        if resp.status_code != 200:
            return None

        validate_count = 0
        while validate_count < 5:
            try:
                resp = self.reqst.get(self.mydict['validateCode'] +
                                      self.cur_time)
            except:
                validate_count += 1
                continue
            if resp.status_code == 200:
                break
            else:
                validate_count += 1
                continue
        if resp.status_code != 200:
            # print 'no validateCode'
            return None
        # print self.ckcode_image_path
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)

        ck_code = self.code_cracker.predict_result(self.ckcode_image_path)

        # return ck_code[1]
        if ck_code is not None:
            return ck_code[1]
        else:
            return None

    def send_post_for_enter(self, host, nbxh, c, t, lsh):
        count = 0
        while count < 10:
            data = {'nbxh': nbxh, 'c': c, 't': t, 'lsh': lsh}
            try:
                resp = self.reqst.post(host, data=data)
            except:
                count += 1
                continue
            if resp.status_code == 200:
                return resp.content
            else:
                count += 1
                continue

    def get_dict_enter(self, allths, alltds, alltds_keys):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            return []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    if item[key] is False or item[key] == '' or item[key] is None:
                        temp_alltds.append(item[key])
                    else:
                        temp_alltds.append(str(item[key]).strip())
            return self.get_one_to_one_dict(allths, temp_alltds)

    def help_get_dict_form_enter(self, lsh):
        needdict = {}
        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml', self.nbxh,
            '0', '14', lsh)
        # print result_dict
        value = self.get_dict_enter(
            allths=[
                u'注册号/统一社会信用代码', u'企业名称', u'企业联系电话', u'邮政编码', u'企业通信地址',
                u'企业电子邮箱', u'有限责任公司本年度是否发生股东股权转让', u'企业经营状态', u'是否有网站或网店',
                u'是否有投资信息或购买其他公司股权', u'从业人数'
            ],
            alltds=result_dict,
            alltds_keys=[
                u'zch', u'qymc', u'lxdh', u'yzbm', u'dz', u'dzyx', u'sfzr',
                u'jyzt', u'sfww', u'sfdw', u'cyrs'
            ],
        )
        needdict[u'企业基本信息'] = value[0]

        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml', self.nbxh,
            '0', '15', lsh)
        value = self.get_dict_enter(
            allths=[u'类型', u'名称', u'网址'],
            alltds=result_dict,
            alltds_keys=[],
        )
        needdict[u'网站或网店信息'] = value

        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml', self.nbxh,
            '0', '19', lsh)
        value = self.get_dict_enter(allths=[
            u'注册号/股东', u'认缴出资额(万元)', u'认缴出资时间', u'认缴出资方式', u'实缴出资额(万元)',
            u'出资时间', u'出资方式'
        ],
                                    alltds=result_dict,
                                    alltds_keys=[
                                        u'tzr', u'rjcze', u'rjczrq', u'rjczfs',
                                        u'sjcze', u'sjczrq', u'sjczfs'
                                    ])
        needdict[u'股东及出资信息'] = value

        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml', self.nbxh,
            '0', '16', lsh)
        value = self.get_dict_enter(allths=[
            u'资产总额', u'所有者权益合计', u'销售总额', u'利润总额', u'销售总额中主营业务收入', u'净利润',
            u'纳税总额', u'负债总额'
        ],
                                    alltds=result_dict,
                                    alltds_keys=[
                                        u'zcze', u'qyhj', u'xsze', u'lrze',
                                        u'zysr', u'lrze', u'nsze', u'fzze'
                                    ])
        needdict[u'企业资产状况信息'] = value[0]

        result_dict = self.send_post_for_enter(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchNbxx.shtml', self.nbxh,
            '0', '41', lsh)
        value = self.get_dict_enter(
            allths=[u'序号', u'修改事项', u'修改前', u'修改后', u'修改日期'],
            alltds=result_dict,
            alltds_keys=[u'rownum', u'bgsxmc', u'bgq', u'bgh', u'bgrq'])
        needdict[u'修改记录'] = value
        needdict[u'对外投资信息'] = []
        needdict[u'对外提供保证担保信息'] = []
        needdict[u'股权变更信息'] = []
        return needdict

    def get_id_num(self, findCode):
        count = 0
        while count < 20:
            count += 1
            # print self.cur_time
            yzm = self.get_check_num()
            data = {'q': findCode, 'validCode': yzm}

            resp = self.reqst.post(self.mydict['searchList'], data=data)
            if resp.status_code != 200:
                logging.error("status code of searchList page is not 200.")
                time.sleep(random.uniform(1, 3))
                continue
            result_dict = json.loads(resp.content)
            # print result_dict
            if result_dict['successed'] == True:
                try:
                    datas = result_dict.get('data')
                    if not datas:
                        return False
                    # keep only the first three results
                    self.ents = datas[0:3]
                    return True
                except:
                    return False
            else:
                logging.error('The count is %d.' % (count))
        return False

    def get_one_to_one_dict(self, allths, alltds):
        # if len(allths) == len(alltds):
        # 	one_to_one_dict = {}
        # 	for key, value in zip(allths, alltds):
        # 		one_to_one_dict[key] = value
        # 	return one_to_one_dict
        # else:
        templist = []
        x = 0
        y = x + len(allths)
        while y <= len(alltds):
            tempdict = {}
            for keys, values in zip(allths, alltds[x:y]):
                tempdict[keys] = values
            x = y
            y = x + len(allths)
            templist.append(tempdict)
        return templist

    def test_print_table(self, tables):
        for table in tables:
            print table

    def test_print_all_ths_tds(self, head, allths, alltds):
        print '--------------', head, '--------------'
        for th in allths:
            print th
        for td in alltds:
            print td

    def test_print_all_dict(self, mydict):
        for key, value in mydict.items():
            print key, ':', value

    def get_json_one(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
                if head == 'ind_comm_pub_reg_shareholder':
                    temp_alltds.append(None)
            if head == u'ind_comm_pub_reg_basic' or head == u'ind_comm_pub_arch_liquidation':
                self.result_json_dict[head] = self.get_one_to_one_dict(
                    allths, temp_alltds)[0]
            else:
                self.result_json_dict[head] = self.get_one_to_one_dict(
                    allths, temp_alltds)

        pass

    def get_json_two(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    if head == u'ent_pub_ent_annual_report' and key == 'lsh':
                        if item[key] is False or item[key] == '' or item[key] is None:
                            temp_alltds.append(None)
                        else:
                            temp_alltds.append(
                                self.help_get_dict_form_enter(item[key]))
                    elif head == u'ent_pub_administration_license' and key == 'lsh':
                        if item[key] is False or item[key] == '' or item[key] is None:
                            temp_alltds.append(None)
                        else:
                            temp_alltds.append([])
                    else:
                        temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)
        # if not alltds
        pass

    def get_json_three(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)
        pass

    def get_json_four(self, allths, alltds, alltds_keys, head):
        alltds = json.loads(alltds)
        # print alltds_keys
        if not alltds[u'data']:
            self.result_json_dict[head] = []
        else:
            temp_alltds = []
            for item in alltds[u'data']:
                for key in alltds_keys:
                    temp_alltds.append(item[key])
            self.result_json_dict[head] = self.get_one_to_one_dict(
                allths, temp_alltds)
        pass

    def send_post(self, host, nbxh, c, t):
        count = 0
        while count < 10:
            data = {'nbxh': nbxh, 'c': c, 't': t}
            try:
                resp = self.reqst.post(host, data=data)
            except:
                count += 1
                continue
            if resp.status_code == 200:
                return resp.content
            else:
                count += 1
                continue

    def run(self, findCode):
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        self.ent_number = str(findCode)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        result = self.get_id_num(self.ent_number)
        if not result:
            return json.dumps([{self.ent_number: None}])
        json_list = []
        for item in self.ents:
            self.nbxh = item.get('nbxh')
            zch = item.get('zch')
            self.result_json_dict = {}

            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '5')
            # print result_dict
            self.get_json_one(allths=[
                u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本', u'成立日期',
                u'住所', u'营业期限自', u'营业期限至', u'经营范围', u'登记机关', u'核准日期', u'登记状态'
            ],
                              alltds=result_dict,
                              alltds_keys=[
                                  u'zch', u'qymc', u'qylxmc', u'fddbr',
                                  u'zczb', u'clrq', u'zs', u'yyrq1', u'yyrq2',
                                  u'jyfw', u'djjgmc', u'hzrq', u'mclxmc'
                              ],
                              head='ind_comm_pub_reg_basic')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '3')
            # print result_dict
            self.get_json_one(
                allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                alltds=result_dict,
                alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                head='ind_comm_pub_reg_modify')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '2', '3')
            # print result_dict
            self.get_json_one(
                allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
                alltds=result_dict,
                alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
                head='ind_comm_pub_reg_shareholder')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '8')
            # print result_dict
            self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                              alltds=result_dict,
                              alltds_keys=[u'rownum', u'xm', u'zwmc'],
                              head='ind_comm_pub_arch_key_persons')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '36')
            # print result_dict
            self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_arch_liquidation')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '9')
            # print result_dict
            self.get_json_one(
                allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
                alltds=result_dict,
                alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
                head='ind_comm_pub_arch_branch')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '25')
            # print result_dict
            self.get_json_one(allths=[
                u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态', u'公示日期',
                u'详情'
            ],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_movable_property_reg')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '4')
            # print result_dict
            self.get_json_one(allths=[
                u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人',
                u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况'
            ],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_equity_ownership_reg')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '1')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_administration_sanction')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '33')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_business_exception')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '34')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_serious_violate_law')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '35')
            # print result_dict
            self.get_json_one(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ind_comm_pub_spot_check')

            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '13')
            # print result_dict
            self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                              alltds=result_dict,
                              alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                              head='ent_pub_ent_annual_report')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '40')
            # print result_dict
            self.get_json_two(allths=[
                u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)',
                u'认缴出资日期', u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)', u'实缴出资日期',
                u'实缴公示日期'
            ],
                              alltds=result_dict,
                              alltds_keys=[
                                  u'tzrmc', u'ljrje', u'ljsje', u'rjczfs',
                                  u'rjcze', u'rjczrq', u'rjgsrq', u'sjczfs',
                                  u'sjcze', u'sjczrq', u'sjgsrq'
                              ],
                              head='ent_pub_shareholder_capital_contribution')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '23')
            # print result_dict
            self.get_json_two(allths=[
                u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期', u'公示日期'
            ],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ent_pub_equity_change')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '20')
            # print result_dict
            self.get_json_two(allths=[
                u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关',
                u'许可内容', u'状态', u'公示日期', u'详情'
            ],
                              alltds=result_dict,
                              alltds_keys=[
                                  u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                                  u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq',
                                  u'lsh'
                              ],
                              head='ent_pub_administration_license')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '21')
            # print result_dict
            self.get_json_two(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ent_pub_knowledge_property')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '22')
            # print result_dict
            self.get_json_two(allths=[],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='ent_pub_shareholder_modify')

            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml',
                self.nbxh, '0', '37')
            # print result_dict
            self.get_json_three(allths=[
                u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期', u'许可机关',
                u'许可内容', u'状态', u'详情'
            ],
                                alltds=result_dict,
                                alltds_keys=[
                                    u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1',
                                    u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt',
                                    u'zt'
                                ],
                                head='other_dept_pub_administration_license')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml',
                self.nbxh, '0', '38')
            # print result_dict
            self.get_json_two(allths=[
                u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容', u'作出行政处罚决定机关名称',
                u'作出行政处罚决定日期'
            ],
                              alltds=result_dict,
                              alltds_keys=[],
                              head='other_dept_pub_administration_sanction')

            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '49')
            # print result_dict
            self.get_json_four(allths=[
                u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态', u'详情'
            ],
                               alltds=result_dict,
                               alltds_keys=[],
                               head='judical_assist_pub_equity_freeze')
            result_dict = self.send_post(
                'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
                self.nbxh, '0', '53')
            # print result_dict
            self.get_json_four(
                allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
                alltds=result_dict,
                alltds_keys=[],
                head='judical_assist_pub_shareholder_modify')
            json_list.append({zch: self.result_json_dict})
        return json.dumps(json_list)
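
The run method above repeats the same send_post / get_json_one pair for every registration-publicity section, varying only the type code, the expected column headers and the output key. A hypothetical sketch of the get_json_one part driven by a table is shown below; send_post, get_json_one and nbxh are assumed to behave exactly as in the example (the get_json_two/three/four sections would get analogous tables).

# Hypothetical refactoring sketch: one table row per section.
SECTIONS = [
    ('4', 'ind_comm_pub_equity_ownership_reg',
     [u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人',
      u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况']),
    ('1', 'ind_comm_pub_administration_sanction', []),
    ('33', 'ind_comm_pub_business_exception', []),
    ('34', 'ind_comm_pub_serious_violate_law', []),
    ('35', 'ind_comm_pub_spot_check', []),
]

def crawl_ind_comm_sections(crawler):
    # 'crawler' is assumed to expose send_post, get_json_one and nbxh with
    # the same meaning as in the example above.
    for type_code, head, allths in SECTIONS:
        result_dict = crawler.send_post(
            'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml',
            crawler.nbxh, '0', type_code)
        crawler.get_json_one(allths=allths,
                             alltds=result_dict,
                             alltds_keys=[],
                             head=head)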
Пример #18
0
    def __init__(self, json_restore_path=None):
        self.pripid = None
        self.cur_time = str(int(time.time() * 1000))
        self.reqst = requests.Session()
        self.reqst.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.reqst.mount('http://', adapter)
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg'
        #html数据的存储路径
        self.html_restore_path = self.json_restore_path + '/sichuan/'
        self.code_cracker = CaptchaRecognition('sichuan')
        self.result_json_dict = {}
        self.json_list = []

        proxies = get_proxy('shaanxi')
        if proxies:
            print proxies
            self.reqst.proxies = proxies
        self.timeout = (30, 20)
        self.ents = {}

        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
            'searchList':
            'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
            'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm'
        }

        self.one_dict = {u'基本信息': 'ind_comm_pub_reg_basic',
                         u'股东信息': 'ind_comm_pub_reg_shareholder',
                         u'发起人信息': 'ind_comm_pub_reg_shareholder',
                         u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
                         u'变更信息': 'ind_comm_pub_reg_modify',
                         u'主要人员信息': 'ind_comm_pub_arch_key_persons',
                         u'分支机构信息': 'ind_comm_pub_arch_branch',
                         u'清算信息': 'ind_comm_pub_arch_liquidation',
                         u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
                         u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'行政处罚信息': 'ind_comm_pub_administration_sanction',
                         u'经营异常信息': 'ind_comm_pub_business_exception',
                         u'严重违法信息': 'ind_comm_pub_serious_violate_law',
                         u'抽查检查信息': 'ind_comm_pub_spot_check'}

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {u'行政许可信息': 'other_dept_pub_administration_license',
                           u'行政处罚信息': 'other_dept_pub_administration_sanction'}
        self.four_dict = {u'股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'股东变更信息': 'judical_assist_pub_shareholder_modify',
                          u'司法股东变更登记信息':
                          'judical_assist_pub_shareholder_modify'}
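
The one_dict .. four_dict mappings translate the Chinese table headings scraped from the page into the normalized JSON keys used across the project; several headings, including the variant spelling 股权出置登记信息, intentionally share one key. A small hypothetical helper that looks a heading up across all four mappings:

def heading_to_key(heading, *mappings):
    # Hypothetical helper: return the normalized JSON key for a scraped
    # heading, searching the given mappings (one_dict..four_dict) in order.
    for mapping in mappings:
        if heading in mapping:
            return mapping[heading]
    return None  # unknown heading; the caller decides whether to skip or log

# usage sketch, with self being a crawler instance as defined above:
#   heading_to_key(u'股东信息', self.one_dict, self.two_dict,
#                  self.three_dict, self.four_dict)
#   -> 'ind_comm_pub_reg_shareholder'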
Пример #19
0
    def __init__(self, json_restore_path):
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg'
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        self.result_json_dict = {}
        self.code_cracker = CaptchaRecognition('yunnan')
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })

        useproxy = UseProxy()
        is_use_proxy = useproxy.get_province_is_use_proxy(province='guangxi')
        if not is_use_proxy:
            self.proxies = []
        else:
            proxy = Proxy()
            self.proxies = {
                'http':
                'http://' +
                random.choice(proxy.get_proxy(num=5, province='guangxi')),
                'https':
                'https://' +
                random.choice(proxy.get_proxy(num=5, province='guangxi'))
            }
        print 'self.proxies:', self.proxies
        # self.proxies = []

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.ynaic.gov.cn/notice/',
            'searchList':
            'http://gsxt.ynaic.gov.cn/notice/search/ent_info_list',
            'validateCode':
            'http://gsxt.ynaic.gov.cn/notice/captcha?preset=&ra=0.06570781518790503'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'合伙人信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}
Пример #20
0
class YunnanCrawler(object):
    ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg'

    def __init__(self, json_restore_path):
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = settings.json_restore_path + '/yunnan/ckcode.jpg'
        if not os.path.exists(os.path.dirname(self.ckcode_image_path)):
            os.makedirs(os.path.dirname(self.ckcode_image_path))
        self.result_json_dict = {}
        self.code_cracker = CaptchaRecognition('yunnan')
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })

        useproxy = UseProxy()
        is_use_proxy = useproxy.get_province_is_use_proxy(province='guangxi')
        if not is_use_proxy:
            self.proxies = []
        else:
            proxy = Proxy()
            self.proxies = {
                'http':
                'http://' +
                random.choice(proxy.get_proxy(num=5, province='guangxi')),
                'https':
                'https://' +
                random.choice(proxy.get_proxy(num=5, province='guangxi'))
            }
        print 'self.proxies:', self.proxies
        # self.proxies = []

        self.mydict = {
            'eareName':
            'http://www.ahcredit.gov.cn',
            'search':
            'http://gsxt.ynaic.gov.cn/notice/',
            'searchList':
            'http://gsxt.ynaic.gov.cn/notice/search/ent_info_list',
            'validateCode':
            'http://gsxt.ynaic.gov.cn/notice/captcha?preset=&ra=0.06570781518790503'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'合伙人信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}

    def get_check_num(self):
        resp = self.reqst.get(self.mydict['search'],
                              proxies=self.proxies,
                              timeout=120)
        if resp.status_code != 200:
            return None, None
        first = resp.content.find('session.token":')
        if first == -1:
            return None, None
        session_token = resp.content[first + 17:first + 53]

        resp = self.reqst.get(self.mydict['validateCode'],
                              proxies=self.proxies,
                              timeout=120)
        if resp.status_code != 200:
            # print 'no validateCode'
            return None, None
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)

        ck_code = self.code_cracker.predict_result(self.ckcode_image_path)
        if ck_code is None:
            return None, None
        else:
            return ck_code[1], session_token
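
get_check_num slices the session token out of the search page at a fixed offset after the literal session.token. A hedged alternative, assuming the token is embedded as a quoted 36-character value, would locate it with a regular expression instead of hard-coded offsets:

import re

def extract_session_token(page_content):
    # Hypothetical, more tolerant variant of the fixed-offset slice above;
    # assumes the page embeds something like  "session.token": "<36 chars>".
    m = re.search(r'session\.token"\s*:\s*"([^"]{36})"', page_content)
    return m.group(1) if m else None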

    def get_id_num(self, findCode):
        count = 0
        while count < 20:
            check_num, session_token = self.get_check_num()
            # print check_num
            if check_num is None:
                count += 1
                continue
            data = {
                'searchType': '1',
                'captcha': check_num,
                "session.token": session_token,
                'condition.keyword': findCode
            }
            resp = self.reqst.post(self.mydict['searchList'],
                                   data=data,
                                   proxies=self.proxies,
                                   timeout=120)
            if resp.status_code != 200:
                # print resp.status_code
                # print 'error...(get_id_num)'
                count += 1
                continue
            else:
                try:
                    soup = BeautifulSoup(resp.content, 'html.parser').find_all(
                        'div', attrs={'class': 'link'})[0]
                    hrefa = soup.find('a', attrs={'target': '_blank'})
                    if hrefa:
                        self.after_crack_checkcode_page = resp.content
                        return True
                        # return hrefa['href'].split('&')[0]
                    else:
                        count += 1
                        continue
                except:
                    return None

    def get_re_list_from_content(self, content):
        # Pull the shareholder/contribution fields out of the inline
        # JavaScript assignments embedded in the detail page; each regex
        # captures one quoted value.
        m = re.search(r'investor\.invName = \"(.+)\"', content)
        one = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invt\.subConAm = \"(.+)\"', content)
        five = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invt\.conDate = [\"|\'](.+)[\"|\']', content)
        six = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invt\.conForm = [\"|\'](.+)[\"|\']', content)
        four = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invtActl\.acConAm = [\"|\'](.+)[\"|\']', content)
        eight = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invtActl\.conDate = [\"|\'](.+)[\"|\']', content)
        nine = unicode(m.group(1), 'utf8') if m else None
        m = re.search(r'invtActl\.conForm = [\"|\'](.+)[\"|\']', content)
        seven = unicode(m.group(1), 'utf8') if m else None
        # The order (with subConAm and acConAm repeated) mirrors the column
        # layout expected by the caller in get_head_ths_tds.
        return [one, five, eight, four, five, six, seven, eight, nine]

    def get_tables(self, url):
        resp = self.reqst.get(url, proxies=self.proxies, timeout=120)
        if resp.status_code == 200:
            return BeautifulSoup(resp.content, 'html.parser').find_all('table')
            #return [table for table in tables] #if (table.find_all('th') or table.find_all('a')) ]

    def get_head_ths_tds(self, table):
        head = table.find_all('th')[0].get_text().strip().split(
            '\n')[0].strip()
        allths = [
            th.get_text().strip() for th in table.find_all('th')[1:]
            if th.get_text()
        ]
        if head in (u'股东信息', u'发起人信息', u'股东(发起人)信息', u'行政许可信息', u'股权出质登记信息'):
            tdlist = []
            for td in table.find_all('td'):
                if td.find_all('a'):
                    tddict = {}
                    detail_head, detail_allths, detail_alltds = self.get_head_ths_tds(
                        self.get_tables(td.a['href'])[0])
                    if detail_head == u'股东及出资信息':
                        detail_content = self.reqst.get(td.a['href'],
                                                        proxies=self.proxies,
                                                        timeout=120).content
                        detail_alltds = self.get_re_list_from_content(
                            detail_content)
                        # print '---------------------------', len(detail_allths[:3]+detail_allths[5:]), len(detail_alltds)
                        # tddict = self.get_one_to_one_dict(detail_allths[:3]+detail_allths[5:], detail_alltds)
                        detail_allths = detail_allths[:3] + detail_allths[5:]
                        # self.test_print_all_ths_tds(detail_head, detail_allths, detail_alltds)
                        son_need_dict = {}
                        for key, value in zip(detail_allths[3:],
                                              detail_alltds[3:]):
                            son_need_dict[key] = value
                        need_dict = {}
                        for key, value in zip(detail_allths[:3],
                                              detail_alltds[:3]):
                            need_dict[key] = value
                        need_dict['list'] = [son_need_dict]
                        tdlist.append({detail_head: [need_dict]})

                        # tdlist.append(tddict)
                    else:
                        tddict = self.get_one_to_one_dict(
                            detail_allths, detail_alltds)
                        tdlist.append(tddict)
                elif td.get_text():
                    tdlist.append(td.get_text().strip())
                else:
                    tdlist.append(None)
            return head, allths, tdlist
        # elif head == u'股东及出资信息(币种与注册资本一致)' or head == u'股东及出资信息':
        # 	pass
        elif head == u'企业年报':
            tdlist = []
            for td in table.find_all('td'):
                if td.find_all('a'):
                    tddict = {}
                    for i, table in enumerate(self.get_tables(td.a['href'])):
                        enter_head, enter_allths, enter_alltds = self.get_head_ths_tds(
                            table)
                        #print enter_head
                        if i == 0:
                            enter_head = enter_allths[0]
                            enter_allths = enter_allths[1:]
                        #self.test_print_all_ths_tds(enter_head, enter_allths, enter_alltds)
                        tddict[enter_head] = self.get_one_to_one_dict(
                            enter_allths, enter_alltds)
                        if enter_head in (u'企业基本信息', u'企业资产状况信息'):
                            tddict[enter_head] = self.get_one_to_one_dict(
                                enter_allths, enter_alltds)[0]
                    tdlist.append(td.get_text().strip())
                    tdlist.append(tddict)
                elif td.get_text():
                    tdlist.append(td.get_text().strip())
                else:
                    tdlist.append(None)
            allths.insert(2, u'详情')
            # self.test_print_all_ths_tds(head, allths, tdlist)
            return head, allths, tdlist
        else:
            if table.find_all('td'):
                alltds = [
                    td.get_text().strip() if td.get_text() else None
                    for td in table.find_all('td')
                ]
            else:
                alltds = [None for th in allths]
                # alltds = []
            if head == u'主要人员信息':
                return head, allths[:int(len(allths) / 2)], alltds
            else:
                return head, allths, alltds
        #return (table.find_all('th')[0].get_text().strip().split('\n')[0].strip(), [th.get_text().strip() for th in table.find_all('th')[1:] if th.get_text()], [td.get_text().strip() if td.get_text() else None for td in table.find_all('td')])

    def get_one_to_one_dict(self, allths, alltds):
        if len(allths) == len(alltds):
            if any(alltds):
                one_to_one_dict = {}
                for key, value in zip(allths, alltds):
                    one_to_one_dict[key] = value
                return [one_to_one_dict]
            else:
                return []
        else:
            templist = []
            x = 0
            y = x + len(allths)
            #print '---------------------%d-------------------------------%d' % (len(allth), len(alltd))
            while y <= len(alltds):
                tempdict = {}
                for keys, values in zip(allths, alltds[x:y]):
                    tempdict[keys] = values
                x = y
                y = x + len(allths)
                templist.append(tempdict)
            return templist
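
get_one_to_one_dict zips the header labels against the flat list of cell texts: when the two lengths match it produces at most one row, otherwise it walks the cell list in consecutive chunks of len(allths) and builds one dict per chunk. A worked illustration with made-up labels and values:

# illustrative input (hypothetical):
#   allths = [u'序号', u'名称', u'日期']
#   alltds = ['1', 'A', '2015-01-01', '2', 'B', '2015-02-01']
# result of get_one_to_one_dict(allths, alltds):
#   [{u'序号': '1', u'名称': 'A', u'日期': '2015-01-01'},
#    {u'序号': '2', u'名称': 'B', u'日期': '2015-02-01'}]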

    def test_print_table(self, tables):
        for table in tables:
            print table

    def test_print_all_ths_tds(self, head, allths, alltds):
        print '--------------', head, '--------------'
        for th in allths:
            print th
        for td in alltds:
            print td

    def test_print_all_dict(self, mydict):
        for key, value in mydict.items():
            print key, ':', value

    def get_json_one(self, mydict, tables):
        #self.test_print_table(tables)
        for table in tables:
            head, allths, alltds = self.get_head_ths_tds(table)
            #print head
            try:
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(
                    allths, alltds)
            except:
                pass
            if head == u'基本信息':
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(
                    allths, alltds)[0]
            if head == u'清算信息':
                if allths:
                    self.result_json_dict[
                        mydict[head]] = self.get_one_to_one_dict(
                            allths, alltds)
                else:
                    self.result_json_dict[mydict[head]] = []
            #self.test_print_all_ths_tds(head, allths, alltds)
        pass

    def get_json_two(self, mydict, tables):
        #self.test_print_table(tables)
        for table in tables:
            head, allths, alltds = self.get_head_ths_tds(table)
            #print head
            self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(
                allths, alltds)
        pass

    def get_json_three(self, mydict, tables):
        #self.test_print_table(tables)
        for table in tables:
            head, allths, alltds = self.get_head_ths_tds(table)
            #print head
            self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(
                allths, alltds)

        pass

    def get_json_four(self, mydict, tables):
        #self.test_print_table(tables)
        for table in tables:
            head, allths, alltds = self.get_head_ths_tds(table)
            #print head
            self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(
                allths, alltds)
        pass

    def run(self, findCode):

        self.ent_number = findCode

        id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \
           or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
           or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
        print id_args
        if id_args and id_args.download_args.get('uuid'):
            self.result_json_dict = {}
            self.uuid = id_args.download_args['uuid']

            tableone = self.get_tables(self.uuid + '&tab=01')
            self.get_json_one(self.one_dict, tableone)
            tabletwo = self.get_tables(self.uuid + '&tab=02')
            self.get_json_two(self.two_dict, tabletwo)
            tablethree = self.get_tables(self.uuid + '&tab=03')
            self.get_json_three(self.three_dict, tablethree)
            tablefour = self.get_tables(self.uuid + '&tab=06')
            self.get_json_four(self.four_dict, tablefour)

            CrawlerUtils.json_dump_to_file(
                'yunnan.json', {self.ent_number: self.result_json_dict})
            print json.dumps({self.ent_number: self.result_json_dict})
            return [{self.ent_number: self.result_json_dict}]
        else:
            #创建目录
            html_restore_path = self.json_restore_path + '/yunnan/'
            if not os.path.exists(html_restore_path):
                os.makedirs(html_restore_path)

            self.uuid = self.get_id_num(findCode)
            if self.uuid is None:
                return json.dumps({self.ent_number: {}})
            self.result_json_dict_list = []
            for div in BeautifulSoup(self.after_crack_checkcode_page,
                                     'html.parser').find_all(
                                         'div', attrs={'class': 'list-item'}):
                hrefa = div.find_all('a', attrs={'target': '_blank'})[0]
                if hrefa:
                    self.uuid = hrefa['href'].split('&')[0]
                    self.enterprise_name = div.find_all(
                        'div', attrs={'class': 'link'})[0].get_text().strip()
                    self.ent_number = div.find_all(
                        'span')[0].get_text().strip()

                    args =  CrawlerDownloadArgs.objects.filter(register_number=self.ent_number)\
                       or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
                       or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
                    if args:
                        args.delete()
                    args = CrawlerDownloadArgs(
                        province='yunnan',
                        register_number=self.ent_number,
                        unifield_number=self.ent_number,
                        enterprise_name=self.enterprise_name,
                        download_args={'uuid': self.uuid})
                    args.save()
                else:
                    continue
                print self.uuid
                self.result_json_dict = {}

                tableone = self.get_tables(self.uuid + '&tab=01')
                self.get_json_one(self.one_dict, tableone)
                tabletwo = self.get_tables(self.uuid + '&tab=02')
                self.get_json_two(self.two_dict, tabletwo)
                tablethree = self.get_tables(self.uuid + '&tab=03')
                self.get_json_three(self.three_dict, tablethree)
                tablefour = self.get_tables(self.uuid + '&tab=06')
                self.get_json_four(self.four_dict, tablefour)

                CrawlerUtils.json_dump_to_file(
                    'yunnan.json', {self.ent_number: self.result_json_dict})
                print json.dumps({self.ent_number: self.result_json_dict})
                self.result_json_dict_list.append(
                    {self.ent_number: self.result_json_dict})
            return self.result_json_dict_list
Пример #21
0
class JiangsuCrawler(Crawler):
    """江苏工商公示信息网页爬虫
    """
    #html数据的存储路径
    html_restore_path = settings.json_restore_path + '/jiangsu/'

    #验证码图片的存储路径
    ckcode_image_path = settings.json_restore_path + '/jiangsu/ckcode.jpg'
    code_cracker = CaptchaRecognition('jiangsu')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.jsgsj.gov.cn',
        'official_site':
        'http://www.jsgsj.gov.cn:58888/province/',
        'get_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/rand_img.jsp?type=7',
        'post_checkcode':
        'http://www.jsgsj.gov.cn:58888/province/infoQueryServlet.json?queryCinfo=true',
        'ind_comm_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfor_gsRelease.jsp',
        'ent_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qyRelease.jsp',
        'other_dept_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryCorpInfo_qtbmRelease.jsp',
        'judical_assist_pub_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_ci/ci_queryJudicialAssistance.jsp',
        'annual_report_skeleton':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/reportCheck/company/cPublic.jsp',
        'ci_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true',
        'common_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/commonServlet.json?commonEnter=true',
        'nb_enter':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/nbServlet.json?nbEnter=true',
        'ci_detail':
        'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciDetail=true'
    }

    def __init__(self, json_restore_path=None):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有江苏的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
             需要在写入文件的时候加锁
        Returns:
        """
        self.proxies = Proxies().get_proxies()
        self.json_restore_path = json_restore_path

        self.parser = JiangsuParser(self)
        self.reqst = requests.Session()
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })
        self.corp_org = ''
        self.corp_id = ''
        self.corp_seq_id = ''
        self.common_enter_post_data = {}
        self.ci_enter_post_data = {}
        self.nb_enter_post_data = {}
        self.post_info = {
            'ind_comm_pub_reg_basic': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter',
                'specificQuery': 'basicInfo'
            },
            'ind_comm_pub_reg_shareholder': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'investmentInfor'
            },
            'ind_comm_pub_reg_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'biangeng'
            },
            'ind_comm_pub_arch_key_persons': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'personnelInformation'
            },
            'ind_comm_pub_arch_branch': {
                'url_type': 'ci_enter',
                'post_type': 'ci_enter_with_recordline',
                'specificQuery': 'branchOfficeInfor'
            },
            #'ind_comm_pub_arch_liquadition': {'url_type': 'ci_enter', 'post_type': 'common_enter', 'specificQuery': 'qsfzr'},
            'ind_comm_pub_movable_property_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'dongchan'
            },
            'ind_comm_pub_equity_ownership_reg': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'guquanchuzhi'
            },
            'ind_comm_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'chufa'
            },
            'ind_comm_pub_business_exception': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'abnormalInfor'
            },
            #'ind_comm_pub_serious_violate_law': {'url_type': 'common_enter', 'post_type': 'common_enter', 'propertiesName': 'xxx'},
            'ind_comm_pub_spot_check': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'checkup'
            },
            'ind_comm_pub_reg_shareholder_detail': {
                'url_type': 'ci_detail',
                'post_type': 'ci_detail',
                'specificQuery': 'investorInfor'
            },
            'ent_pub_annual_report': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_report_list'
            },
            'annual_report_detail': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter'
            },
            'ent_pub_shareholder_capital_contribution': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_tzcz'
            },
            'ent_pub_administrative_license': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzxk'
            },
            'ent_pub_knowledge_property': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_zscq'
            },
            'ent_pub_administration_sanction': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': 'query_xzcf'
            },
            'other_dept_pub_administration_license': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzheng'
            },
            'other_dept_pub_administration_sanction': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'xingzhengchufa'
            },
            'judical_assist_pub_equity_freeze': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gqdjList'
            },
            'judical_assist_pub_shareholder_modify': {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': 'gdbgList'
            }
        }

    def run(self, ent_number=0):
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        return Crawler.run(self, ent_number)
        '''
        self.ent_number = str(ent_number)
        #对每个企业都指定一个html的存储目录
        self.html_restore_path = self.html_restore_path + self.ent_number + '/'
        if settings.save_html and not os.path.exists(self.html_restore_path):
            CrawlerUtils.make_dir(self.html_restore_path)

        self.json_dict = {}

        if not self.crawl_check_page():
            settings.logger.error('crack check code failed, stop to crawl enterprise %s' % self.ent_number)
            return False

        self.crawl_ind_comm_pub_pages()
        self.crawl_ent_pub_pages()
        self.crawl_other_dept_pub_pages()
        self.crawl_judical_assist_pub_pub_pages()

        #采用多线程,在写入文件时需要注意加锁
        self.write_file_mutex.acquire()
        CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.json_dict})
        self.write_file_mutex.release()
        return True
        '''

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        resp = self.crawl_page_by_url(self.urls['official_site'])
        if not resp:
            logging.error("crawl the first page page failed!\n")
            return False
        count = 0
        while count < 15:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                logging.error("crawl checkcode failed! count number = %d\n" %
                              (count))
                continue
            data = {'name': self.ent_number, 'verifyCode': ckcode[1]}
            resp = self.crawl_page_by_url_post(self.urls['post_checkcode'],
                                               data=data)

            if resp.find("onclick") >= 0 and self.parse_post_check_page(resp):
                return True
            else:
                logging.error(
                    "crawl post check page failed! count number = %d\n" %
                    (count))
            time.sleep(random.uniform(5, 8))
        return False

    def get_page_data(self, page_name, real_post_data=None):
        """获取页面数据,通过页面名称,和post_data, 江苏的页面中几乎全部都是post方式来获取数据
        """
        url = self.urls[self.post_info[page_name].get('url_type')]
        logging.info('get %s, url:\n%s\n' % (page_name, url))
        if real_post_data:
            return self.get_pages(url, real_post_data)

        if self.post_info[page_name].get('post_type') == 'ci_enter':
            self.ci_enter_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_enter_post_data
        elif self.post_info[page_name].get(
                'post_type') == 'ci_enter_with_recordline':
            self.ci_enter_with_record_line_post_data[
                'specificQuery'] = self.post_info[page_name].get(
                    'specificQuery')
            post_data = self.ci_enter_with_record_line_post_data
        elif self.post_info[page_name].get('post_type') == 'common_enter':
            self.common_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.common_enter_post_data
        elif self.post_info[page_name].get('post_type') == 'ci_detail':
            self.ci_detail_post_data['specificQuery'] = self.post_info[
                page_name].get('specificQuery')
            post_data = self.ci_detail_post_data
        elif self.post_info[page_name].get('post_type') == 'nb_enter':
            self.nb_enter_post_data['propertiesName'] = self.post_info[
                page_name].get('propertiesName')
            post_data = self.nb_enter_post_data
        return self.get_pages(url, post_data)
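
get_page_data stamps the page-specific query value onto one of five post-data templates, chosen by the post_type recorded in post_info. A hypothetical, behavior-equivalent sketch of the same dispatch as a lookup table (build_post_data is an invented name; the template attributes are the ones created in parse_post_check_page):

    def build_post_data(self, page_name):
        # Hypothetical replacement for the elif chain above: each post_type
        # names the template attribute and the field that gets stamped.
        dispatch = {
            'ci_enter': ('ci_enter_post_data', 'specificQuery'),
            'ci_enter_with_recordline':
                ('ci_enter_with_record_line_post_data', 'specificQuery'),
            'common_enter': ('common_enter_post_data', 'propertiesName'),
            'ci_detail': ('ci_detail_post_data', 'specificQuery'),
            'nb_enter': ('nb_enter_post_data', 'propertiesName'),
        }
        info = self.post_info[page_name]
        attr, field = dispatch[info['post_type']]
        post_data = getattr(self, attr)
        post_data[field] = info.get(field)
        return post_data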

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息
        """
        if not self.parser.ind_comm_pub_skeleton_built:
            page = self.crawl_skeleton_page('ind_comm_pub_skeleton')
            if not page:
                logging.error('crawl ind comm pub skeleton failed!')
                return False
            self.parser.parse_page('ind_comm_pub_skeleton', page)

        for item in (
                'ind_comm_pub_reg_basic',  # 登记信息-基本信息
                'ind_comm_pub_reg_shareholder',  # 股东信息
                'ind_comm_pub_reg_modify',
                'ind_comm_pub_arch_key_persons',  # 备案信息-主要人员信息
                'ind_comm_pub_arch_branch',  # 备案信息-分支机构信息
                #'ind_comm_pub_arch_liquidation', # 备案信息-清算信息, 网页中没有
                'ind_comm_pub_movable_property_reg',  # 动产抵押登记信息
                #'ind_comm_pub_equity_ownership_reg', # 股权出置登记信息
                'ind_comm_pub_administration_sanction',  # 行政处罚信息
                #'ind_comm_pub_business_exception',  # 经营异常信息 , 网页中不存在
                #'ind_comm_pub_serious_violate_law',  # 严重违法信息
                'ind_comm_pub_spot_check'):  # 抽查检查信息

            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息
        """
        if not self.parser.ent_pub_skeleton_built:
            page = self.crawl_skeleton_page('ent_pub_skeleton')
            if not page:
                logging.error('crawl ent pub skeleton failed!')
                return False
            self.parser.parse_page('ent_pub_skeleton', page)

        if not self.parser.annual_report_skeleton_built:
            page = self.crawl_skeleton_page('annual_report_skeleton')
            if not page:
                logging.error('crawl annual report skeleton failed!')
                return False
            self.parser.parse_page('annual_report_skeleton', page)

        for item in (
                'ent_pub_annual_report',
                #'ent_pub_shareholder_capital_contribution', #企业投资人出资比例
                #'ent_pub_equity_change', #股权变更信息
                'ent_pub_administrative_license',  #行政许可信息
                'ent_pub_knowledge_property',  #知识产权出资登记
                #'ent_pub_administration_sanction' #行政许可信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息
        """
        if not self.parser.other_dept_pub_skeleton_built:
            page = self.crawl_skeleton_page('other_dept_pub_skeleton')
            if not page:
                logging.error('crawl other dept pub skeleton failed!')
                return False
            self.parser.parse_page('other_dept_pub_skeleton', page)

        for item in (
                'other_dept_pub_administration_license',  #行政许可信息
                'other_dept_pub_administration_sanction'  #行政处罚信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def crawl_judical_assist_pub_pub_pages(self):
        """爬取司法协助信息
        """
        if not self.parser.judical_assist_pub_skeleton_built:
            page = self.crawl_skeleton_page('judical_assist_pub_skeleton')
            if not page:
                logging.error('crawl judical assist skeleton failed!')
                return False
            self.parser.parse_page('judical_assist_pub_skeleton', page)

        for item in (
                'judical_assist_pub_equity_freeze',  #股权冻结信息
                'judical_assist_pub_shareholder_modify'  #股东变更信息
        ):
            page_data = self.get_page_data(item)
            self.json_dict[item] = self.parser.parse_page(item, page_data)

    def get_pages(self, url, post_data):
        """获取网页数据
        Args:
            url: url地址
            post_data: post方式获取数据,返回的如果是一个列表,则将列表的所有元素都获得才返回
        Returns:
        """
        resp = self.crawl_page_by_url_post(url, data=post_data)
        if not resp:
            logging.error('get all pages of a section failed!')
            return
        else:
            json_obj = json.loads(resp)
            if type(json_obj) == dict and json_obj.get(
                    'total', None) and int(json_obj.get('total')) > 5:
                post_data['pageSize'] = json_obj.get('total')
                resp = self.crawl_page_by_url_post(url, data=post_data)
                if not resp:
                    logging.error('get all pages of a section failed!')
                    return
        return resp
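
get_pages works around the default page size of 5: the first post uses the template's pageSize, and when the JSON reply reports a larger total the request is repeated with pageSize set to that total so the whole section comes back in one response. A standalone sketch of the same two-step exchange (fetch_all_rows is an invented name; session is a requests.Session):

import json

def fetch_all_rows(session, url, post_data, proxies=None):
    # Hypothetical standalone version of the pageSize trick in get_pages:
    # re-post with pageSize=total when the first reply reports more rows
    # than the default page holds.
    resp = session.post(url, data=post_data, proxies=proxies)
    obj = json.loads(resp.content)
    if isinstance(obj, dict) and obj.get('total') and int(obj['total']) > 5:
        post_data['pageSize'] = obj['total']
        resp = session.post(url, data=post_data, proxies=proxies)
    return resp.content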

    def crawl_skeleton_page(self, name):
        """爬取网页表格的框架页面,在江苏的网页中, 工商公示信息, 企业公示信息,其他部门公示信息,司法协助信息
        所有的tab页面中的表格结构都在一个最开始的页面中给出
        """
        url = self.urls[name]
        post_data = {
            'org': self.corp_org,
            'id': self.corp_id,
            'seq_id': self.corp_seq_id,
            'reg_no': self.ent_number,
            'name': self.ent_number,
            'containContextPath': 'ecipplatform',
            'corp_name': self.ent_number
        }
        resp = self.crawl_page_by_url_post(url, data=post_data)
        if not resp:
            logging.error('crawl %s page failed, error code.\n' % (name))
            return False
        return resp

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,提取所需要的信息,比如corp id等
        Args:
            page: 提交验证码之后的页面
        """
        m = re.search(
            r'onclick=\\\"\w+\(\'([\w\./]+)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\'\)',
            page)
        if m:
            self.corp_org = m.group(2)
            self.corp_id = m.group(3)
            self.corp_seq_id = m.group(4)
            self.common_enter_post_data = {
                'showRecordLine': '1',
                'specificQuery': 'commonQuery',
                'propertiesName': '',
                'corp_org': self.corp_org,
                'corp_id': self.corp_id,
                'pageNo': '1',
                'pageSize': '5'
            }
            self.ci_enter_post_data = {
                'org': self.corp_org,
                'id': self.corp_id,
                'seq_id': self.corp_seq_id,
                'specificQuery': ''
            }
            self.ci_enter_with_record_line_post_data = {
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'CORP_SEQ_ID': self.corp_seq_id,
                'specificQuery': '',
                'pageNo': '1',
                'pageSize': '5',
                'showRecordLine': '1'
            }
            self.ci_detail_post_data = {
                'ORG': self.corp_org,
                'ID': '',
                'CORP_ORG': self.corp_org,
                'CORP_ID': self.corp_id,
                'SEQ_ID': '',
                'REG_NO': self.ent_number,
                'specificQuery': ''
            }
            self.nb_enter_post_data = {
                'ID': '',
                'REG_NO': self.ent_number,
                'showRecordLine': '0',
                'specificQuery': 'gs_pb',
                'propertiesName': '',
                'pageNo': '1',
                'pageSize': '5',
                'ADMIT_MAIN': '08'
            }
            return True
        return False
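
parse_post_check_page pulls corp_org, corp_id and corp_seq_id out of the escaped onclick handler embedded in the post-captcha response; groups 2 to 4 of the regular expression carry those three values. A self-contained check against an invented fragment (the function name and ids below are placeholders, only the shape follows the regex):

import re

# Hypothetical onclick fragment; only its structure matches the real page.
sample = r"onclick=\"openInfo('ecipplatform/x.jsp','320000','123456','7890','','','')"
m = re.search(
    r'onclick=\\\"\w+\(\'([\w\./]+)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\',\'(\w*)\'\)',
    sample)
print m.group(2), m.group(3), m.group(4)  # corp_org, corp_id, corp_seq_id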

    def crack_checkcode(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.crawl_page_by_url(self.urls['get_checkcode'])
        if not resp:
            logging.error('Failed, exception occurred when getting checkcode')
            return ('', '')
        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        ckcode = ('', '')
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception:
            logging.error('exception occurred when cracking checkcode')
            ckcode = ('', '')
        self.write_file_mutex.release()
        return ckcode

    def crawl_page_by_url(self, url):
        """根据url直接爬取页面
        """
        try:
            resp = self.reqst.get(url, proxies=self.proxies)
            if resp.status_code != 200:
                logging.error('crawl page by url failed! url = %s' % url)
            page = resp.content
            time.sleep(random.uniform(0.2, 1))
            # if saveingtml:
            #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
            return page
        except Exception as e:
            logging.error("crawl page by url exception %s" % (type(e)))

        return None

    def crawl_page_by_url_post(self, url, data):
        """ 根据url和post数据爬取页面
        """
        r = self.reqst.post(url, data, proxies=self.proxies)
        time.sleep(random.uniform(0.2, 1))
        if r.status_code != 200:
            logging.error(
                u"Getting page by url with post:%s\n, return status %s\n" %
                (url, r.status_code))
            return False
        return r.content

    def get_annual_report_detail(self, report_year, report_id):
        """获取企业年报的详细信息
        """
        annual_report_detail = {}
        post_data = self.nb_enter_post_data
        post_data['ID'] = report_id
        post_data['showRecordLine'] = '0'
        post_data['OPERATE_TYPE'] = '2'
        post_data['propertiesName'] = 'query_basicInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'企业基本信息'] = self.parser.parse_page(
            'annual_report_ent_basic_info', page_data)
        annual_report_detail[u'企业资产状况信息'] = self.parser.parse_page(
            'annual_report_ent_property_info', page_data)

        post_data['showRecordLine'] = '1'
        post_data['propertiesName'] = 'query_websiteInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'网站或网店信息'] = self.parser.parse_page(
            'annual_report_web_info', page_data)

        post_data['propertiesName'] = 'query_investInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外投资信息'] = self.parser.parse_page(
            'annual_report_investment_abord_info', page_data)

        post_data['MAIN_ID'] = report_id
        post_data['OPERATE_TYPE'] = '1'
        post_data['TYPE'] = 'NZGS'
        post_data['ADMIT_MAIN'] = '08'
        post_data['propertiesName'] = 'query_stockInfo'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'股东及出资信息'] = self.parser.parse_page(
            'annual_report_shareholder_info', page_data)

        post_data['propertiesName'] = 'query_InformationSecurity'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'对外提供保证担保信息'] = self.parser.parse_page(
            'annual_report_external_guarantee_info', page_data)

        post_data['propertiesName'] = 'query_RevisionRecord'
        page_data = self.get_page_data('annual_report_detail', post_data)
        annual_report_detail[u'修改记录'] = self.parser.parse_page(
            'annual_report_modify_record', page_data)
        return annual_report_detail
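
get_annual_report_detail walks a fixed sequence of propertiesName values and hands each reply to a matching parser page. The pairs can be summarized in one table; the sketch below deliberately omits the extra post fields (showRecordLine, OPERATE_TYPE, MAIN_ID, TYPE, ADMIT_MAIN) that some sections set before the request.

# Hypothetical summary table: (propertiesName, section title, parser page).
ANNUAL_REPORT_SECTIONS = [
    ('query_basicInfo', u'企业基本信息', 'annual_report_ent_basic_info'),
    ('query_basicInfo', u'企业资产状况信息', 'annual_report_ent_property_info'),
    ('query_websiteInfo', u'网站或网店信息', 'annual_report_web_info'),
    ('query_investInfo', u'对外投资信息', 'annual_report_investment_abord_info'),
    ('query_stockInfo', u'股东及出资信息', 'annual_report_shareholder_info'),
    ('query_InformationSecurity', u'对外提供保证担保信息',
     'annual_report_external_guarantee_info'),
    ('query_RevisionRecord', u'修改记录', 'annual_report_modify_record'),
]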
Пример #22
0
class GuangdongClawer(object):

    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.html_search = None
        self.html_showInfo = None
        self.Captcha = None
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.main_host = ""
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.dir_restore_path = settings.json_restore_path + '/guangdong/'
        #self.json_restore_path = settings.json_restore_path + '/guangdong.json'
        #验证码图片的存储路径
        self.path_captcha = settings.json_restore_path + '/guangdong/ckcode.jpg'

    # 破解搜索页面
    def crawl_page_search(self, url):
        r = self.requests.get(url)
        if r.status_code != 200:
            logging.error(
                u"Something wrong when getting the url:%s , status_code=%d",
                url, r.status_code)
            return
        r.encoding = "utf-8"
        #logging.error("searchpage html :\n  %s", r.text)
        self.html_search = r.text

    #获得搜索结果展示页面
    def get_page_showInfo(self, url, datas):
        r = self.requests.post(url, data=datas)
        if r.status_code != 200:
            return False
        r.encoding = "utf-8"
        #logging.error("showInfo page html :\n  %s", r.text)
        self.html_showInfo = r.text

    #分析 展示页面, 获得搜索到的企业列表
    def analyze_showInfo(self):
        if self.html_showInfo is None:
            logging.error(u"Getting Page ShowInfo failed\n")
            return
        # call Object Analyze's method
        Ent = []
        soup = BeautifulSoup(self.html_showInfo, "html5lib")
        divs = soup.find_all("div", {"class": "list"})
        if divs:
            for div in divs:
                logging.error(u"div.ul.li.a['href'] = %s\n",
                              div.ul.li.a['href'])
                Ent.append(div.ul.li.a['href'])
        self.ents = Ent

    # 破解验证码页面
    def crawl_page_captcha(self,
                           url_Captcha,
                           url_CheckCode,
                           url_showInfo,
                           textfield='440301102739085'):

        count = 0
        while True:
            count += 1
            r = self.requests.get(url_Captcha)
            if r.status_code != 200:
                logging.error(
                    u"Something wrong when getting the Captcha url:%s , status_code=%d",
                    url_Captcha, r.status_code)
                return
            self.Captcha = r.content
            if self.save_captcha():
                result = self.crack_captcha()
                #print result
                datas = {
                    'textfield': textfield,
                    'code': result,
                }
                response = self.get_check_response(url_CheckCode, datas)
                # response返回的json结果: {u'flag': u'1', u'textfield': u'H+kiIP4DWBtMJPckUI3U3Q=='}
                if response['flag'] == '1':
                    datas_showInfo = {
                        'textfield': response['textfield'],
                        'code': result
                    }
                    self.get_page_showInfo(url_showInfo, datas_showInfo)
                    break
                else:
                    logging.error(
                        u"Failed to crack captcha for ID %s, attempt %d" %
                        (self.ent_num, count))
                    if count > 15:
                        logging.error(
                            u"ID: %s, giving up on the captcha after %d attempts"
                            % (textfield, count))
                        break
            time.sleep(random.uniform(1, 4))
        return

    # Get the captcha verification result
    def get_check_response(self, url, datas):
        r = self.requests.post(url, data=datas)
        if r.status_code != 200:
            return False
        #print r.json()
        return r.json()

    # Recognize the captcha image and return the result
    def crack_captcha(self):
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]
        #print result

    # Save the captcha image to disk
    def save_captcha(self):
        url_Captcha = self.path_captcha
        if self.Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        try:
            # open in binary mode; the captcha is raw image bytes
            with open(url_Captcha, 'wb') as f:
                f.write(self.Captcha)
        except IOError:
            logging.error("%s can not be written", url_Captcha)
        finally:
            self.write_file_mutex.release()
        return True

    """
    The following functions are for main page
    """
    """ 1. iterate enterprises in ents
        2. for each ent: decide host so that choose functions by pattern
        3. for each pattern, iterate urls
        4. for each url, iterate item in tabs
    """

    def crawl_page_main(self):
        sub_json_dict = {}
        if not self.ents:
            logging.error(u"Get no search result\n")
        try:

            for ent in self.ents:
                #http://www.szcredit.com.cn/web/GSZJGSPT/QyxyDetail.aspx?rid=acc04ef9ac0145ecb8c87dd5710c2f86
                #http://gsxt.gzaic.gov.cn/search/search!entityShow?entityVo.pripid=440100100012003051400230
                #http://gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html?service=entInfo_+8/Z3ukM3JcWEfZvXVt+QiLPiIqemiEqqq4l7n9oAh/FI+v6zW/DL40+AV4Hja1y-dA+Hj5oOjXjQTgAhKSP1lA==
                #HOSTS = ["www.szcredit.com.cn", "121.8.227.200:7001", "gsxt.gdgs.gov.cn/aiccips"]
                m = re.match('http', ent)
                if m is None:
                    ent = urls['host'] + ent[3:]
                logging.error(u"ent url:%s\n" % ent)
                for i, item in enumerate(HOSTS):
                    if ent.find(item) != -1:

                        #"www.szcredit.com.cn"
                        if i == 0:
                            logging.error(u"This %s enterprise is type 0" %
                                          (self.ent_num))
                            guangdong = Guangdong0(self.requests, self.ent_num)
                            sub_json_dict = guangdong.run(ent)
                        elif i == 1:
                            logging.error(u"This %s enterprise is type 1" %
                                          (self.ent_num))
                            guangdong = Guangdong1(self.requests)
                            sub_json_dict = guangdong.run(ent)
                        # gsxt.gdgs.gov.cn/aiccips
                        elif i == 2:
                            logging.error(u"This %s enterprise is type 2" %
                                          (self.ent_num))
                            guangdong = Guangdong2(self.requests)
                            sub_json_dict = guangdong.run(ent)
                        break
                else:
                    logging.error(u"No known host matched for enterprise %s\n" %
                                  self.ent_num)
        except Exception as e:
            logging.error(
                u"An error occurred when getting the main page, error: %s" %
                type(e))
            raise e
        finally:
            return sub_json_dict

    def crawl_page_by_url(self, url):
        r = self.requests.get(url)
        if r.status_code != 200:
            logging.error(u"Getting page by url:%s\n, return status %s\n" %
                          (url, r.status_code))
            return False
        # return the final url as well, in case the page was reached via redirects
        return {'page': r.text, 'url': r.url}

    def crawl_page_by_url_post(self, url, data, header={}):
        if header:
            r = self.requests.post(url, data, headers=header)
        else:
            r = self.requests.post(url, data)
        if r.status_code != 200:
            logging.error(
                u"Getting page by url with post:%s\n, return status %s\n" %
                (url, r.status_code))
            return False
        return {'page': r.text, 'url': r.url}

    # main function
    def run(self, ent_num):
        if not os.path.exists(self.dir_restore_path):
            os.makedirs(self.dir_restore_path)
        json_dict = {}
        self.ent_num = str(ent_num)
        logging.error('crawl ID: %s\n' % ent_num)
        self.crawl_page_search(urls['page_search'])
        self.crawl_page_captcha(urls['page_Captcha'], urls['checkcode'],
                                urls['page_showinfo'], ent_num)
        self.analyze_showInfo()
        data = self.crawl_page_main()
        json_dict[ent_num] = data
        return json.dumps(json_dict)
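
# The following is a minimal, self-contained sketch of the host-based dispatch used in
# GuangdongClawer.crawl_page_main above: the handler is chosen by which known host
# substring appears in the enterprise url. The handler functions here are illustrative
# placeholders, not the real Guangdong0/1/2 classes; only the host list is taken from
# the comments in the original code.
HOSTS_SKETCH = ["www.szcredit.com.cn", "121.8.227.200:7001", "gsxt.gdgs.gov.cn/aiccips"]


def handle_type0(url):
    return {"handler": 0, "url": url}


def handle_type1(url):
    return {"handler": 1, "url": url}


def handle_type2(url):
    return {"handler": 2, "url": url}


HANDLERS_SKETCH = [handle_type0, handle_type1, handle_type2]


def dispatch_by_host(ent_url):
    """Pick the handler whose host substring occurs in the enterprise url."""
    for host, handler in zip(HOSTS_SKETCH, HANDLERS_SKETCH):
        if ent_url.find(host) != -1:
            return handler(ent_url)
    # mirrors the for/else branch above: no known host matched
    return None


if __name__ == '__main__':
    print(dispatch_by_host("http://gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html"))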
Пример #23
0
class HebeiCrawler(object):
    # Mutex protecting writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path):
        self.CR = CaptchaRecognition("hebei")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.json_restore_path = json_restore_path
        self.csrf = ""
        # Path where the captcha image is stored
        self.path_captcha = settings.json_restore_path + '/hebei/ckcode.jpeg'
        # Path where the html data is stored
        self.html_restore_path = settings.json_restore_path + '/hebei/'

    # Fetch the search page
    def crawl_page_search(self, url):
        r = self.requests.get( url)
        if r.status_code != 200:
            logging.error(u"Something wrong when getting the url:%s , status_code=%d", url, r.status_code)
            return
        r.encoding = "utf-8"
        #logging.debug("searchpage html :\n  %s", r.text)
        return r.text

    # Parse the result page and collect the list of matched enterprises
    def analyze_showInfo(self, page):
        Ent = []
        soup = BeautifulSoup(page, "html5lib")
        divs = soup.find_all("div", {"class":"list-item"})
        for div in divs:
            Ent.append(div.find('a')['href'])
        self.ents = Ent

    def crawl_page_captcha(self, url_search, url_Captcha, url_CheckCode, url_showInfo, textfield='130000000021709'):
        """Crack the captcha page and run the search."""
        html_search = self.crawl_page_search(url_search)
        if not html_search:
            logging.error(u"There is no search page")
            return
        soup = BeautifulSoup(html_search, 'html5lib')
        form = soup.find('form', {'id':'formInfo'})
        datas = {
            #'searchType': 1,
            'captcha': None,
            'session.token': form.find('input', {'name': 'session.token'})['value'],
            #'condition.keyword': textfield,
        }
        count = 0
        while True:
            count += 1
            captcha_url = url_Captcha + str(random.random())
            r = self.requests.get(captcha_url)
            if r.status_code != 200:
                logging.error(u"Something wrong when getting the Captcha url:%s , status_code=%d", captcha_url, r.status_code)
                return
            #logging.debug("Captcha page html :\n  %s", self.Captcha)
            if self.save_captcha(r.content):
                logging.info("Captcha is saved successfully\n")
                datas['captcha'] = self.crack_captcha()
                logging.info("cracked captcha is %s" % (datas['captcha']))
                res = self.crawl_page_by_url_post(url_CheckCode, datas)['page']
                # If the captcha is accepted the server returns a non-'0' flag,
                # otherwise it falls back to the main page
                if str(res) != '0':
                    datas['searchType'] = 1
                    datas['condition.keyword'] = textfield
                    page = self.crawl_page_by_url_post(url_showInfo, datas)['page']
                    self.analyze_showInfo(page)
                    break
                else:
                    logging.debug(u"crack Captcha failed, the %d time(s)", count)
                    if count > 15:
                        break
        return

    def crack_captcha(self):
        """调用函数,破解验证码图片并返回结果"""
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]
        #print result
    def save_captcha(self, Captcha):
        """保存验证码图片"""
        url_Captcha = self.path_captcha
        if Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        f = open(url_Captcha, 'w')
        try:
            f.write(Captcha)
        except IOError:
            logging.debug("%s can not be written", url_Captcha)
        finally:
            f.close
        self.write_file_mutex.release()
        return True
    """
        The following enterprises in ents
        1. for each ent: decide host so that choose e urls
        2. for eah url, iterate item in tabs
    """
    def crawl_page_main(self ):
        """  爬取页面信息总函数        """
        sub_json_dict= {}
        if not self.ents:
            logging.error(u"Get no search result\n")
        try:
            for ent in self.ents:
                m = re.match('http', ent)
                if m is None:
                    ent = urls['host']+ ent
                logging.info(u"crawl main url:%s"% ent)
                # The last two characters of the url select the section:
                # 01 = industrial & commercial publicity, 02 = enterprise publicity,
                # 03 = other departments publicity, 06 = judicial assistance
                url = ent
                sub_json_dict.update(self.crawl_ind_comm_pub_pages(url))
                url = url[:-2] + "02"
                sub_json_dict.update(self.crawl_ent_pub_pages(url))
                url = url[:-2] + "03"
                sub_json_dict.update(self.crawl_other_dept_pub_pages(url))
                url = url[:-2] + "06"
                sub_json_dict.update(self.crawl_judical_assist_pub_pages(url))

        except Exception as e:
            logging.error(u"An error ocurred when getting the main page, error: %s"% type(e))
            raise e
        finally:
            return sub_json_dict

    # Industrial & commercial publicity info pages
    def crawl_ind_comm_pub_pages(self, url):
        """Crawl the industrial & commercial publicity info pages."""
        sub_json_dict={}
        try:
            #page = html_from_file('next.html')
            logging.info( u"crawl the crawl_ind_comm_pub_pages page %s."%(url))
            page = self.crawl_page_by_url(url)['page']
            #html_to_file('next.html', page)
            dj = self.parse_page(page ) # class= result-table
            sub_json_dict['ind_comm_pub_reg_basic'] = dj[u'基本信息'] if dj.has_key(u'基本信息') else []        # 登记信息-基本信息
            sub_json_dict['ind_comm_pub_reg_shareholder'] =dj[u'股东信息'] if dj.has_key(u'股东信息') else []   # 股东信息
            sub_json_dict['ind_comm_pub_reg_modify'] =  dj[u'变更信息'] if dj.has_key(u'变更信息') else []      # 变更信息
            sub_json_dict['ind_comm_pub_arch_key_persons'] = dj[u'主要人员信息'] if dj.has_key(u'主要人员信息') else []   # 备案信息-主要人员信息
            sub_json_dict['ind_comm_pub_arch_branch'] = dj[u'分支机构信息'] if dj.has_key(u'分支机构信息') else []       # 备案信息-分支机构信息
            sub_json_dict['ind_comm_pub_arch_liquidation'] = dj[u'清算信息'] if dj.has_key(u'清算信息') else []   # 备案信息-清算信息
            sub_json_dict['ind_comm_pub_movable_property_reg'] = dj[u'动产抵押登记信息'] if dj.has_key(u'动产抵押登记信息') else []
            sub_json_dict['ind_comm_pub_equity_ownership_reg'] = dj[u'股权出质登记信息'] if dj.has_key(u'股权出质登记信息') else []
            sub_json_dict['ind_comm_pub_administration_sanction'] = dj[u'行政处罚信息'] if dj.has_key(u'行政处罚信息') else []
            sub_json_dict['ind_comm_pub_business_exception'] = dj[u'经营异常信息'] if dj.has_key(u'经营异常信息') else []
            sub_json_dict['ind_comm_pub_serious_violate_law'] = dj[u'严重违法信息'] if dj.has_key(u'严重违法信息') else []
            sub_json_dict['ind_comm_pub_spot_check'] = dj[u'抽查检查信息'] if dj.has_key(u'抽查检查信息') else []
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_ind_comm_pub_pages: %s"% type(e))
            raise e
        finally:
            return sub_json_dict

    # Enterprise publicity info pages
    def crawl_ent_pub_pages(self, url):
        """Crawl the enterprise publicity info pages."""
        sub_json_dict = {}
        try:
            logging.info( u"crawl the crawl_ent_pub_pages page %s"%(url))
            page = self.crawl_page_by_url(url)['page']
            #html_to_file('next.html', page)
            #page = html_from_file('next.html')
            p = self.parse_page(page)
            sub_json_dict['ent_pub_ent_annual_report'] = p[u'企业年报'] if p.has_key(u'企业年报') else []
            sub_json_dict['ent_pub_administration_license'] = p[u'行政许可信息'] if p.has_key(u'行政许可信息') else []
            sub_json_dict['ent_pub_administration_sanction'] = p[u'行政处罚信息'] if p.has_key(u'行政处罚信息') else []
            sub_json_dict['ent_pub_shareholder_capital_contribution'] = p[u'股东及出资信息(币种与注册资本一致)'] if p.has_key(u'股东及出资信息(币种与注册资本一致)') else []
            sub_json_dict['ent_pub_reg_modify'] = p[u'变更信息'] if p.has_key(u'变更信息') else []
            sub_json_dict['ent_pub_equity_change'] = p[u'股权变更信息'] if p.has_key(u'股权变更信息') else []
            sub_json_dict['ent_pub_knowledge_property'] = p[u'知识产权出质登记信息'] if p.has_key(u'知识产权出质登记信息') else []
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_ent_pub_pages: %s"% type(e))
            raise e
        finally:
            return sub_json_dict

    # Other departments' publicity pages
    def crawl_other_dept_pub_pages(self, url):
        """Crawl the publicity info pages of other departments."""
        sub_json_dict = {}
        try:
            logging.info( u"crawl the crawl_other_dept_pub_pages page %s."%(url))
            page = self.crawl_page_by_url(url)['page']
            #html_to_file('next.html', page)
            #page = html_from_file('next.html')
            xk = self.parse_page(page)#行政许可信息
            sub_json_dict["other_dept_pub_administration_license"] =  xk[u'行政许可信息'] if xk.has_key(u'行政许可信息') else []
            sub_json_dict["other_dept_pub_administration_sanction"] = xk[u'行政处罚信息'] if xk.has_key(u'行政处罚信息') else []  # 行政处罚信息
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_other_dept_pub_pages: %s"% (type(e)))
            raise e
        finally:
            return sub_json_dict

    def crawl_judical_assist_pub_pages(self, url):
        """爬取司法协助信息页面 """
        sub_json_dict = {}
        try:
            logging.info( u"crawl the crawl_judical_assist_pub_pages page %s."%(url))
            page = self.crawl_page_by_url(url)['page']
            #page = html_from_file('next.html')
            #html_to_file('next.html', page)
            xz = self.parse_page(page)
            sub_json_dict['judical_assist_pub_equity_freeze'] = xz[u'司法股权冻结信息'] if xz.has_key(u'司法股权冻结信息') else []
            sub_json_dict['judical_assist_pub_shareholder_modify'] = xz[u'司法股东变更登记信息'] if xz.has_key(u'司法股东变更登记信息') else []
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_judical_assist_pub_pages: %s"% (type(e)))
            raise e
        finally:
            return sub_json_dict

    # Map contribution-type codes to their descriptions
    def dicInvtType(self, types):
        type_map = {
            "1": "货币",
            "2": "实物",
            "3": "知识产权",
            "4": "债权",
            "5": "高新技术成果",
            "6": "土地使用权",
            "7": "股权",
            "8": "劳务",
            "9": "其他",
        }
        return type_map.get(types)

    def get_raw_text_by_tag(self, tag):
        return tag.get_text().strip()

    def get_table_title(self, table_tag):
        if table_tag.find('tr'):
            if table_tag.find('tr').find_all('th')  :
                if len(table_tag.find('tr').find_all('th')) > 1 :
                    return None
                # handle <th> aa<span> bb</span> </th>
                if table_tag.find('tr').th.string is None and len(table_tag.find('tr').th.contents) > 1:
                    # handle <th>   <span> bb</span> </th> that contains whitespace
                    if (table_tag.find('tr').th.contents[0]).strip()  :
                        return (table_tag.find('tr').th.contents[0]).strip()
                # <th><span> bb</span> </th>
                return self.get_raw_text_by_tag(table_tag.find('tr').th)
        return None

    def sub_column_count(self, th_tag):
        if th_tag.has_attr('colspan') and int(th_tag.get('colspan')) > 1:
            return int(th_tag.get('colspan'))
        return 0

    def get_sub_columns(self, tr_tag, index, count):
        columns = []
        for i in range(index, index + count):
            th = tr_tag.find_all('th')[i]
            if not self.sub_column_count(th):
                columns.append(( self.get_raw_text_by_tag(th), self.get_raw_text_by_tag(th)))
            else:
                # the column itself has sub-columns
                columns.append((self.get_raw_text_by_tag(th), self.get_sub_columns(tr_tag.nextSibling.nextSibling, 0, self.sub_column_count(th))))
        return columns

    # get column data recursively; recursion is needed because a table may be nested
    # inside another table (a standalone sketch of this table-parsing idea follows this class)
    def get_column_data(self, columns, td_tag):
        if type(columns) == list:
            data = {}
            multi_col_tag = td_tag
            if td_tag.find('table'):
                multi_col_tag = td_tag.find('table').find('tr')
            if not multi_col_tag:
                logging.error('invalid multi_col_tag, multi_col_tag = %s', multi_col_tag)
                return data

            if len(columns) != len(multi_col_tag.find_all('td', recursive=False)):
                logging.error('column head size != column data size, columns head = %s, columns data = %s' % (columns, multi_col_tag.contents))
                return data

            for id, col in enumerate(columns):
                data[col[0]] = self.get_column_data(col[1], multi_col_tag.find_all('td', recursive=False)[id])
            return data
        else:
            return self.get_raw_text_by_tag(td_tag)


    def get_detail_link(self, bs4_tag):
        if bs4_tag.has_attr('href') and (bs4_tag['href'] != '#' and bs4_tag['href'] != 'javascript:void(0);'):
            pattern = re.compile(r'http')
            if pattern.search(bs4_tag['href']):
                return bs4_tag['href']
            return urls['webroot'] + bs4_tag['href']
        elif bs4_tag.has_attr('onclick'):
            #print 'onclick'
            logging.error(u"onclick attr was found in detail link")
        return None


    def get_columns_of_record_table(self, bs_table, page, table_name):
        tbody = None
        if len(bs_table.find_all('tbody')) > 1:
            tbody= bs_table.find_all('tbody')[0]
        else:
            tbody = bs_table.find('tbody') or BeautifulSoup(page, 'html5lib').find('tbody')

        tr = None
        if tbody:
            if len(tbody.find_all('tr')) <= 1:
                tr = tbody.find('tr')
            else:
                tr = tbody.find_all('tr')[1]
                if not tr.find('th'):
                    tr = tbody.find_all('tr')[0]
                elif tr.find('td'):
                    tr = None
        else:
            if len(bs_table.find_all('tr')) <= 1:
                return None
            elif bs_table.find_all('tr')[0].find('th') and not bs_table.find_all('tr')[0].find('td') and len(bs_table.find_all('tr')[0].find_all('th')) > 1:
                tr = bs_table.find_all('tr')[0]
            elif bs_table.find_all('tr')[1].find('th') and not bs_table.find_all('tr')[1].find('td') and len(bs_table.find_all('tr')[1].find_all('th')) > 1:
                tr = bs_table.find_all('tr')[1]
        ret_val=  self.get_record_table_columns_by_tr(tr, table_name)
        #logging.debug(u"ret_val->%s\n", ret_val)
        return  ret_val

    def get_record_table_columns_by_tr(self, tr_tag, table_name):
        columns = []
        if not tr_tag:
            return columns
        try:
            sub_col_index = 0
            if len(tr_tag.find_all('th'))==0 :
                logging.error(u"The table %s has no columns"% table_name)
                return columns
            count = 0
            if len(tr_tag.find_all('th'))>0 :
                for th in tr_tag.find_all('th'):
                    #logging.debug(u"th in get_record_table_columns_by_tr =\n %s", th)
                    col_name = self.get_raw_text_by_tag(th)
                    if col_name :
                        if ((col_name, col_name) in columns) :
                            col_name= col_name+'_'
                            count+=1
                        if not self.sub_column_count(th):
                            columns.append((col_name, col_name))
                        else: #has sub_columns
                            columns.append((col_name, self.get_sub_columns(tr_tag.nextSibling.nextSibling, sub_col_index, self.sub_column_count(th))))
                            sub_col_index += self.sub_column_count(th)
                if count == len(tr_tag.find_all('th'))/2:
                    columns= columns[: len(columns)/2]
        except Exception as e:
            logging.error(u'exception occurred in get_table_columns, except_type = %s, table_name = %s' % (type(e), table_name))
        finally:
            return columns

    # Parse the annual report detail page
    def parse_ent_pub_annual_report_page(self, page):
        sub_dict = {}
        try:
            soup = BeautifulSoup(page, 'html5lib')
            # the basic info table contains two header rows and needs special handling
            basic_table = soup.find('table')
            trs = basic_table.find_all('tr')
            title = self.get_raw_text_by_tag(trs[1].th)
            table_dict = {}
            for tr in trs[2:]:
                if tr.find('th') and tr.find('td'):
                    ths = tr.find_all('th')
                    tds = tr.find_all('td')
                    if len(ths) != len(tds):
                        logging.error(u'th size does not equal td size in table %s' % title)
                        return
                    else:
                        for i in range(len(ths)):
                            if self.get_raw_text_by_tag(ths[i]):
                                table_dict[self.get_raw_text_by_tag(ths[i])] = self.get_raw_text_by_tag(tds[i])
            sub_dict[title] = table_dict

            content_table = soup.find_all('table')[1:]
            for table in content_table:
                table_name = self.get_table_title(table)
                if table_name:
                    sub_dict[table_name] = self.parse_table(table, table_name, page)
        except Exception as e:
            logging.error(u'annual page: fail to get table data with exception %s' % e)
            raise e
        finally:
            return sub_dict
    # Parse the shareholder & capital contribution table (股东及出资信息(币种与注册资本一致))
    def parse_table_qygs_gudongchuzi(self, page):
        coms = re.findall(r'var investor.*?list.push\(investor\);', page, flags=re.DOTALL+re.MULTILINE)
        sub_item={}
        item = {}
        Item = []
        for comstr in coms:
            m_invstr = re.compile(r'investor.inv.*?;').search(comstr)
            if m_invstr:
                invstr = m_invstr.group()
                inv = re.compile(r'\".*?\"').search(invstr).group().strip('\"')
                # subscribed contributions (认缴)
                rjSubConAmlist = []
                count_rj = 0
                for itemstr in re.findall(r'invt.subConAm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    # the amount is a quoted number; parse it instead of eval-ing scraped text
                    subConAm = float(re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    count_rj += subConAm

                    rjSubConAmlist.append(subConAm)
                rjconDateList=[]
                for itemstr in  re.findall(r'invt.conDate.*?;', comstr, flags = re.DOTALL+ re.MULTILINE):
                    conDate = (re.compile(r"\'.*?\'").search(itemstr).group().strip("\'"))
                    rjconDateList.append(conDate)
                rjconFormList=[]
                for itemstr in re.findall(r'invt.conForm.*?;', comstr, flags = re.DOTALL+ re.MULTILINE):
                    conForm = (re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    rjconFormList.append(conForm)
                # paid-in contributions (实缴)
                sjAcConAm = []
                count_sj = 0
                for itemstr in re.findall(r'invtActl.acConAm.*?;', comstr, flags=re.DOTALL + re.MULTILINE):
                    # parse the quoted number instead of eval-ing scraped text
                    acConAm = float(re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    count_sj += acConAm
                    sjAcConAm.append(acConAm)
                sjconDateList = []
                for itemstr in re.findall(r'invtActl.conDate.*?;', comstr, flags = re.DOTALL+ re.MULTILINE):
                    conDate = (re.compile(r"\'.*?\'").search(itemstr).group().strip("\'"))
                    sjconDateList.append(conDate)
                sjconFormList = []
                for itemstr in re.findall(r'invtActl.conForm.*?;', comstr, flags = re.DOTALL+ re.MULTILINE):
                    conForm = (re.compile(r"\".*?\"").search(itemstr).group().strip('\"'))
                    sjconFormList.append(conForm)
                len_rj = len(rjSubConAmlist)
                len_sj = len(sjAcConAm)
                item = {}
                item_list = []
                item[u'股东'] = inv
                item[u'认缴额(万元)'] = count_rj
                item[u'实缴额(万元)'] = count_sj
                try:
                    maxRow = max( len_sj,  len_rj)
                    for i in xrange(maxRow):
                        sub_item={}
                        if i < len_rj:
                            sub_item[u'认缴出资方式'] = self.dicInvtType(rjconFormList[i])
                            sub_item[u'认缴出资额(万元)'] = rjSubConAmlist[i]
                            sub_item[u'认缴出资日期'] =  rjconDateList[i]
                        else:
                            sub_item[u'认缴出资方式'] =""
                            sub_item[u'认缴出资额(万元)'] =""
                            sub_item[u'认缴出资日期'] = ""
                        #item[u'认缴明细'] = sub_item
                        if i < len_sj:
                            sub_item[u'实缴出资方式'] = self.dicInvtType(sjconFormList[i])
                            sub_item[u'实缴出资额(万元)'] = sjAcConAm[i]
                            sub_item[u'实缴出资日期'] = sjconDateList[i]
                        else:
                            sub_item[u'实缴出资方式'] = ""
                            sub_item[u'实缴出资额(万元)'] = ""
                            sub_item[u'实缴出资日期'] = ""
                        #item[u'实缴明细'] = sub_item
                        item_list.append(sub_item)
                except Exception as e:
                    logging.error(u"exception : %s"%(type(e)))
                item[u'详情'] = item_list
                Item.append(item)
            else:
                logging.error(u"There is no company, continue!")
        return Item

    def parse_page(self, page, div_id='cont-r-b'):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}

        try:
            div = soup.find('div', attrs = {'id':div_id})
            if div:
                tables = div.find_all('table')
            else:
                tables = soup.find_all('table')
            #print table
            for table in tables:
                table_name = self.get_table_title(table)
                if table_name:
                    if table_name == u"股东及出资信息(币种与注册资本一致)":
                        page_data[table_name ] =self.parse_table_qygs_gudongchuzi(page)
                    else:
                        page_data[table_name] = self.parse_table(table, table_name, page)
        except Exception as e:
            logging.error(u'parse page failed, with exception %s' % e)
            raise e
        finally:
            return page_data

    def parse_table(self, bs_table, table_name, page):
        table_dict = None
        try:
            # tb_title = self.get_table_title(bs_table)
            # Awkward case: BeautifulSoup sometimes cannot find the tbody tag inside the
            # table tag even though it appears in the raw html, so we fall back to locating
            # the tbody from the whole page.
            logging.debug(u"parsing table %s", table_name)
            columns = self.get_columns_of_record_table(bs_table, page, table_name)
            #print columns
            tbody = None
            if len(bs_table.find_all('tbody'))>1:
                tbody = bs_table.find_all('tbody')[1]
            else:
                tbody = bs_table.find('tbody') or BeautifulSoup(page, 'html5lib').find('tbody')
            if columns:
                col_span = 0
                single_col = 0
                for col in columns:
                    if type(col[1]) == list:
                        col_span += len(col[1])
                    else:
                        single_col+=1
                        col_span += 1

                column_size = len(columns)
                item_array = []
                if not tbody:
                    records_tag = bs_table
                else:
                    records_tag = tbody
                item = None
                for tr in records_tag.find_all('tr'):
                    if tr.find_all('td') and len(tr.find_all('td', recursive=False)) % column_size == 0:
                        col_count = 0
                        item = {}
                        for td in tr.find_all('td',recursive=False):
                            if td.find('a'):
                                #try to retrieve detail link from page
                                next_url = self.get_detail_link(td.find('a'))
                                logging.info(u'crawl detail url: %s'% next_url)
                                if next_url:
                                    detail_page = self.crawl_page_by_url(next_url)
                                    #html_to_file("test.html", detail_page['page'])
                                    #print "table_name : "+ table_name
                                    if table_name == u'企业年报':
                                        #logging.debug(u"next_url = %s, table_name= %s\n", detail_page['url'], table_name)
                                        page_data = self.parse_ent_pub_annual_report_page(detail_page['page'])

                                        item[columns[col_count][0]] = page_data #this may be a detail page data
                                    else:
                                        page_data = self.parse_page(detail_page['page'])
                                        item[columns[col_count][0]] = page_data #this may be a detail page data
                                else:
                                    #item[columns[col_count]] = CrawlerUtils.get_raw_text_in_bstag(td)
                                    item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                            else:
                                item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                            col_count += 1
                            if col_count == column_size:
                                item_array.append(item.copy())
                                col_count = 0
                    # This case handles the ind-comm-pub-reg-shareholders details table,
                    # where one row carries all of the flattened sub-columns.
                    elif tr.find_all('td') and len(tr.find_all('td', recursive=False)) == col_span and col_span != column_size:
                        col_count = 0
                        sub_col_index = 0
                        item = {}
                        sub_item = {}
                        for td in tr.find_all('td',recursive=False):
                            if type(columns[col_count][1]) == list:
                                sub_key = columns[col_count][1][sub_col_index][1]
                                sub_item[sub_key] = self.get_raw_text_by_tag(td)
                                sub_col_index += 1
                                if sub_col_index == len(columns[col_count][1]):
                                    item[columns[col_count][0]] = sub_item.copy()
                                    sub_item = {}
                                    col_count += 1
                                    sub_col_index = 0
                            else:
                                item[columns[col_count][0]] = self.get_column_data(columns[col_count][1], td)
                                col_count += 1
                            if col_count == column_size:
                                item_array.append(item.copy())
                                col_count = 0
                table_dict = item_array
            else:
                table_dict = {}

                for tr in bs_table.find_all('tr'):
                    if tr.find('th') and tr.find('td'):
                        ths = tr.find_all('th')
                        tds = tr.find_all('td')
                        if len(ths) != len(tds):
                            logging.error(u'th size does not equal td size in table %s' % table_name)
                            return
                        else:
                            for i in range(len(ths)):
                                if self.get_raw_text_by_tag(ths[i]):
                                    table_dict[self.get_raw_text_by_tag(ths[i])] = self.get_raw_text_by_tag(tds[i])
        except Exception as e:
            logging.error(u'parse table %s failed with exception %s' % (table_name, type(e)))
            raise e
        finally:
            return table_dict


    def crawl_page_by_url(self, url):
        text, final_url = "", url
        try:
            r = self.requests.get(url)
            if r.status_code != 200:
                logging.error(u"Getting page by url:%s, return status %s\n" % (url, r.status_code))
            text = r.text
            # keep the final url, in case the page was reached via redirects
            final_url = r.url
        except Exception as e:
            logging.error(u"Can't get page by url:%s, exception is %s" % (url, type(e)))
        finally:
            return {'page': text, 'url': final_url}

    def crawl_page_by_url_post(self, url, data, headers=None):
        text, final_url = "", url
        try:
            if headers:
                self.requests.headers.update(headers)
            r = self.requests.post(url, data)
            if r.status_code != 200:
                logging.error(u"Getting page by url with post:%s, return status %s\n" % (url, r.status_code))
            text = r.text
            final_url = r.url
        except Exception as e:
            logging.error(u"Can't post page by url:%s, exception is %s" % (url, type(e)))
        finally:
            return {'page': text, 'url': final_url}

    def run(self, ent_num):
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        json_dict = {}
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'], urls['page_showinfo'], ent_num)
        data = self.crawl_page_main()
        json_dict[ent_num] = data
        #json_dump_to_file(self.json_restore_path , json_dict)
        #2016-2-16
        return json.dumps(json_dict)

    def work(self, ent_num= ""):

        # if not os.path.exists(self.html_restore_path):
        #     os.makedirs(self.html_restore_path)
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'], urls['checkcode'], urls['page_showinfo'], ent_num)
        data = self.crawl_page_main()
        json_dump_to_file('hebei_json.json', data)
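
# The Hebei parser above turns each html table into a list of dicts keyed by the <th>
# texts, recursing when a cell itself contains a table. Below is a simplified, standalone
# sketch of that idea for flat tables only (no sub-columns, nested tables or detail links);
# it is an illustration under those assumptions, not the parse_table implementation used
# above, and the sample html is made up.
from bs4 import BeautifulSoup


def parse_flat_table(html):
    soup = BeautifulSoup(html, "html5lib")
    table = soup.find("table")
    if table is None:
        return []
    # header texts become the dict keys
    headers = [th.get_text().strip() for th in table.find_all("th")]
    rows = []
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if not headers or len(tds) != len(headers):
            continue  # skip the header row and malformed rows
        rows.append(dict(zip(headers, [td.get_text().strip() for td in tds])))
    return rows


SAMPLE_HTML = u"""
<table>
  <tr><th>序号</th><th>股东</th></tr>
  <tr><td>1</td><td>某某公司</td></tr>
</table>
"""

if __name__ == '__main__':
    print(parse_flat_table(SAMPLE_HTML))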
Пример #24
0
class ZongjuCrawler(Crawler):
    """总局工商爬虫
    """

    code_cracker = CaptchaRecognition('zongju')
    # Mutex protecting writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    urls = {
        'host': 'http://qyxy.saic.gov.cn',
        'official_site': 'http://gsxt.saic.gov.cn/zjgs/',
        'get_checkcode': 'http://gsxt.saic.gov.cn/zjgs/captcha?preset=',
        'post_checkcode': 'http://gsxt.saic.gov.cn/zjgs/security/verify_captcha',
        # entry point that returns the enterprise list
        'get_info_entry': 'http://gsxt.saic.gov.cn/zjgs/search/ent_info_list',
        # enterprise info page; tab=1-4 selects the section (industrial & commercial publicity, enterprise publicity, ...)
        'open_info_entry': 'http://gsxt.saic.gov.cn/zjgs/notice/view?',
    }

    def __init__(self, json_restore_path=None):
        super(ZongjuCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        # Path where the html data is stored
        self.html_restore_path = self.json_restore_path + '/zongju/'
        # Path where the captcha image is stored
        self.ckcode_image_path = self.json_restore_path + '/zongju/ckcode.jpg'

        self.parser = ZongjuParser(self)
        self.proxies = get_proxy('beijing')
        self.timeout = (30, 20)

    def run(self, _ent):
        """爬取的主函数
        """

        # self.proxies = {'http':'http://123.121.30.123:8118'}
        if self.proxies:
            print self.proxies
            self.reqst.proxies = self.proxies
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        return Crawler.run(self, _ent)

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        count = 0
        next_url = self.urls['official_site']
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to get official site')
            return False
        if not self.parse_pre_check_page(resp.content):
            logging.error('failed to parse pre check page')
            return False

        while count < 30:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                continue
            post_data = {'captcha': ckcode[1], 'session.token': self.session_token}
            next_url = self.urls['post_checkcode']
            resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout, verify=False)
            if resp.status_code != 200:
                logging.error('failed to post checkcode to url %s, fail count = %d' % (next_url, count))
                continue

            logging.error('crack code = %s, %s, response =  %s' % (ckcode[0], ckcode[1], resp.content))
            if resp.content == '0':
                logging.error('crack checkcode failed!count = %d' % (count))
                continue

            next_url = self.urls['get_info_entry']
            post_data = {
                'searchType': '1',
                'captcha': ckcode[1],
                'session.token': self.session_token,
                'condition.keyword': self._ent
            }

            resp = self.reqst.post(next_url, data=post_data, timeout=self.timeout)

            if resp.status_code != 200:
                logging.error('failed to crawl url %s' % next_url)
                return False

            if self.parse_post_check_page(resp.content):
                return True
            logging.error('crack checkcode failed, total fail count = %d' % count)
            time.sleep(random.uniform(1, 3))

        return False

    @exe_time
    def crawl_ind_comm_pub_pages(self, *args, **kwargs):
        """爬取工商公示信息页面
        在总局的网站中,工商公示信息在一个页面中返回。页面中包含了多个表格,调用 Parser的 parse_ind_comm_page进行解析
        在 Parser的ind_comm_pub_page 中,访问并设置 crawler中的 json_dict。
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=01'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get ind comm pub info failed!')
            return False
        self.parser.parse_ind_comm_pub_pages(resp.content)

    @exe_time
    def crawl_ent_pub_pages(self, *args, **kwargs):
        """爬取企业公示信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=02'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get ent pub info failed!')
            return False
        self.parser.parse_ent_pub_pages(resp.content)

    @exe_time
    def crawl_other_dept_pub_pages(self, *args, **kwargs):
        """爬取其他部门公示信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=03'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get other dept pub info failed!')
            return False
        self.parser.parse_other_dept_pub_pages(resp.content)

    @exe_time
    def crawl_judical_assist_pub_pages(self, *args, **kwargs):
        """爬取司法协助信息页面
        """
        if not len(args): return
        url = args[0]
        m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', url)
        if m:
            self.uuid = m.group(1)
        next_url = self.urls['open_info_entry'] + 'uuid=' + self.uuid + '&tab=06'
        resp = self.reqst.get(next_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('get judical assist info failed!')
            return False
        self.parser.parse_judical_assist_pub_pages(resp.content)

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,获取必要的信息
        """
        soup = BeautifulSoup(page, 'html5lib')
        divs = soup.find_all('div', attrs={'class': 'list-item'})

        if divs:
            Ent = {}
            count = 0
            for div in divs:
                count += 1
                link = div.find('div', attrs={'class': 'link'})
                profile = div.find('div', attrs={'class': 'profile'})
                url = ""
                ent = ""
                if link and link.find('a') and link.find('a').has_attr('href'):
                    url = link.find('a')['href']
                if profile and profile.span:
                    ent = profile.span.get_text().strip()
                name = link.find('a').get_text().strip()
                if name == self._ent:
                    Ent.clear()
                    Ent[ent] = url
                    break
                if count == 3:
                    break
                Ent[ent] = url
            self.ents = Ent
            return True
        else:
            return False

        # div_tag = soup.find('div', attrs={'class': 'link'})
        # if not div_tag:
        #     return False
        # open_info_url = div_tag.find('a').get('href')
        # m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', open_info_url)
        # if m:
        #     self.uuid = m.group(1)
        #     return True
        # else:
        #     return False

    def parse_pre_check_page(self, page):
        """解析提交验证码之前的页面
        """
        soup = BeautifulSoup(page, 'html.parser')
        input_tag = soup.find('input', attrs={'type': 'hidden', 'name': 'session.token'})
        if input_tag:
            self.session_token = input_tag.get('value')
            return True
        return False

    def crawl_page_by_url(self, url):
        """通过url直接获取页面
        """
        resp = self.reqst.get(url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to crawl page by url %s' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        # if saveingtml:
        #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
        return page

    def crack_checkcode(self):
        """破解验证码"""
        checkcode_url = self.urls['get_checkcode'] + '&ra=' + str(random.random())
        ckcode = ('', '')
        resp = self.reqst.get(checkcode_url, timeout=self.timeout, verify=False)
        if resp.status_code != 200:
            logging.error('failed to get checkcode img')
            return ckcode
        page = resp.content

        time.sleep(random.uniform(1, 2))

        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(page)
        if not self.code_cracker:
            logging.error('invalid code cracker with ckcode= None')
            return ckcode
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.error('exception occurred when cracking checkcode')
            ckcode = ('', '')
            os.remove(self.ckcode_image_path)
        finally:
            pass
        self.write_file_mutex.release()
        return ckcode
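
# ZongjuCrawler extracts a uuid from the enterprise link returned by the search and then
# opens the same notice/view endpoint with tab=01/02/03/06 for the different publicity
# sections. Below is a standalone sketch of that url construction, reusing the regex and
# the endpoint from the class above; the sample link is made up for illustration.
import re

OPEN_INFO_ENTRY = 'http://gsxt.saic.gov.cn/zjgs/notice/view?'


def build_tab_urls(result_link):
    m = re.search(r'[/\w\.\?]+=([\w\.=]+)&.+', result_link)
    if not m:
        return []
    uuid = m.group(1)
    # 01 = industrial & commercial publicity, 02 = enterprise publicity,
    # 03 = other departments publicity, 06 = judicial assistance
    return [OPEN_INFO_ENTRY + 'uuid=' + uuid + '&tab=' + tab
            for tab in ('01', '02', '03', '06')]


if __name__ == '__main__':
    sample = 'http://gsxt.saic.gov.cn/zjgs/notice/view?uuid=ABC123&tab=01'
    for tab_url in build_tab_urls(sample):
        print(tab_url)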
Пример #25
0
class HainanCrawler(object):

    # Mutex protecting writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path):
        self.CR = CaptchaRecognition("guangdong")
        self.requests = requests.Session()
        self.requests.headers.update(headers)
        self.ents = []
        self.main_host = ""
        self.json_dict = {}
        self.json_restore_path = json_restore_path
        self.html_restore_path = settings.json_restore_path + '/hainan/'
        # Path where the captcha image is stored
        self.path_captcha = settings.json_restore_path + '/hainan/ckcode.png'

    # Parse the result page and collect the list of matched enterprises
    def analyze_showInfo(self, page):
        if page is None:
            logging.error(u"Getting Page ShowInfo failed\n")
            return
        Ent = []
        soup = BeautifulSoup(page, "html5lib")
        divs = soup.find_all("div", {"class": "list"})
        if divs:
            for div in divs:
                if div.find('a') and div.find('a').has_attr('href'):
                    Ent.append(div.find('a')['href'])
        else:
            return False
        self.ents = Ent
        return True

    # Crack the captcha page
    def crawl_page_captcha(self,
                           url_search,
                           url_Captcha,
                           url_CheckCode,
                           url_showInfo,
                           textfield='460000000265072'):
        r = self.requests.get(url_search)
        if r.status_code != 200:
            logging.error(
                u"Something wrong when getting the url:%s , status_code=%d",
                url_search, r.status_code)
            return
        count = 0
        while True:
            count += 1
            r = self.requests.get(url_Captcha)
            if r.status_code != 200:
                logging.error(
                    u"Something wrong when getting the Captcha url:%s , status_code=%d",
                    url_Captcha, r.status_code)
                continue
            if self.save_captcha(r.content):
                result = self.crack_captcha()
                logging.debug(u"cracked captcha result: %s", result)
                datas = {
                    'textfield': textfield,
                    'code': result,
                }
                response = json.loads(
                    self.crawl_page_by_url_post(url_CheckCode, datas)['page'])
                # The json returned looks like: {u'flag': u'1', u'textfield': u'H+kiIP4DWBtMJPckUI3U3Q=='}
                if response['flag'] == "1":
                    datas_showInfo = {
                        'textfield': response['textfield'],
                        'code': result
                    }
                    page_showInfo = self.crawl_page_by_url_post(
                        url_showInfo, datas_showInfo)['page']
                    if self.analyze_showInfo(page_showInfo):
                        break

                else:
                    logging.debug(u"crack Captcha failed, the %d time(s)",
                                  count)
                    if count > 40:
                        break
        return

    # Recognize the captcha image and return the result
    def crack_captcha(self):
        if os.path.exists(self.path_captcha) is False:
            logging.error(u"Captcha path is not found\n")
            return
        result = self.CR.predict_result(self.path_captcha)
        return result[1]

    # Save the captcha image to disk
    def save_captcha(self, Captcha):
        url_Captcha = self.path_captcha
        if Captcha is None:
            logging.error(u"Can not store Captcha: None\n")
            return False
        self.write_file_mutex.acquire()
        try:
            # open in binary mode; the captcha is raw image bytes
            with open(url_Captcha, 'wb') as f:
                f.write(Captcha)
        except IOError:
            logging.debug("%s can not be written", url_Captcha)
        finally:
            self.write_file_mutex.release()
        return True

    def parse_page_data_2(self, page):
        data = {
            "aiccipsUrl": "",
            "entNo": "",
            "entType": "",
            "regOrg": "",
        }
        try:
            soup = BeautifulSoup(page, "html5lib")
            data['aiccipsUrl'] = soup.find("input",
                                           {"id": "aiccipsUrl"})['value']
            data['entNo'] = soup.find("input", {"id": "entNo"})['value']
            data['entType'] = soup.find(
                "input", {"id": "entType"})['value'].strip()  #+"++"
            data['regOrg'] = soup.find("input", {"id": "regOrg"})['value']

        except Exception as e:
            logging.error(u"parse page failed in function parse_page_data_2\n")
            raise e
        finally:
            return data

    def crawl_page_main(self):
        sub_json_dict = {}
        if not self.ents:
            logging.error(u"Get no search result\n")
        try:
            for ent in self.ents:
                m = re.match('http', ent)
                if m is None:
                    ent = urls['host'] + ent[3:]
                logging.debug(u"ent url:%s\n" % ent)
                url = ent
                page_entInfo = self.crawl_page_by_url(url)['page']
                post_data = self.parse_page_data_2(page_entInfo)
                sub_json_dict.update(
                    self.crawl_ind_comm_pub_pages(url, post_data))
                url = "http://aic.hainan.gov.cn:1888/aiccips/BusinessAnnals/BusinessAnnalsList.html"
                sub_json_dict.update(self.crawl_ent_pub_pages(url, post_data))
                url = "http://aic.hainan.gov.cn:1888/aiccips/OtherPublicity/environmentalProtection.html"
                sub_json_dict.update(
                    self.crawl_other_dept_pub_pages(url, post_data))
                url = "http://aic.hainan.gov.cn:1888/aiccips/judiciaryAssist/judiciaryAssistInit.html"
                sub_json_dict.update(
                    self.crawl_judical_assist_pub_pages(url, post_data))

        except Exception as e:
            logging.error(
                u"An error occurred when getting the main page, error: %s" %
                type(e))
            raise e
        finally:
            return sub_json_dict

    # Crawl the industrial & commercial publicity info pages
    # (a standalone sketch of the hidden-input post_data flow follows this class)
    def crawl_ind_comm_pub_pages(self, url, post_data={}):
        sub_json_dict = {}
        try:
            tabs = (
                'entInfo',  # registration info
                'curStoPleInfo',  # equity pledge
                'entCheckInfo',  # filing info
                'pleInfo',  # movable property mortgage registration
                'cipPenaltyInfo',  # administrative penalty
                'cipUnuDirInfo',  # abnormal operation
                'cipBlackInfo',  # serious violation of law
                'cipSpotCheInfo',  # spot check
            )

            div_names = (
                'jibenxinxi',
                'guquanchuzhi',
                'beian',
                'dongchandiya',
                'xingzhengchufa',
                'jingyingyichang',
                'yanzhongweifa',
                'chouchajiancha',
            )
            for tab, div_name in zip(tabs, div_names):
                #url = "http://http://aic.hainan.gov.cn:1888/aiccips/GSpublicity/GSpublicityList.html?service=" + tab
                url = urls['prefix_GSpublicity'] + tab
                page = self.crawl_page_by_url_post(url, post_data)['page']
                if div_name == 'jibenxinxi':
                    dict_jiben = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_reg_modify'] = dict_jiben[
                        u'变更信息'] if dict_jiben.has_key(u"变更信息") else {}
                    sub_json_dict['ind_comm_pub_reg_basic'] = dict_jiben[
                        u'基本信息'] if dict_jiben.has_key(u"基本信息") else []
                    sub_json_dict['ind_comm_pub_reg_shareholder'] = dict_jiben[
                        u'股东信息'] if dict_jiben.has_key(u"股东信息") else []
                elif div_name == 'beian':
                    dict_beian = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_arch_key_persons'] = dict_beian[
                        u'主要人员信息'] if dict_beian.has_key(u"主要人员信息") else []
                    sub_json_dict['ind_comm_pub_arch_branch'] = dict_beian[
                        u'分支机构信息'] if dict_beian.has_key(u"分支机构信息") else []
                    sub_json_dict[
                        'ind_comm_pub_arch_liquidation'] = dict_beian[
                            u"清算信息"] if dict_beian.has_key(u'清算信息') else []
                elif div_name == 'guquanchuzhi':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_equity_ownership_reg'] = dj[
                        u'股权出质登记信息'] if dj.has_key(u'股权出质登记信息') else []
                elif div_name == 'dongchandiya':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_movable_property_reg'] = dj[
                        u'动产抵押信息'] if dj.has_key(u'动产抵押信息') else []
                elif div_name == 'xingzhengchufa':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_administration_sanction'] = dj[
                        u'行政处罚信息'] if dj.has_key(u'行政处罚信息') else []
                elif div_name == 'jingyingyichang':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_business_exception'] = dj[
                        u'经营异常信息'] if dj.has_key(u'经营异常信息') else []
                elif div_name == 'yanzhongweifa':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_serious_violate_law'] = dj[
                        u'严重违法信息'] if dj.has_key(u'严重违法信息') else []
                elif div_name == 'chouchajiancha':
                    dj = self.parse_page_2(page, div_name, post_data)
                    sub_json_dict['ind_comm_pub_spot_check'] = dj[
                        u'抽查检查信息'] if dj.has_key(u'抽查检查信息') else []

        except Exception as e:
            logging.debug(u"An error ocurred in crawl_ind_comm_pub_pages: %s" %
                          (type(e)))
            raise e
        finally:
            return sub_json_dict

    # Crawl the enterprise publicity info pages
    def crawl_ent_pub_pages(self, url, post_data={}):
        sub_json_dict = {}
        try:
            page = self.crawl_page_by_url_post(
                urls['host'] + "/BusinessAnnals/BusinessAnnalsList.html",
                post_data)['page']
            p = self.parse_page_2(page, 'qiyenianbao', post_data)
            sub_json_dict['ent_pub_ent_annual_report'] = p[
                u'qiyenianbao'] if p.has_key(u'qiyenianbao') else []

            page = self.crawl_page_by_url_post(
                urls['host'] + "/AppPerInformation.html", post_data)['page']
            p = self.parse_page_2(page, 'appPer', post_data)
            sub_json_dict['ent_pub_administration_license'] = p[
                u'行政许可情况'] if p.has_key(u'行政许可情况') else []

            page = self.crawl_page_by_url_post(
                urls['host'] + "/XZPunishmentMsg.html", post_data)['page']
            p = self.parse_page_2(page, 'xzpun', post_data)
            sub_json_dict['ent_pub_administration_sanction'] = p[
                u'行政处罚情况'] if p.has_key(u'行政处罚情况') else []

            page = self.crawl_page_by_url_post(
                urls['host'] + "/ContributionCapitalMsg.html",
                post_data)['page']
            p = self.parse_page_2(page, 'sifapanding', post_data)
            sub_json_dict['ent_pub_shareholder_capital_contribution'] = p[
                u'股东及出资信息'] if p.has_key(u'股东及出资信息') else []
            sub_json_dict['ent_pub_reg_modify'] = p[u'变更信息'] if p.has_key(
                u'变更信息') else []

            page = self.crawl_page_by_url_post(
                urls['host'] + "/GDGQTransferMsg/shareholderTransferMsg.html",
                post_data)['page']
            p = self.parse_page_2(page, 'guquanbiangeng', post_data)
            sub_json_dict['ent_pub_equity_change'] = p[u'股权变更信息'] if p.has_key(
                u'股权变更信息') else []

            page = self.crawl_page_by_url_post(
                urls['host'] + "/intPropertyMsg.html", post_data)['page']
            p = self.parse_page_2(page, 'inproper', post_data)
            sub_json_dict['ent_pub_knowledge_property'] = p[
                u'知识产权出质登记信息'] if p.has_key(u'知识产权出质登记信息') else []
        except Exception as e:
            logging.debug(u"An error ocurred in crawl_ent_pub_pages: %s" %
                          (type(e)))
            raise e
        finally:
            return sub_json_dict
        #json_dump_to_file("json_dict.json", self.json_dict)

    #爬取 其他部门公示信息 页面
    def crawl_other_dept_pub_pages(self, url, post_data={}):
        sub_json_dict = {}
        try:
            page = self.crawl_page_by_url_post(
                urls['host'] + "/OtherPublicity/environmentalProtection.html",
                post_data)['page']
            xk = self.parse_page_2(page, "xzxk", post_data)
            sub_json_dict["other_dept_pub_administration_license"] = xk[
                u'行政许可信息'] if xk.has_key(u'行政许可信息') else []
            page = self.crawl_page_by_url_post(
                urls['host'] + "/OtherPublicity/environmentalProtection.html",
                post_data)['page']
            xk = self.parse_page_2(page, "czcf", post_data)
            sub_json_dict["other_dept_pub_administration_sanction"] = xk[
                u'行政处罚信息'] if xk.has_key(u'行政处罚信息') else []  # 行政处罚信息
        except Exception as e:
            logging.debug(
                u"An error occurred in crawl_other_dept_pub_pages: %s" %
                (type(e)))
            raise e
        finally:
            return sub_json_dict

    #judical assist pub informations
    def crawl_judical_assist_pub_pages(self, url, post_data={}):
        sub_json_dict = {}
        try:
            page = self.crawl_page_by_url_post(
                urls['host'] + "/judiciaryAssist/judiciaryAssistInit.html",
                post_data)['page']
            xz = self.parse_page_2(page, 'guquandongjie', post_data)
            sub_json_dict['judical_assist_pub_equity_freeze'] = xz[
                u'司法股权冻结信息'] if xz.has_key(u'司法股权冻结信息') else []
            page = self.crawl_page_by_url_post(
                urls['host'] + "/sfGuQuanChange/guQuanChange.html",
                post_data)['page']
            xz = self.parse_page_2(page, 'gudongbiangeng', post_data)
            sub_json_dict['judical_assist_pub_shareholder_modify'] = xz[
                u'司法股东变更登记信息'] if xz.has_key(u'司法股东变更登记信息') else []
        except Exception as e:
            logging.debug(
                u"An error occurred in crawl_judical_assist_pub_pages: %s" %
                (type(e)))
            raise e
        finally:
            return sub_json_dict

    def get_raw_text_by_tag(self, tag):
        return tag.get_text().strip()

    #获得表头
    def get_table_title(self, table_tag):
        if table_tag.find('tr'):
            if table_tag.find('tr').find_all('th'):

                if len(table_tag.find('tr').find_all('th')) > 1:
                    return None
                # 处理 <th> aa<span> bb</span> </th>
                if table_tag.find('tr').th.string is None and len(
                        table_tag.find('tr').th.contents) > 1:
                    # 处理 <th>   <span> bb</span> </th>  包含空格的
                    if (table_tag.find('tr').th.contents[0]).strip():
                        return (table_tag.find('tr').th.contents[0])
                # <th><span> bb</span> </th>
                return self.get_raw_text_by_tag(table_tag.find('tr').th)
            elif table_tag.find('tr').find('td'):
                return self.get_raw_text_by_tag(table_tag.find('tr').td)
        return None

    def sub_column_count(self, th_tag):
        if th_tag.has_attr('colspan') and int(th_tag.get('colspan')) > 1:
            return int(th_tag.get('colspan'))
        return 0

    def get_sub_columns(self, tr_tag, index, count):
        columns = []
        for i in range(index, index + count):
            th = tr_tag.find_all('th')[i]
            if not self.sub_column_count(th):
                columns.append((self.get_raw_text_by_tag(th),
                                self.get_raw_text_by_tag(th)))
            else:
                #if has sub-sub columns
                columns.append(
                    (self.get_raw_text_by_tag(th),
                     self.get_sub_columns(tr_tag.nextSibling.nextSibling, 0,
                                          self.sub_column_count(th))))
        return columns
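    # Note: the columns produced here (and by get_record_table_columns_by_tr
    # below) form a nested list of (name, value) pairs, roughly like
    # (placeholder column names):
    #   [(u'colA', u'colA'),
    #    (u'colB', [(u'sub1', u'sub1'), (u'sub2', u'sub2')])]
    # get_column_data walks this nested structure recursively.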

    #get column data recursively, use recursive because there may be table in table
    def get_column_data(self, columns, td_tag):
        if type(columns) == list:
            data = {}
            multi_col_tag = td_tag
            if td_tag.find('table'):
                multi_col_tag = td_tag.find('table').find('tr')
            if not multi_col_tag:
                logging.error('invalid multi_col_tag, multi_col_tag = %s',
                              multi_col_tag)
                return data

            if len(columns) != len(
                    multi_col_tag.find_all('td', recursive=False)):
                logging.error(
                    'column head size != column data size, columns head = %s, columns data = %s'
                    % (columns, multi_col_tag.contents))
                return data

            for id, col in enumerate(columns):
                data[col[0]] = self.get_column_data(
                    col[1],
                    multi_col_tag.find_all('td', recursive=False)[id])
            return data
        else:
            return self.get_raw_text_by_tag(td_tag)

    def get_detail_link(self, bs4_tag):
        # prefer an explicit href; otherwise fall back to the onclick handler
        if bs4_tag.get('href') and bs4_tag['href'] != '#':
            pattern = re.compile(r'http')
            if pattern.search(bs4_tag['href']):
                return bs4_tag['href']
            return urls['prefix_url'] + bs4_tag['href']
        elif bs4_tag.get('onclick'):
            return self.get_detail_link_onclick(bs4_tag)
        return None

    def get_detail_link_onclick(self, bs4_tag):
        re1 = '.*?'  # Non-greedy match on filler
        re2 = '(\\\'.*?\\\')'  # Single Quote String 1

        rg = re.compile(re1 + re2, re.IGNORECASE | re.DOTALL)
        m = rg.search(bs4_tag['onclick'])
        url = ""
        if m:
            strng1 = m.group(1)
            url = strng1.strip("\'")
        return url
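    # For example (hypothetical markup): given a tag such as
    #   <a href="#" onclick="doView('/notice/view?uuid=123')">详情</a>
    # the regex above captures the first single-quoted argument, so this
    # method would return '/notice/view?uuid=123'.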

    def get_columns_of_record_table(self, bs_table, page, table_name):
        tbody = None
        if len(bs_table.find_all('tbody')) > 1:
            tbody = bs_table.find_all('tbody')[1]
        else:
            tbody = bs_table.find('tbody') or BeautifulSoup(
                page, 'html5lib').find('tbody')

        tr = None
        if tbody:
            if len(tbody.find_all('tr')) <= 1:
                #tr = tbody.find('tr')
                tr = None
            else:
                tr = tbody.find_all('tr')[1]
                if not tr.find('th'):
                    tr = tbody.find_all('tr')[0]
                elif tr.find('td'):
                    tr = None
        else:
            if len(bs_table.find_all('tr')) <= 1:
                return None
            elif bs_table.find_all('tr')[0].find(
                    'th'
            ) and not bs_table.find_all('tr')[0].find('td') and len(
                    bs_table.find_all('tr')[0].find_all('th')) > 1:
                tr = bs_table.find_all('tr')[0]
            elif bs_table.find_all('tr')[1].find(
                    'th'
            ) and not bs_table.find_all('tr')[1].find('td') and len(
                    bs_table.find_all('tr')[1].find_all('th')) > 1:
                tr = bs_table.find_all('tr')[1]
        ret_val = self.get_record_table_columns_by_tr(tr, table_name)
        return ret_val

    def get_record_table_columns_by_tr(self, tr_tag, table_name):
        columns = []
        if not tr_tag:
            return columns
        try:
            sub_col_index = 0
            if len(tr_tag.find_all('th')) == 0:
                logging.error(u"The table %s has no columns" % table_name)
                return columns
            #排除仅仅出现一列重复的名字
            count = 0
            for i, th in enumerate(tr_tag.find_all('th')):
                col_name = self.get_raw_text_by_tag(th)
                #if col_name and ((col_name, col_name) not in columns) :

                if col_name:
                    if ((col_name, col_name) in columns):
                        col_name = col_name + '_'
                        count += 1
                    if not self.sub_column_count(th):
                        columns.append((col_name, col_name))
                    else:  #has sub_columns
                        columns.append((col_name,
                                        self.get_sub_columns(
                                            tr_tag.nextSibling.nextSibling,
                                            sub_col_index,
                                            self.sub_column_count(th))))
                        sub_col_index += self.sub_column_count(th)
            if count == len(tr_tag.find_all('th')) / 2:
                columns = columns[:len(columns) / 2]
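                # i.e. when every column name appeared exactly twice, the header
                # row was duplicated side by side, so keep only the first half
                # of the collected columns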

        except Exception as e:
            logging.error(
                u'exception occurred in get_record_table_columns_by_tr, except_type = %s, table_name = %s'
                % (type(e), table_name))
        finally:
            return columns

    # 如果是第二种情况: http://gsxt.gdgs.gov.cn/aiccips/
    def parse_ent_pub_annual_report_page_2(self, base_page, page_type):

        page_data = {}
        soup = BeautifulSoup(base_page, 'html5lib')
        if soup.body.find('table'):
            try:
                base_table = soup.body.find('table')
                table_name = u'企业基本信息'  #self.get_table_title(base_table)
                #这里需要连续两个nextSibling,一个nextSibling会返回空
                detail_base_table = base_table.nextSibling.nextSibling
                if detail_base_table.name == 'table':
                    page_data[table_name] = self.parse_table_2(
                        detail_base_table)
                    pass
                else:
                    logging.error(
                        u"Can't find details of base information for annual report"
                    )
            except Exception as e:
                logging.error(u"fail to get table name with exception %s" %
                              (type(e)))
            try:
                table = detail_base_table.nextSibling.nextSibling
                while table:
                    if table.name == 'table':
                        table_name = self.get_table_title(table)
                        page_data[table_name] = []
                        columns = self.get_columns_of_record_table(
                            table, base_page, table_name)
                        page_data[table_name] = self.parse_table_2(
                            table, columns, {}, table_name)
                    table = table.nextSibling
            except Exception as e:
                logging.error(
                    u"fail to parse the rest tables with exception %s" %
                    (type(e)))
        else:
            pass
        return page_data

    def get_particular_table(self, table, page):
        """ 获取 股东及出资信息的表格,按照指定格式输出
        """
        table_dict = {}
        sub_dict = {}
        table_list = []
        try:
            trs = table.find_all('tr')
            for tr in trs:
                if tr.find('td'):
                    tds = tr.find_all('td')
                    if len(tds) <= 1:
                        continue
                    # use a fresh dict per row so appended rows do not all
                    # reference the same object
                    table_dict = {}
                    table_dict[u'股东'] = self.get_raw_text_by_tag(tds[0])
                    table_dict[u'股东类型'] = self.get_raw_text_by_tag(tds[1])
                    sub_dict = {}
                    sub_dict[u'认缴出资额(万元)'] = self.get_raw_text_by_tag(tds[2])
                    sub_dict[u'认缴出资方式'] = self.get_raw_text_by_tag(tds[3])
                    sub_dict[u'认缴出资日期'] = self.get_raw_text_by_tag(tds[4])
                    table_dict[u'认缴明细'] = sub_dict
                    sub_dict = {}
                    sub_dict[u'实缴出资额(万元)'] = self.get_raw_text_by_tag(tds[5])
                    sub_dict[u'实缴出资方式'] = self.get_raw_text_by_tag(tds[6])
                    sub_dict[u'实缴出资时间'] = self.get_raw_text_by_tag(tds[7])
                    table_dict[u'实缴明细'] = sub_dict

                    table_dict[u'实缴额(万元)'] = self.get_raw_text_by_tag(tds[5])
                    table_dict[u'认缴额(万元)'] = self.get_raw_text_by_tag(tds[2])
                    table_list.append(table_dict)
        except Exception as e:
            logging.error(u'parse 股东及出资信息 table failed! : %s' % e)
        return table_list
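    # Each element appended to table_list above therefore looks roughly like
    # (values are placeholders):
    #   {u'股东': ..., u'股东类型': ...,
    #    u'认缴明细': {u'认缴出资额(万元)': ..., u'认缴出资方式': ..., u'认缴出资日期': ...},
    #    u'实缴明细': {u'实缴出资额(万元)': ..., u'实缴出资方式': ..., u'实缴出资时间': ...},
    #    u'认缴额(万元)': ..., u'实缴额(万元)': ...}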

    def parse_page(self, page, div_id='jibenxinxi'):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}
        try:
            div = soup.find('div', attrs={'id': div_id})
            if div:
                tables = div.find_all('table')
            else:
                tables = soup.find_all('table')
            #print table
            for table in tables:
                table_name = self.get_table_title(table)
                if table_name:
                    if table_name == u"股东及出资信息":
                        page_data[table_name] = self.get_particular_table(
                            table, page)
                    else:
                        page_data[table_name] = self.parse_table(
                            table, table_name, page)
        except Exception as e:
            logging.error(u'parse page failed, with exception %s' % e)
            raise e
        finally:
            return page_data

    def parse_page_2(self, page, div_id, post_data={}):
        soup = BeautifulSoup(page, 'html5lib')
        page_data = {}
        if soup.body:
            if soup.body.table:
                try:
                    divs = soup.body.find('div', {"id": div_id})
                    table = None
                    if not divs:
                        table = soup.body.find('table')
                    else:
                        table = divs.find('table')
                    #print table
                    table_name = ""
                    columns = []
                    while table:
                        if table.name == 'table':
                            table_name = self.get_table_title(table)
                            if table_name is None:
                                table_name = div_id
                            page_data[table_name] = []
                            columns = self.get_columns_of_record_table(
                                table, page, table_name)
                            result = self.parse_table_2(
                                table, columns, post_data, table_name)
                            if not columns and not result:
                                del page_data[table_name]
                            else:
                                page_data[table_name] = result

                        elif table.name == 'div':
                            if not columns:
                                logging.error(
                                    u"Cannot find columns when parsing page_2, table: %s"
                                    % div_id)
                                break
                            page_data[table_name] = self.parse_table_2(
                                table, columns, post_data, table_name)
                            columns = []
                        table = table.nextSibling

                except Exception as e:
                    logging.error(u'parse failed, with exception %s' % e)
                    raise e

                finally:
                    pass
        return page_data

    def parse_table_2(self, bs_table, columns=[], post_data={}, table_name=""):
        table_dict = None
        try:
            # the full page is not available in this scope, so only look for
            # a tbody inside bs_table itself
            tbody = bs_table.find('tbody')
            if columns:
                col_span = 0
                for col in columns:
                    if type(col[1]) == list:
                        col_span += len(col[1])
                    else:
                        col_span += 1

                column_size = len(columns)
                item_array = []
                # <div> <table>数据</table><table>下一页</table> </div>
                tables = bs_table.find_all('table')
                if len(tables) == 2 and tables[1].find('a'):
                    # 获取下一页的url
                    clickstr = tables[1].find('a')['onclick']

                    re1 = '.*?'  # Non-greedy match on filler
                    re2 = '\\\'.*?\\\''  # Uninteresting: string
                    re3 = '.*?'  # Non-greedy match on filler
                    re4 = '(\\\'.*?\\\')'  # Single Quote String 1
                    re5 = '.*?'  # Non-greedy match on filler
                    re6 = '(\\\'.*?\\\')'  # Single Quote String 2

                    rg = re.compile(re1 + re2 + re3 + re4 + re5 + re6,
                                    re.IGNORECASE | re.DOTALL)
                    m = rg.search(clickstr)
                    url = ""
                    if m:
                        string1 = m.group(1)
                        string2 = m.group(2)
                        url = string1.strip('\'') + string2.strip('\'')
                        logging.debug(u"url = %s\n" % url)
                    data = {
                        "pageNo": 2,
                        "entNo": post_data["entNo"].encode('utf-8'),
                        "regOrg": post_data["regOrg"],
                        "entType": post_data["entType"].encode('utf-8'),
                    }
                    res = self.crawl_page_by_url_post(url, data)
                    #print res['page']
                    if table_name == u"变更信息":
                        # chaToPage
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [
                                model['altFiledName'], model['altBe'],
                                model['altAf'], model['altDate']
                            ]
                            item_array.append(dict(zip(titles, data)))
                    elif table_name == u"主要人员信息":
                        # vipToPage
                        d = json.loads(res['page'], encoding="utf-8")
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [i + 1, model['name'], model['position']]
                            item_array.append(dict(zip(titles, data)))

                    elif table_name == u"分支机构信息":
                        #braToPage
                        #print u"分支机构"
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [
                                i + 1, model['regNO'],
                                model['brName'].encode('utf8').decode('utf8'),
                                model['regOrg'].encode('utf8')
                            ]
                            item_array.append(dict(zip(titles, data)))

                    elif table_name == u"股东信息":
                        #print "股东信息"
                        d = json.loads(res['page'])
                        titles = [column[0] for column in columns]
                        for i, model in enumerate(d['list']):
                            data = [
                                model['invType'], model['inv'],
                                model['certName'], model['certNo']
                            ]
                            item_array.append(dict(zip(titles, data)))
                        pass

                    table_dict = item_array

                else:

                    if not tbody:
                        records_tag = tables[0]
                    else:
                        records_tag = tbody
                    for tr in records_tag.find_all('tr'):
                        if tr.find('td') and len(
                                tr.find_all(
                                    'td', recursive=False)) % column_size == 0:
                            col_count = 0
                            item = {}
                            print "table_name=%s" % table_name
                            for td in tr.find_all('td', recursive=False):
                                if td.find('a'):
                                    next_url = self.get_detail_link(
                                        td.find('a'))
                                    logging.debug(u"next_url = %s" % next_url)
                                    if next_url and re.match(r"http", next_url):
                                        detail_page = self.crawl_page_by_url(
                                            next_url)
                                        #html_to_file("next.html", detail_page['page'])
                                        if table_name == u'qiyenianbao':
                                            print "in table_name"
                                            page_data = self.parse_ent_pub_annual_report_page_2(
                                                detail_page['page'],
                                                table_name + '_detail')
                                            item[columns[col_count]
                                                 [0]] = self.get_column_data(
                                                     columns[col_count][1], td)
                                            item[u'详情'] = page_data
                                        else:
                                            page_data = self.parse_page(
                                                detail_page['page'],
                                                table_name + '_detail')
                                            item[columns[col_count][
                                                0]] = page_data  #this may be a detail page data
                                    else:
                                        item[columns[col_count]
                                             [0]] = self.get_column_data(
                                                 columns[col_count][1], td)
                                else:
                                    item[columns[col_count]
                                         [0]] = self.get_column_data(
                                             columns[col_count][1], td)
                                col_count += 1
                                if col_count == column_size:
                                    item_array.append(item.copy())
                                    col_count = 0
                        #this case is for the ind-comm-pub-reg-shareholders----details'table
                        elif tr.find('td') and len(
                                tr.find_all('td', recursive=False)
                        ) == col_span and col_span != column_size:
                            col_count = 0
                            sub_col_index = 0
                            item = {}
                            sub_item = {}
                            for td in tr.find_all('td', recursive=False):
                                if td.find('a'):
                                    #try to retrieve detail link from page
                                    next_url = self.get_detail_link(
                                        td.find('a'))
                                    #has detail link
                                    if next_url:
                                        detail_page = self.crawl_page_by_url(
                                            next_url)
                                        if table_name == 'qiyenianbao':
                                            page_data = self.parse_ent_pub_annual_report_page_2(
                                                detail_page['page'],
                                                table_name + '_detail')
                                            item[columns[col_count]
                                                 [0]] = self.get_column_data(
                                                     columns[col_count][1], td)
                                            item[u'详情'] = page_data
                                        else:
                                            page_data = self.parse_page(
                                                detail_page['page'],
                                                table_name + '_detail')
                                            item[columns[col_count][
                                                0]] = page_data  #this may be a detail page data
                                    else:
                                        item[columns[col_count]
                                             [0]] = self.get_column_data(
                                                 columns[col_count][1], td)
                                else:
                                    if type(columns[col_count][1]) == list:
                                        sub_key = columns[col_count][1][
                                            sub_col_index][1]
                                        sub_item[
                                            sub_key] = self.get_raw_text_by_tag(
                                                td)
                                        sub_col_index += 1
                                        if sub_col_index == len(
                                                columns[col_count][1]):
                                            item[columns[col_count]
                                                 [0]] = sub_item.copy()
                                            sub_item = {}
                                            col_count += 1
                                            sub_col_index = 0
                                    else:
                                        item[columns[col_count]
                                             [0]] = self.get_column_data(
                                                 columns[col_count][1], td)
                                        col_count += 1
                                if col_count == column_size:
                                    item_array.append(item.copy())
                                    col_count = 0

                    table_dict = item_array
            else:
                table_dict = {}
                for tr in bs_table.find_all('tr'):
                    if tr.find('th') and tr.find('td'):
                        ths = tr.find_all('th')
                        tds = tr.find_all('td')
                        if len(ths) != len(tds):
                            logging.error(
                                u'th count does not match td count in table %s'
                                % table_name)
                            return
                        else:
                            for i in range(len(ths)):
                                if self.get_raw_text_by_tag(ths[i]):
                                    table_dict[self.get_raw_text_by_tag(
                                        ths[i])] = self.get_raw_text_by_tag(
                                            tds[i])
        except Exception as e:
            logging.error(u'parse table %s failed with exception %s' %
                          (table_name, type(e)))
            raise e
        finally:
            return table_dict

    def crawl_page_by_url(self, url, header={}):
        self.requests.headers.update(header)
        r = self.requests.get(url)
        if r.status_code != 200:
            logging.error(u"Failed to get page by url: %s, status code: %s" %
                          (url, r.status_code))
        # 为了防止页面间接跳转,获取最终目标url
        return {'page': r.text, 'url': r.url}

    def crawl_page_by_url_post(self, url, datas, header={}):
        self.requests.headers.update(header)
        r = self.requests.post(url, data=datas)
        if r.status_code != 200:
            logging.error(
                u"Failed to get page by url with POST: %s, status code: %s" %
                (url, r.status_code))
        return {'page': r.text, 'url': r.url}

    # main function
    def run(self, ent_num):
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)
        json_dict = {}
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'],
                                urls['checkcode'], urls['page_showinfo'],
                                ent_num)
        data = self.crawl_page_main()
        json_dict[ent_num] = data
        #json_dump_to_file(self.json_restore_path , json_dict)
        # 2016-2-16
        return json.dumps(json_dict)

    def work(self, ent_num):
        # if not os.path.exists(self.html_restore_path):
        #     os.makedirs(self.html_restore_path)
        self.crawl_page_captcha(urls['page_search'], urls['page_Captcha'],
                                urls['checkcode'], urls['page_showinfo'],
                                ent_num)
        data = self.crawl_page_main()
        json_dump_to_file('hainan.json', data)
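
A side note on the repeated "value if d.has_key(key) else []" guards used throughout the crawl_*_pages methods above: on Python 2, dict.get expresses the same fallback in one call. A minimal illustrative sketch (the dictionary contents here are made up):

# -*- coding: utf-8 -*-
dict_jiben = {u'股东信息': []}
shareholders = dict_jiben.get(u'股东信息', [])    # same result as the has_key guard
key_persons = dict_jiben.get(u'主要人员信息', [])  # falls back to [] when the key is absent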
Пример #26
0
class ChongqingClawer(Crawler):
    """重庆工商公示信息网页爬虫
    """

    urls = {
        'host': 'http://gsxt.cqgs.gov.cn',
        'get_checkcode': 'http://gsxt.cqgs.gov.cn/sc.action?width=130&height=40',
        'repost_checkcode': 'http://gsxt.cqgs.gov.cn/search_research.action',
        # 获得查询页面
        'post_checkcode': 'http://gsxt.cqgs.gov.cn/search.action',
        # 根据查询页面获得指定公司的数据
        'search_ent': 'http://gsxt.cqgs.gov.cn/search_getEnt.action',
        # 年报
        'year_report': 'http://gsxt.cqgs.gov.cn/search_getYearReport.action',
        # 年报详情
        'year_report_detail': 'http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action',
        # 股权变更
        'year_daily_transinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # 股东出资信息
        'year_daily_invsub': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # 行政处罚
        'year_daily_peninfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # 行政许可
        'year_daily_licinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # 知识产权出质登记
        'year_daily_pleinfo': 'http://gsxt.cqgs.gov.cn/search_getDaily.action',
        # 其他行政许可信息
        'other_qlicinfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # 其他行政处罚
        'other_qpeninfo': 'http://gsxt.cqgs.gov.cn/search_getOtherSectors.action',
        # 股权冻结信息
        'sfxz_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZ.action',
        # 股东变更信息
        'sfxzgdbg_page': 'http://gsxt.cqgs.gov.cn/search_getSFXZGDBG.action',
    }
    # 多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path):
        """
        初始化函数
        Args:
            json_restore_path: json文件的存储路径,所有重庆的企业,应该写入同一个文件,因此在多线程爬取时设置相同的路径。同时,
            需要在写入文件的时候加锁
        Returns:
        """
        # json 数据集
        # POST

        self.json_restore_path = json_restore_path
        if os.path.exists(self.json_restore_path) is False:
            os.makedirs(self.json_restore_path, 0775)
        self.parser = ChongqingParser(self)
        self.credit_ticket = None
        #html数据的存储路径
        self.html_restore_path = os.path.join(self.json_restore_path, "chongqing")
        if os.path.exists(self.html_restore_path) is False:
            os.makedirs(self.html_restore_path, 0775)
        #验证码图片的存储路径
        self.ckcode_image_path = os.path.join(self.html_restore_path, 'ckcode.jpg')
        self.code_cracker = CaptchaRecognition("chongqing")
        self.ent_number = None
        # GET
        self.ckcode = None
        self.json_ent_info = None
        self.json_sfxzgdbg = None
        self.json_sfxz = None
        self.json_other_qlicinfo = None
        self.json_other_qpeninfo = None
        self.json_year_report = None
        self.json_year_report_detail = None
        self.json_year_daily_transinfo = None
        self.json_year_daily_invsub = None
        self.json_year_daily_peninfo = None
        self.json_year_daily_licinfo = None
        self.json_year_daily_pleinfo = None
        self.json_dict = {}
        self.reqst = requests.Session()
        self.reqst.headers.update({
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'})

    def run(self, ent_number=0):
        self.ent_number = str(ent_number)
        page = self.crawl_check_page()
        try:
            self.crawl_page_jsons(page)
            self.parser.parse_jsons()
            self.parser.merge_jsons()
        except Exception as e:
            logging.error(u'Chongqing crawl failed: %s' % type(e))
            return None

        return json.dumps({self.ent_number: self.json_dict})


    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        count = 0
        while count < 30:
            ck_code = self.crack_check_code()
            data = {'key':self.ent_number,'code':ck_code}
            resp = self.reqst.post(ChongqingClawer.urls['post_checkcode'], data=data)
            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                count += 1
                continue
            return resp.content
        return None

    def crack_check_code(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(ChongqingClawer.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None
        time.sleep(random.uniform(0.1, 0.2))
        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)

        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
            # ckcode = self.code_cracker.predict_result(self.ckcode_image_dir_path + 'image' + str(i) + '.jpg')
        except Exception as e:
            logging.warn('exception occurred when crack checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()

        return ckcode[1]

    def crack_checkcode(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(ChongqingClawer.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None

        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        self.ckcode_image_path = settings.json_restore_path + '/chongqing/ckcode.jpg'
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        self.write_file_mutex.release()

        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when crack checkcode')
            ckcode = ('', '')
        finally:
            pass

        return ckcode[1]

    def crawl_page_jsons(self,page):
        """获取所有界面的json数据"""
        data = self.parser.parse_search_results_pages(page)
        if data is not None:
            self.crawl_ent_info_json(data)
            self.crawl_year_report_json(data)

            self.crawl_year_report_detail_json(data)
            # print(self.json_year_report_detail)
            # time.sleep(0.1)
            self.crawl_sfxzgdbg_json(data)
            # print(self.json_sfxzgdbg)
            # time.sleep(0.1)
            self.crawl_sfxz_json(data)
            # print(self.json_sfxz)
            # time.sleep(0.1)
            self.crawl_year_daily_invsub_json(data)
            # print(self.json_year_daily_invsub)
            # time.sleep(0.1)
            self.crawl_year_daily_licinfo_json(data)
            # print(self.json_year_daily_licinfo)
            # time.sleep(0.1)
            self.crawl_year_daily_peninfo_json(data)
            # print(self.json_year_daily_peninfo)
            # time.sleep(0.1)
            self.crawl_year_daily_transinfo_json(data)
            # print(self.json_year_daily_transinfo)
            # time.sleep(0.1)
            self.crawl_year_daily_pleinfo_json(data)
            # print(self.json_year_daily_pleinfo)
            # time.sleep(0.1)
            self.crawl_other_qpeninfo_json(data)
            # print(self.json_other_qpeninfo)
            # time.sleep(0.1)
            self.crawl_other_qlicinfo_json(data)
            # print(self.json_other_qlicinfo)
        else:
            logging.error(u'failed to parse search results page, no data returned')

    def crawl_ent_info_json(self, data, type=1):
        """企业详细信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': type}
        json_data = self.reqst.get(ChongqingClawer.urls['search_ent'], params=params)
        if json_data.status_code == 200:
            json_data = json_data.content
            json_data = str(json_data)
            self.json_ent_info = json_data[6:]  # 去掉数据中的前六个字符保证数据为完整json格式数据
            if self.json_ent_info is None or 'base' not in self.json_ent_info:
                self.crawl_ent_info_json(data, type=10)  # 有些公司需要传过去的参数为 10
                # print(self.json_ent_info)

    def crawl_year_report_json(self, data):
        """年报数据"""
        params = {'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['year_report'], params=params)
        while json_data.status_code != 200:
            json_data = self.reqst.get(ChongqingClawer.urls['year_report'], params=params)
        json_data = json_data.content
        json_data = str(json_data)
        self.json_year_report = json_data[6:]  # 去掉数据中的前六个字符保证数据为完整json格式数据
        # print(self.json_year_report)

    def crawl_year_report_detail_json(self, data):
        """详细年报"""
        # TO DO 需要获得 year_report 中的年份信息
        while self.json_year_report is None:
            self.crawl_year_report_json(data)
        year_report = json.loads(self.json_year_report, encoding='utf-8')
        histories = year_report.get('history')
        for i in range(len(histories)):
            year = histories[i].get('year')
            params = {'id': data.get('id'), 'type': 1, 'year': str(year)}
            json_data = self.reqst.get(ChongqingClawer.urls['year_report_detail'], params=params)
            if json_data.status_code == 200:
                # 此页面响应结果直接就是 json_data
                self.json_year_report_detail = str(json_data.content)
                # print(self.json_year_report_detail)

    def crawl_year_daily_transinfo_json(self, data):
        """股权变更"""
        params = {'id': data.get('id'), 'jtype': 'transinfo'}
        json_data = self.reqst.get(ChongqingClawer.urls['year_daily_transinfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_transinfo = json_data[6:]
            # print(self.json_year_daily_transinfo)

    def crawl_year_daily_pleinfo_json(self, data):
        """行政许可"""
        params = {'id': data.get('id'), 'jtype': 'pleinfo'}
        json_data = self.reqst.get(ChongqingClawer.urls['year_daily_pleinfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_pleinfo = json_data[6:]
            # print(self.json_year_daily_pleinfo)

    def crawl_year_daily_invsub_json(self, data):
        """股东出资信息"""
        params = {'id': data.get('id'), 'jtype': 'invsub'}
        json_data = self.reqst.get(ChongqingClawer.urls['year_daily_invsub'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_invsub = json_data[6:]
            # print(self.json_year_daily_invsub)

    def crawl_year_daily_licinfo_json(self, data):
        """行政许可"""
        params = {'id': data.get('id'), 'jtype': 'licinfo'}
        json_data = self.reqst.get(ChongqingClawer.urls['year_daily_licinfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_licinfo = json_data[6:]
            # print(self.json_year_daily_licinfo)

    def crawl_year_daily_peninfo_json(self, data):
        """行政处罚"""
        params = {'id': data.get('id'), 'jtype': 'peninfo'}
        json_data = self.reqst.get(ChongqingClawer.urls['year_daily_peninfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_year_daily_peninfo = json_data[6:]
            # print(self.json_year_daily_peninfo)

    def crawl_sfxzgdbg_json(self, data):
        """股东变更信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['sfxzgdbg_page'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxzgdbg = json_data[6:]
            # print(self.json_sfxzgdbg)

    def crawl_sfxz_json(self, data):
        """股权冻结信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['sfxz_page'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_sfxz = json_data[6:]
            # print(self.json_sfxz)

    def crawl_other_qlicinfo_json(self, data):
        """股东出资信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'qtype': 'Qlicinfo', 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['other_qlicinfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qlicinfo = json_data[6:]
            # print(self.json_other_qlicinfo)

    def crawl_other_qpeninfo_json(self, data):
        """股东出资信息"""
        params = {'entId': data.get('entId'), 'id': data.get('id'), 'qtype': 'Qpeninfo', 'type': 1}
        json_data = self.reqst.get(ChongqingClawer.urls['other_qpeninfo'], params=params)
        if json_data.status_code == 200:
            # 此页面响应结果直接就是 json_data
            json_data = json_data.content
            json_data = str(json_data)
            self.json_other_qpeninfo = json_data[6:]
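
Several of the crawl_*_json methods above repeat the same step: the responses apparently carry six junk characters in front of the JSON payload (hence the json_data[6:] slices and the accompanying comments), and the sliced text is parsed later by the parser. A minimal sketch of that step, assuming the prefix length really is fixed at six characters (the helper name is made up):

import json

def strip_prefix_and_parse(raw_text, prefix_len=6):
    # drop the fixed-length junk prefix the site prepends,
    # then parse the remaining text as JSON
    return json.loads(raw_text[prefix_len:])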
Пример #27
0
class BeijingCrawler(Crawler):
    """北京工商爬虫
    """
    code_cracker = CaptchaRecognition('beijing')
    #多线程爬取时往最后的json文件中写时的加锁保护
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'http://qyxy.baic.gov.cn',
        'official_site':
        'http://qyxy.baic.gov.cn/beijing',
        'get_checkcode':
        'http://qyxy.baic.gov.cn',
        'post_checkcode':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!checkCode.dhtml',
        'open_info_entry':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!getBjQyList.dhtml',
        'ind_comm_pub_reg_basic':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!openEntInfo.dhtml?',
        'ind_comm_pub_reg_shareholder':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!tzrFrame.dhtml?',
        'ind_comm_pub_reg_modify':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!biangengFrame.dhtml?',
        'ind_comm_pub_arch_key_persons':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!zyryFrame.dhtml?',
        'ind_comm_pub_arch_branch':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!fzjgFrame.dhtml?',
        'ind_comm_pub_arch_liquidation':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!qsxxFrame.dhtml?',
        'ind_comm_pub_movable_property_reg':
        'http://qyxy.baic.gov.cn/gjjbjTab/gjjTabQueryCreditAction!dcdyFrame.dhtml?',
        'ind_comm_pub_equity_ownership_reg':
        'http://qyxy.baic.gov.cn/gdczdj/gdczdjAction!gdczdjFrame.dhtml?',
        'ind_comm_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list.dhtml?',
        'ind_comm_pub_business_exception':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_jyycxx.dhtml?',
        'ind_comm_pub_serious_violate_law':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_yzwfxx.dhtml?',
        'ind_comm_pub_spot_check':
        'http://qyxy.baic.gov.cn/gsgs/gsxzcfAction!list_ccjcxx.dhtml?',
        'ent_pub_ent_annual_report':
        'http://qyxy.baic.gov.cn/qynb/entinfoAction!qyxx.dhtml?',
        'ent_pub_shareholder_capital_contribution':
        'http://qyxy.baic.gov.cn/gdcz/gdczAction!list_index.dhtml?',
        'ent_pub_equity_change':
        'http://qyxy.baic.gov.cn/gdgq/gdgqAction!gdgqzrxxFrame.dhtml?',
        'ent_pub_administration_license':
        'http://qyxy.baic.gov.cn/xzxk/xzxkAction!list_index.dhtml?',
        'ent_pub_knowledge_property':
        'http://qyxy.baic.gov.cn/zscqczdj/zscqczdjAction!list_index.dhtml?',
        'ent_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/gdgq/gdgqAction!qyxzcfFrame.dhtml?',
        'other_dept_pub_administration_license':
        'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzxk.dhtml?',
        'other_dept_pub_administration_sanction':
        'http://qyxy.baic.gov.cn/qtbm/qtbmAction!list_xzcf.dhtml?',
        'shareholder_detail':
        'http://qyxy.baic.gov.cn/gjjbj/gjjQueryCreditAction!touzirenInfo.dhtml?'
    }

    def __init__(self, json_restore_path=None):
        self.json_restore_path = json_restore_path
        #html数据的存储路径
        self.html_restore_path = self.json_restore_path + '/beijing/'

        #验证码图片的存储路径
        self.ckcode_image_path = self.json_restore_path + '/beijing/ckcode.jpg'
        self.parser = BeijingParser(self)
        self.credit_ticket = None
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        self.timeout = 20

    def run(self, ent_number):
        """爬取的主函数
        """
        self.ent_id = ''
        return Crawler.run(self, ent_number)

    def crawl_page_by_url(self, url):
        resp = None
        try:
            resp = self.reqst.get(url,
                                  timeout=self.timeout,
                                  proxies=self.proxies)
        except requests.exceptions.ConnectionError:
            self.proxies = Proxies().get_proxies()
            logging.error("get method self.proxies changed proxies = %s\n" %
                          (self.proxies))
            return self.crawl_page_by_url(url)
        except requests.exceptions.Timeout:
            self.timeout += 5
            logging.error(
                "get method self.timeout plus timeout = %d, proxies= %s\n" %
                (self.timeout, self.proxies))
            return self.crawl_page_by_url(url)
        except Exception as e:
            logging.error("Other exception occured!type e = %s, proxies=%s\n" %
                          (type(e), self.proxies))
        return resp

    def crawl_page_by_url_post(self, url, data):
        resp = None
        try:
            resp = self.reqst.post(url,
                                   data,
                                   timeout=self.timeout,
                                   proxies=self.proxies)
        except requests.exceptions.ConnectionError:
            self.proxies = Proxies().get_proxies()
            logging.error("post method self.proxies changed. proxies =  %s\n" %
                          (self.proxies))
            return self.crawl_page_by_url_post(url, data)
        except requests.exceptions.Timeout:
            self.timeout += 5
            logging.error(
                "post method self.timeout plus, timeout= %d, proxies= %s\n" %
                (self.timeout, self.proxies))
            return self.crawl_page_by_url_post(url, data)
        except Exception as e:
            logging.error(
                "Other exception occured!type e = %s, proxies=%s \n" %
                (type(e), self.proxies))
        return resp

    def crawl_check_page(self):
        """爬取验证码页面,包括获取验证码url,下载验证码图片,破解验证码并提交
        """
        resp = self.crawl_page_by_url(self.urls['official_site'])
        if resp.status_code != 200:
            logging.error('failed to get official site page!')
            return False
        count = 0
        while count < 15:
            count += 1
            ckcode = self.crack_checkcode()
            if not ckcode[1]:
                logging.error(
                    'failed to get crackcode result, fail count = %d' %
                    (count))
                continue

            post_data = {
                'currentTimeMillis': self.time_stamp,
                'credit_ticket': self.credit_ticket,
                'checkcode': ckcode[1],
                'keyword': self.ent_number
            }
            next_url = self.urls['post_checkcode']
            resp = self.crawl_page_by_url_post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to post checkcode to url %s, fail count = %d'
                    % (next_url, count))
                continue

            logging.error('crack code = %s, %s, response =  %s' %
                          (ckcode[0], ckcode[1], resp.content))

            if resp.content == 'fail':
                logging.error(
                    'crack checkcode failed, response content = failed, total fail count = %d'
                    % count)
                time.sleep(random.uniform(0.1, 2))
                continue

            next_url = self.urls['open_info_entry']
            resp = self.crawl_page_by_url_post(next_url, data=post_data)
            if resp.status_code != 200:
                logging.error(
                    'failed to open info entry by url %s, fail count = %d' %
                    (next_url, count))
                continue

            crack_result = self.parse_post_check_page(resp.content)
            if crack_result:
                return True
            else:
                logging.error('crack checkcode failed, total fail count = %d' %
                              count)
            time.sleep(random.uniform(3, 5))
        return False

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息页面
        """
        for item in (
                'ind_comm_pub_reg_basic',  # 登记信息-基本信息
                'ind_comm_pub_reg_shareholder',  # 股东信息
                'ind_comm_pub_reg_modify',
                'ind_comm_pub_arch_key_persons',  # 备案信息-主要人员信息
                'ind_comm_pub_arch_branch',  # 备案信息-分支机构信息
                'ind_comm_pub_arch_liquidation',  # 备案信息-清算信息
                'ind_comm_pub_movable_property_reg',  # 动产抵押登记信息
                'ind_comm_pub_equity_ownership_reg',  # 股权出质登记信息
                'ind_comm_pub_administration_sanction',  # 行政处罚信息
                'ind_comm_pub_business_exception',  # 经营异常信息
                'ind_comm_pub_serious_violate_law',  # 严重违法信息
                'ind_comm_pub_spot_check'  # 抽查检查信息
        ):
            self.get_page_json_data(item, 1)
        time.sleep(random.uniform(0, 3))

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息页面
        """
        for item in (
                'ent_pub_ent_annual_report',
                'ent_pub_shareholder_capital_contribution',  #企业投资人出资比例
                'ent_pub_equity_change',  #股权变更信息
                'ent_pub_administration_license',  #行政许可信息
                'ent_pub_knowledge_property',  #知识产权出质登记
                'ent_pub_administration_sanction'  #行政处罚信息
        ):
            self.get_page_json_data(item, 2)
        time.sleep(random.uniform(0, 3))

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息页面
        """
        for item in (
                'other_dept_pub_administration_license',  #行政许可信息
                'other_dept_pub_administration_sanction'  #行政处罚信息
        ):
            self.get_page_json_data(item, 3)

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息页面
        """
        pass

    def get_page_json_data(self, page_name, page_type):
        """获得页面的解析后的json格式数据
        Args:
            page_name: 页面名称
            page_type: 页面类型, 1 工商公示页面, 2 企业公示页面, 3 其他部门公示页面
        """
        page = self.get_page(page_name, page_type)
        pages = self.get_all_pages_of_a_section(page, page_name)
        if len(pages) == 1:
            self.json_dict[page_name] = {}
            json_data = self.parser.parse_page(page, page_name)
            if json_data:
                self.json_dict[page_name] = json_data
        else:
            self.json_dict[page_name] = []
            for p in pages:
                json_data = self.parser.parse_page(p, page_name)
                if json_data:
                    self.json_dict[page_name] += json_data

    def get_checkcode_url(self):
        count = 0
        while count < 5:
            count += 1
            resp = self.crawl_page_by_url(self.urls['official_site'])
            time.sleep(random.uniform(1, 5))
            if resp.status_code != 200:
                logging.error('failed to get crackcode url')
                continue
            response = resp.content
            soup = BeautifulSoup(response, 'html.parser')
            ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src')
            ckimg_src = str(ckimg_src)
            re_checkcode_captcha = re.compile(r'/([\s\S]*)\?currentTimeMillis')
            # re_currenttime_millis=re.compile(r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)')
            checkcode_type = re_checkcode_captcha.findall(ckimg_src)[0]

            if checkcode_type == 'CheckCodeCaptcha':
                #parse the pre check page, get useful information
                self.parse_pre_check_page(response)
                checkcode_url = self.urls['get_checkcode'] + ckimg_src
                return checkcode_url

            # elif checkcode_type == 'CheckCodeYunSuan':
            logging.error(
                'can not get CheckCodeCaptcha type of checkcode img, count times = %d \n'
                % (count))
        return None

    def parse_post_check_page(self, page):
        """解析提交验证码之后的页面,获取必要的信息
        """
        if page == 'fail':
            logging.error('checkcode error!')
            # if senting_open:
            #     senting_client.captureMessage('checkcode error!')
            return False

        soup = BeautifulSoup(page, 'html.parser')
        r = soup.find_all('a', {
            'href': "#",
            'onclick': re.compile(r'openEntInfo')
        })

        ent = ''
        if r:
            ent = r[0]['onclick']
        else:
            logging.error('fail to find openEntInfo')
            return False

        m = re.search(r'\'([\w]*)\'[ ,]+\'([\w]*)\'[ ,]+\'([\w]*)\'', ent)
        if m:
            self.ent_id = m.group(1)
            self.credit_ticket = m.group(3)

        r = soup.find_all(
            'input', {
                'type': "hidden",
                'name': "currentTimeMillis",
                'id': "currentTimeMillis"
            })
        if r:
            self.time_stamp = r[0]['value']
        else:
            logging.error('fail to get time stamp')
        return True

    def parse_pre_check_page(self, page):
        """解析提交验证码之前的页面
        """
        soup = BeautifulSoup(page, 'html.parser')
        ckimg_src = soup.find_all('img', id='MzImgExpPwd')[0].get('src')
        ckimg_src = str(ckimg_src)
        re_currenttime_millis = re.compile(
            r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)')
        self.credit_ticket = soup.find_all('input',
                                           id='credit_ticket')[0].get('value')
        self.time_stamp = re_currenttime_millis.findall(ckimg_src)[0]
        # self.time_stamp = self.generate_time_stamp()
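
Both check-page parsers above pull their fields out of the MzImgExpPwd image src with regular expressions. A tiny standalone check of the two patterns; the sample src value is made up:

import re

src = '/CheckCodeCaptcha?currentTimeMillis=1450000000000'
print(re.findall(r'/([\s\S]*)\?currentTimeMillis', src))                    # ['CheckCodeCaptcha']
print(re.findall(r'/CheckCodeCaptcha\?currentTimeMillis=([\s\S]*)', src))   # ['1450000000000']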

    """
    def crawl_page_by_url(self, url):
        resp = self.crawl_page_by_url(url)
        if resp.status_code != 200:
            logging.error('failed to crawl page by url' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        # if saveingtml:
        #     CrawlerUtils.save_page_to_file(self.html_restore_path + 'detail.html', page)
        return page
    """

    def get_all_pages_of_a_section(self, page, type, url=None):
        """获取页面上含有 上一页、下一页跳转链接的区域的所有的数据
        Args:
            page: 已经爬取的页面
            type: 页面类型
            url: 该页面的url,默认为None,因为一般可以通过 type 从 BeijingCrawler.urls 中找到
        Returns:
            pages: 所有页面的列表
        """
        if not page:
            return page
        soup = BeautifulSoup(page, 'html.parser')
        page_count = 0
        page_size = 0
        pages_data = []
        pages_data.append(page)
        r1 = soup.find_all('input', {'type': 'hidden', 'id': 'pagescount'})
        r2 = soup.find_all('input', {
            'type': 'hidden',
            'id': 'pageSize',
            'name': 'pageSize'
        })
        if r1 and r2:
            page_count = int(r1[0].get('value'))
            page_size = int(r2[0].get('value'))
        else:
            # only one page
            return pages_data

        if page_count <= 1:
            return pages_data

        if not url:
            next_url = self.urls[type].rstrip('?')
        else:
            next_url = url

        for p in range(1, page_count):
            post_data = {
                'pageNos': str(p + 1),
                'clear': '',
                'pageNo': str(p),
                'pageSize': str(page_size),
                'ent_id': self.ent_id
            }
            try:
                resp = self.crawl_page_by_url_post(next_url, data=post_data)
                if resp.status_code != 200:
                    logging.error('failed to get all page of a section')
                    return pages_data
                page = resp.content
                time.sleep(random.uniform(0.2, 1))
            except Exception as e:
                logging.error(
                    'open new tab page failed, url = %s, page_num = %d' %
                    (next_url, p + 1))
                page = None
                raise e
            finally:
                if page:
                    pages_data.append(page)
        return pages_data
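
The pagination above is driven by two hidden inputs (pagescount and pageSize) found on the first page; every further page is then requested with a POST. A small standalone check of that extraction step, using made-up HTML:

from bs4 import BeautifulSoup

sample = ('<input type="hidden" id="pagescount" value="3">'
          '<input type="hidden" id="pageSize" name="pageSize" value="10">')
soup = BeautifulSoup(sample, 'html.parser')
page_count = int(soup.find('input', {'type': 'hidden', 'id': 'pagescount'}).get('value'))
page_size = int(soup.find('input', {'type': 'hidden', 'id': 'pageSize'}).get('value'))
print((page_count, page_size))   # (3, 10)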

    def get_page(self, type, tab):
        """获取页面,为了简便,在url后面添加了所有可能用到的数据,即使有多余的参数也不影响
        Args:
            tab: 访问页面时在url后面所用到的数据。1 工商公示信息, 2 企业公示信息, 3 其他部门公示信息
        """
        url = CrawlerUtils.add_params_to_url(
            self.urls[type], {
                'entId': self.ent_id,
                'ent_id': self.ent_id,
                'entid': self.ent_id,
                'credit_ticket': self.credit_ticket,
                'entNo': self.ent_number,
                'entName': '',
                'timeStamp': self.generate_time_stamp(),
                'clear': 'true',
                'str': tab
            })
        logging.error('get %s, url:\n%s\n' % (type, url))
        resp = self.crawl_page_by_url(url)
        if resp.status_code != 200:
            logging.error('get page failed by url %s' % url)
            return
        page = resp.content
        time.sleep(random.uniform(0.2, 1))
        return page
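
CrawlerUtils.add_params_to_url is not shown in this listing; the call above assumes it simply appends the parameters as a query string. A rough standard-library equivalent, given only to illustrate the assumed behaviour (the URL is a placeholder):

try:
    from urllib import urlencode          # Python 2
except ImportError:
    from urllib.parse import urlencode    # Python 3

def add_params_to_url(url, params):
    # naive version: append the encoded parameters after '?' or '&'
    sep = '' if url.endswith('?') else ('&' if '?' in url else '?')
    return url + sep + urlencode(params)

print(add_params_to_url('http://example.com/notice/view?', {'entId': '123', 'str': 1}))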

    def crack_checkcode(self):
        """破解验证码"""
        ckcode = ('', '')
        checkcode_url = self.get_checkcode_url()
        if checkcode_url is None:
            return ckcode
        resp = self.crawl_page_by_url(checkcode_url)
        if resp.status_code != 200:
            logging.error('failed to get checkcode img')
            return ckcode
        page = resp.content
        time.sleep(random.uniform(1, 2))
        self.write_file_mutex.acquire()
        try:
            with open(self.ckcode_image_path, 'wb') as f:
                f.write(page)
            if not self.code_cracker:
                logging.error('invalid code cracker\n')
                return ckcode
            try:
                ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
            except Exception:
                logging.error('exception occurred when cracking checkcode')
                ckcode = ('', '')
        finally:
            # release the lock on every path, including the early return above
            self.write_file_mutex.release()
        return ckcode

    def generate_time_stamp(self):
        """生成时间戳
        """
        return int(time.time())
Пример #28
0
class SichuanCrawler(object):
    """ 四川爬虫, 继承object, 验证码与陕西一致。"""
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.pripid = None
        self.cur_time = str(int(time.time() * 1000))
        self.reqst = requests.Session()
        self.reqst.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.reqst.mount('http://', adapter)
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg'
        # storage path for html data
        self.html_restore_path = self.json_restore_path + '/sichuan/'
        self.code_cracker = CaptchaRecognition('sichuan')
        self.result_json_dict = {}
        self.json_list = []

        proxies = get_proxy('shaanxi')
        if proxies:
            print proxies
            self.reqst.proxies = proxies
        self.timeout = (30, 20)
        self.ents = {}

        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
            'searchList':
            'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
            'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm'
        }

        self.one_dict = {u'基本信息': 'ind_comm_pub_reg_basic',
                         u'股东信息': 'ind_comm_pub_reg_shareholder',
                         u'发起人信息': 'ind_comm_pub_reg_shareholder',
                         u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
                         u'变更信息': 'ind_comm_pub_reg_modify',
                         u'主要人员信息': 'ind_comm_pub_arch_key_persons',
                         u'分支机构信息': 'ind_comm_pub_arch_branch',
                         u'清算信息': 'ind_comm_pub_arch_liquidation',
                         u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
                         u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
                         u'行政处罚信息': 'ind_comm_pub_administration_sanction',
                         u'经营异常信息': 'ind_comm_pub_business_exception',
                         u'严重违法信息': 'ind_comm_pub_serious_violate_law',
                         u'抽查检查信息': 'ind_comm_pub_spot_check'}

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {u'行政许可信息': 'other_dept_pub_administration_license',
                           u'行政处罚信息': 'other_dept_pub_administration_sanction'}
        self.four_dict = {u'股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
                          u'股东变更信息': 'judical_assist_pub_shareholder_modify',
                          u'司法股东变更登记信息':
                          'judical_assist_pub_shareholder_modify'}

    def get_check_num(self):
        # print self.mydict['search']+self.cur_time
        resp = self.reqst.get(self.mydict['search'] + self.cur_time, timeout=self.timeout)
        if resp.status_code != 200:
            # print resp.status_code
            return None
        # print BeautifulSoup(resp.content).prettify
        resp = self.reqst.get(self.mydict['validateCode'] + '&dt=%s&random=%s' % (self.cur_time, self.cur_time),
                              timeout=self.timeout)
        if resp.status_code != 200:
            # print 'no validateCode'
            return None
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)

        ck_code = self.code_cracker.predict_result(self.ckcode_image_path)
        if ck_code is None:
            return None
        else:
            return ck_code[1]

    def analyze_showInfo(self, page):
        soup = BeautifulSoup(page, 'html.parser')
        divs = soup.find_all('div',
                             attrs={
                                 "style":
                                 "width:950px; padding:25px 20px 0px; overflow: hidden;float: left;"
                             })
        if divs:
            try:
                Ent = {}
                count = 0
                for div in divs:
                    count += 1
                    link = div.find('li')
                    url = ""
                    if link and link.find('a') and link.find('a').has_attr('onclick'):
                        url = link.find('a')['onclick']
                    ent = ""
                    profile = link.find_next_sibling()
                    if profile and profile.span:
                        ent = profile.span.get_text().strip()
                    name = link.find('a').get_text().strip()
                    if self.ent_num == name or self.ent_num == ent:
                        Ent.clear()
                        Ent[ent] = url
                        break
                    if count == 3:
                        break
                if not Ent:
                    return False
                self.ents = Ent
                return True

            except:
                logging.error(u"%s" % (traceback.format_exc(10)))
        return False

    def get_id_num(self, findCode):
        count = 0
        while count < 20:
            yzm = self.get_check_num()
            print yzm
            count += 1
            if yzm is None:
                continue

            data = {'currentPageNo': '1', 'yzm': yzm, 'cxym': "cxlist", 'maent.entname': findCode}
            resp = self.reqst.post(self.mydict['searchList'] + self.cur_time, data=data, timeout=self.timeout)
            if self.analyze_showInfo(resp.content):
                return True
            print "crawl %s times:%d" % (findCode, count)
            time.sleep(random.uniform(1, 4))
        return False

    def help_dcdy_get_dict(self, method, maent_pripid, maent_xh, random):
        data = {'method': method, 'maent.pripid': maent_pripid, 'maent.xh': maent_xh, 'random': random}
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
        needdict = {}
        for table in BeautifulSoup(resp.content, 'html.parser').find_all('table'):
            dcdy_head, dcdy_allths, dcdy_alltds = self.get_head_ths_tds(table)
            needdict[dcdy_head] = self.get_one_to_one_dict(dcdy_allths, dcdy_alltds)
        return needdict

    def help_enter_get_dict(self, method, maent_pripid, year, random):
        data = {'method': method, 'maent.pripid': maent_pripid, 'maent.nd': year, 'random': random}
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
        #print resp.status_code
        #print BeautifulSoup(resp.content).prettify
        needdict = {}
        for i, table in enumerate(BeautifulSoup(resp.content, 'html.parser').find_all('table')):
            enter_head, enter_allths, enter_alltds = self.get_head_ths_tds(table)
            if i == 0:
                try:
                    enter_head = enter_allths[0]
                    enter_allths = enter_allths[1:]
                except:
                    enter_head = u'企业基本信息'
                    enter_allths = [u'注册号/统一社会信用代码', u'企业名称', u'企业联系电话', u'邮政编码', \
                        u'企业通信地址', u'企业电子邮箱', u'有限责任公司本年度是否发生股东股权转让', u'企业经营状态', \
                        u'是否有网站或网店', u'是否有投资信息或购买其他公司股权', u'从业人数']
            if enter_head == u'股东及出资信息':
                enter_allths = [u'股东', u'认缴出资额(万元)', u'认缴出资时间', u'认缴出资方式', u'实缴出资额(万元)', u'出资时间', u'出资方式']
            #self.test_print_all_ths_tds(enter_head, enter_allths, enter_alltds)
            needdict[enter_head] = self.get_one_to_one_dict(enter_allths, enter_alltds)
            if enter_head == u'企业基本信息' or enter_head == u'企业资产状况信息':
                needdict[enter_head] = self.get_one_to_one_dict(enter_allths, enter_alltds)[0]
        return needdict

    def help_detail_get_dict(self, method, maent_xh, maent_pripid, random):
        data = {'method': method, 'maent.xh': maent_xh, 'maent.pripid': maent_pripid, 'random': random}
        resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
        # print resp.status_code
        # print BeautifulSoup(resp.content).prettify
        for table in BeautifulSoup(resp.content, 'html.parser').find_all('table'):
            if table.find_all('th') and table.find_all('th')[0].get_text().strip() == u'股东及出资信息':
                #print table
                detail_head, detail_allths, detail_alltds = self.get_head_ths_tds(table)
                # self.test_print_all_ths_tds(detail_head, detail_allths, detail_alltds)
                tempdict = {}
                for key, value in zip(detail_allths[:3], detail_alltds[:3]):
                    tempdict[key] = value
                onelist_dict = {}
                for key, value in zip(detail_allths[3:], detail_alltds[3:]):
                    onelist_dict[key] = value.split('\n')[-1] if value else None
                tempdict['list'] = [onelist_dict]
                return {u'股东及出资信息': [tempdict]}

    def get_head_ths_tds(self, table):
        # print table
        try:
            head = table.find_all('th')[0].get_text().strip().split('\n')[0].strip()
        except:
            head = None
            pass
        allths = [th.get_text().strip() for th in table.find_all('th')[1:] if th.get_text()]
        for i, th in enumerate(allths):
            if th[:2] == '<<' or th[-2:] == '>>':
                allths = allths[:i]
                break
        alltds = [td.get_text().strip() if td.get_text() else None for td in table.find_all('td')]
        if head == u'变更信息' or head == u'修改记录' or head == u'行政处罚信息':
            alltds = []
            for td in table.find_all('td'):
                if td.get_text():
                    if len(td.find_all('span')) > 1:
                        alltds.append(td.find_all('span')[1].get_text().strip().split('\n')[0].strip())
                    else:
                        alltds.append(td.get_text().strip())
                else:
                    alltds.append(None)

        if head == u'主要人员信息':
            allths = allths[:int(len(allths) / 2)]
        if head == u'股东及出资信息':
            allths = allths[:3] + allths[5:]
        if head == u'股东信息':
            alltds = []
            for td in table.find_all('td'):
                if td.find('a'):
                    onclick = td.a['onclick']
                    m = re.search(r"showRyxx\(\'(\w+?)\',\'(\w+?)\'\)", onclick)
                    if m:
                        maent_xh = m.group(1)
                        maent_pripid = m.group(2)
                        #print 'maent_xh',':', maent_xh,'maent_pripid',':',maent_pripid
                        #print self.help_detail_get_dict('tzrCzxxDetial',maent_xh, maent_pripid, self.cur_time)
                        alltds.append(self.help_detail_get_dict('tzrCzxxDetial', maent_xh, maent_pripid, self.cur_time))
                elif td.get_text():
                    alltds.append(td.get_text().strip())
                else:
                    alltds.append(None)
        if head == u'企业年报':
            alltds = []
            for td in table.find_all('td'):
                if td.find('a'):
                    onclick = td.a['onclick']
                    m = re.search(r'doNdbg\(\'(\w+)\'\)', onclick)
                    if m:
                        alltds.append(td.get_text().strip())
                        alltds.append(self.help_enter_get_dict('ndbgDetail', self.pripid, m.group(1), self.cur_time))
                elif td.get_text():
                    alltds.append(td.get_text().strip())
                else:
                    alltds.append(None)
            allths.insert(2, u'详情')
        if head == u'动产抵押登记信息':
            alltds = []
            for td in table.find_all('td'):
                if td.find('a'):
                    onclick = td.a['onclick']
                    m = re.search(r'doDcdyDetail\(\'(\w+?)\'\)', onclick)
                    if m:
                        alltds.append(self.help_dcdy_get_dict('dcdyDetail', self.pripid, m.group(1), self.cur_time))
                elif td.get_text():
                    alltds.append(td.get_text().strip())
                else:
                    alltds.append(None)
        # if len(alltds) == 0:
        # 	alltds = [None for th in allths]
        return head, allths, alltds
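
get_head_ths_tds treats the first th of a table as the section head and special-cases several heads; the default path just collects the remaining th/td texts. A minimal standalone check of that default path on a made-up table:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

html = (u'<table><tr><th>基本信息</th></tr>'
        u'<tr><th>名称</th><th>状态</th></tr>'
        u'<tr><td>某公司</td><td>存续</td></tr></table>')
table = BeautifulSoup(html, 'html.parser').find('table')
head = table.find_all('th')[0].get_text().strip()
allths = [th.get_text().strip() for th in table.find_all('th')[1:]]
alltds = [td.get_text().strip() for td in table.find_all('td')]
print(head)     # 基本信息
print(allths)   # [u'名称', u'状态']
print(alltds)   # [u'某公司', u'存续']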

    def get_one_to_one_dict(self, allths, alltds):
        if len(allths) == len(alltds):
            one_to_one_dict = {}
            for key, value in zip(allths, alltds):
                one_to_one_dict[key] = value
            return [one_to_one_dict]
        else:
            templist = []
            x = 0
            y = x + len(allths)
            while y <= len(alltds):
                tempdict = {}
                for keys, values in zip(allths, alltds[x:y]):
                    tempdict[keys] = values
                x = y
                y = x + len(allths)
                templist.append(tempdict)
            return templist
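
get_one_to_one_dict zips headers with cell texts; when a table holds several rows it chunks the cells by the header count and yields one dict per row. A worked standalone example of the same chunking rule (sample data only):

def one_to_one(allths, alltds):
    rows, x = [], 0
    while x + len(allths) <= len(alltds):
        rows.append(dict(zip(allths, alltds[x:x + len(allths)])))
        x += len(allths)
    return rows

print(one_to_one(['investor', 'amount'], ['A', '100', 'B', '200']))
# -> [{'investor': 'A', 'amount': '100'}, {'investor': 'B', 'amount': '200'}]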

    def test_print_table(self, tables):
        for table in tables:
            print table

    def test_print_all_ths_tds(self, head, allths, alltds):
        print '--------------', head, '--------------'
        for th in allths:
            print th
        for td in alltds:
            print td

    def test_print_all_dict(self, mydict):
        for key, value in mydict.items():
            print key, ':', value

    def get_table_by_head(self, tables, head_item):
        for table in tables:
            if table.find_all('th'):
                temp_head = table.find_all('th')[0].get_text().strip().split('\n')[0].strip()
                #print 'temp_head', temp_head, 'head_item', head_item
                if temp_head == head_item:
                    return table
        # else:
        # 	print 'no'*10
        pass

    def get_json_one(self, mydict, tables, *param):
        #self.test_print_table(tables)
        for head_item in param:
            #print '----'*10, head_item
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                #self.test_print_all_ths_tds(head, allths, alltds)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)
        pass

    def get_json_two(self, mydict, tables, *param):
        #self.test_print_table(tables)
        for head_item in param:
            #print '----'*10, head_item
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                #self.test_print_all_ths_tds(head, allths, alltds)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)

        pass

    def get_json_three(self, mydict, tables, *param):
        #self.test_print_table(tables)
        for head_item in param:
            #print '----'*10, head_item
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                #self.test_print_all_ths_tds(head, allths, alltds)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)

        pass

    def get_json_four(self, mydict, tables, *param):
        #self.test_print_table(tables)
        for head_item in param:
            #print '----'*10, head_item
            table = self.get_table_by_head(tables, head_item)
            if table:
                head, allths, alltds = self.get_head_ths_tds(table)
                #self.test_print_all_ths_tds(head, allths, alltds)
                self.result_json_dict[mydict[head]] = self.get_one_to_one_dict(allths, alltds)
        pass

    def main_page(self):
        gevent.monkey.patch_socket()
        sub_json_list = []
        for ent, url in self.ents.items():
            m = re.search(r"openView\(\'(\w+?)\'", url)
            if m:
                self.pripid = m.group(1)
            self.result_json_dict = {}
            print self.pripid

            def qyInfo():
                data = {'method': 'qyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk1', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'), u'基本信息',
                                  u'股东信息', u'变更信息')

            def baInfo():
                data = {'method': 'baInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk2', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'主要人员信息', u'分支机构信息', u'清算信息')

            def dcdyInfo():
                data = {'method': 'dcdyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'动产抵押登记信息')

            def gqczxxInfo():
                data = {'method': 'gqczxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'股权出质登记信息')

            def jyycInfo():
                data = {'method': 'jyycInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk6', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'经营异常信息')

            def yzwfInfo():
                data = {'method': 'yzwfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk14', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'严重违法信息')

            def cfInfo():
                data = {'method': 'cfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk3', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'行政处罚信息')

            def ccjcInfo():
                data = {'method': 'ccjcInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk7', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.one_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'抽查检查信息')

            def qygsInfo():
                data = {'method': 'qygsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk8', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'), u'企业年报')

            def qygsForTzrxxInfo():
                data = {'method': 'qygsForTzrxxInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk12',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'股东及出资信息', u'变更信息')

            def cqygsForTzrbgxxInfo():
                data = {'method': 'cqygsForTzrbgxxInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk15',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'股权变更信息')

            def qygsForXzxkInfo():
                data = {'method': 'qygsForXzxkInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk10',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'行政许可信息')

            def qygsForZzcqInfo():
                data = {'method': 'qygsForZzcqInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk11',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'知识产权出质登记信息')

            def qygsForXzcfInfo():
                data = {'method': 'qygsForXzcfInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk13',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.two_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'行政处罚信息')

            def qtgsInfo():
                data = {'method': 'qtgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk9', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.three_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'行政许可信息')

            def qtgsForCfInfo():
                data = {'method': 'qtgsForCfInfo',
                        'maent.pripid': self.pripid,
                        'czmk': 'czmk16',
                        'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.three_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'行政处罚信息')

            def sfgsInfo():
                data = {'method': 'sfgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk17', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.four_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'司法股权冻结信息')

            def sfgsbgInfo():
                data = {'method': 'sfgsbgInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk18', 'random': self.cur_time}
                resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=self.timeout)
                self.get_json_one(self.four_dict, BeautifulSoup(resp.content, 'html.parser').find_all('table'),
                                  u'司法股东变更登记信息')

            threads = []
            threads.append(gevent.spawn(qyInfo))
            threads.append(gevent.spawn(baInfo))
            threads.append(gevent.spawn(dcdyInfo))
            threads.append(gevent.spawn(gqczxxInfo))
            threads.append(gevent.spawn(jyycInfo))
            threads.append(gevent.spawn(yzwfInfo))
            threads.append(gevent.spawn(cfInfo))
            threads.append(gevent.spawn(ccjcInfo))
            threads.append(gevent.spawn(qygsInfo))
            threads.append(gevent.spawn(qygsForTzrxxInfo))
            threads.append(gevent.spawn(cqygsForTzrbgxxInfo))
            threads.append(gevent.spawn(qygsForXzxkInfo))
            threads.append(gevent.spawn(qygsForZzcqInfo))
            threads.append(gevent.spawn(qygsForXzcfInfo))
            threads.append(gevent.spawn(qtgsInfo))
            threads.append(gevent.spawn(qtgsForCfInfo))
            threads.append(gevent.spawn(sfgsInfo))
            threads.append(gevent.spawn(sfgsbgInfo))

            gevent.joinall(threads)
            self.result_json_dict['ind_comm_pub_reg_basic'] = self.result_json_dict['ind_comm_pub_reg_basic'][0]
            if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys() and len(self.result_json_dict[
                    'ind_comm_pub_arch_liquidation']) > 0:
                self.result_json_dict['ind_comm_pub_arch_liquidation'] = self.result_json_dict[
                    'ind_comm_pub_arch_liquidation'][0]
            sub_json_list.append({ent: self.result_json_dict})
        return sub_json_list
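
main_page issues every tab request as a gevent greenlet and joins them, so one slow tab does not block the rest. A minimal standalone sketch of that fan-out pattern, where fetch stands in for the blocking HTTP POSTs above:

import gevent

def fetch(name):
    gevent.sleep(0)   # stand-in for a blocking HTTP request
    return name

jobs = [gevent.spawn(fetch, n) for n in ('qyInfo', 'baInfo', 'cfInfo')]
gevent.joinall(jobs)
print([job.value for job in jobs])   # ['qyInfo', 'baInfo', 'cfInfo']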

    def run(self, findCode):
        print self.__class__.__name__
        logging.error('crawl %s .', self.__class__.__name__)
        self.ent_num = str(findCode)
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

        if not self.get_id_num(self.ent_num):
            return json.dumps([{self.ent_num: None}])

        data = self.main_page()
        return json.dumps(data)
Пример #29
0
class HeilongjiangClawer(Crawler):
    """黑龙江工商公示信息网页爬虫
    """
    # storage path for html data
    html_restore_path = settings.json_restore_path + '/heilongjiang/'

    # storage path for the captcha image
    ckcode_image_path = settings.json_restore_path + '/heilongjiang/ckcode.jpg'
    code_cracker = CaptchaRecognition('heilongjiang')
    # lock protecting writes to the final json file when crawling with multiple threads
    write_file_mutex = threading.Lock()

    urls = {
        'host':
        'www.hljaic.gov.cn',
        'get_checkcode':
        'http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0',
        'post_checkcode':
        'http://gsxt.hljaic.gov.cn/checkCheckNo.jspx',
        'get_info_entry':
        'http://gsxt.hljaic.gov.cn/searchList.jspx',
        'ind_comm_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=',
        'ent_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=',
        'other_dept_pub_skeleton':
        'http://gsxt.hljaic.gov.cn/otherDepartment.jspx?id=',
        'judical_assist_skeleton':
        'http://gsxt.hljaic.gov.cn/justiceAssistance.jspx?id=',
        'ind_comm_pub_reg_shareholder':
        'http://gsxt.hljaic.gov.cn/QueryInvList.jspx?',  # shareholder info
        'ind_comm_pub_reg_modify':
        'http://gsxt.hljaic.gov.cn/QueryAltList.jspx?',  # modification info pagination
        'ind_comm_pub_arch_key_persons':
        'http://gsxt.hljaic.gov.cn/QueryMemList.jspx?',  # key personnel info pagination
        'ind_comm_pub_spot_check':
        'http://gsxt.hljaic.gov.cn/QuerySpotCheckList.jspx?',  # spot-check info pagination
        'ind_comm_pub_movable_property_reg':
        'http://gsxt.hljaic.gov.cn/QueryMortList.jspx?',  # movable property mortgage registration pagination
        'ind_comm_pub_business_exception':
        'http://gsxt.hljaic.gov.cn/QueryExcList.jspx?',  # business abnormality info
        'shareholder_detail':
        'http://gsxt.hljaic.gov.cn/queryInvDetailAction.jspx?id=',  # investor details
        'movable_property_reg_detail':
        'http://gsxt.hljaic.gov.cn/mortInfoDetail.jspx?id=',  # movable property mortgage registration details
        'annual_report':
        'http://gsxt.hljaic.gov.cn/QueryYearExamineDetail.jspx?id=',  # enterprise annual report details
    }

    def __init__(self, json_restore_path):
        self.json_restore_path = json_restore_path
        self.parser = HeilongjiangParser(self)
        self.img_count = 1
        if not os.path.exists(self.html_restore_path):
            os.makedirs(self.html_restore_path)

    def run(self, ent_number=0):
        """爬取的主函数
        """
        return Crawler.run(self, ent_number)
        # return super(HeilongjiangClawer, self).run(ent_number)

    def crawl_check_page(self):
        """爬取验证码页面,包括下载验证码图片以及破解验证码
        :return true or false
        """
        count = 0
        while count < 10:
            ck_code = self.crack_check_code()

            data = {'checkNo': ck_code}
            resp = self.reqst.post(self.urls['post_checkcode'], data=data)

            if resp.status_code != 200:
                logging.error("crawl post check page failed!")
                count += 1
                continue
            if resp.content[10] == 't':
                data = {'checkNo': ck_code, 'entName': self.ent_number}
                resp = self.reqst.post(self.urls['get_info_entry'], data=data)
                soup = BeautifulSoup(resp.text, "html5lib")
                div = soup.find("div", {"style": "height:500px;"})
                a = div.find("a")
                if a:
                    company_id = a["href"].split('?')[1]
                    self.company_id = company_id.split("=")[1]
                    return True
                else:
                    return False
            else:
                logging.error("crawl post check page failed!")
                count += 1
                continue
        return False
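
crawl_check_page extracts the company id from the first result link by splitting its href on '?' and '='. A tiny standalone check of that split; the href is a made-up sample in the same shape:

href = 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=42abc0de'
company_id = href.split('?')[1].split('=')[1]
print(company_id)   # 42abc0de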

    def crack_check_code(self):
        """破解验证码
        :return 破解后的验证码
        """
        resp = self.reqst.get(self.urls['get_checkcode'])
        if resp.status_code != 200:
            logging.error('failed to get get_checkcode')
            return None

        time.sleep(random.uniform(2, 4))

        self.write_file_mutex.acquire()
        with open(self.ckcode_image_path, 'wb') as f:
            f.write(resp.content)
        try:
            ckcode = self.code_cracker.predict_result(self.ckcode_image_path)
        except Exception as e:
            logging.warn('exception occurred when cracking checkcode')
            ckcode = ('', '')
        finally:
            pass
        self.write_file_mutex.release()

        return ckcode[1]

    def crawl_ind_comm_pub_pages(self):
        """爬取工商公示信息
        """
        url = "%s%s" % (self.urls['ind_comm_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ind_comm_pub_skeleton')
        self.parser.parse_ind_comm_pub_pages(resp.content)

    def crawl_ent_pub_pages(self):
        """爬取企业公示信息
        """
        url = "%s%s" % (self.urls['ent_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get ent_pub_skeleton')
        self.parser.parse_ent_pub_pages(resp.content)

    def crawl_other_dept_pub_pages(self):
        """爬取其他部门公示信息
        """
        url = "%s%s" % (self.urls['other_dept_pub_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get other_dept_pub_skeleton')
        self.parser.parse_other_dept_pub_pages(resp.content)

    def crawl_judical_assist_pub_pages(self):
        """爬取司法协助信息
        """
        url = "%s%s" % (self.urls['judical_assist_skeleton'], self.company_id)
        resp = self.reqst.get(url)
        if resp.status_code != 200:
            logging.error('failed to get judical_assist_skeleton')
        self.parser.parse_judical_assist_pub_pages(resp.content)
Пример #30
0
    def __init__(self, json_restore_path):
        # self.cur_time = str(int(time.time()*1000))
        self.id = None
        self.reqst = requests.Session()
        self.json_restore_path = json_restore_path
        self.ckcode_image_path = settings.json_restore_path + '/guangxi/ckcode.jpg'
        self.code_cracker = CaptchaRecognition('guangxi')

        self.result_json_dict = {}
        self.reqst.headers.update({
            'Accept':
            'text/html, application/xhtml+xml, */*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:39.0) Gecko/20100101 Firefox/39.0'
        })

        self.mydict = {
            'eareName':
            'http://gxqyxygs.gov.cn',
            'search':
            'http://gxqyxygs.gov.cn/search.jspx',
            'searchList':
            'http://gxqyxygs.gov.cn/searchList.jspx',
            'validateCode':
            'http://gxqyxygs.gov.cn/validateCode.jspx?type=0&id=0.6145392225593206'
        }

        self.search_dict = {
            'eareName': 'http://gxqyxygs.gov.cn',
            'search': 'http://222.143.24.157/search.jspx',
            'validateCode':
            'http://222.143.24.157/validateCode.jspx?type=0&id=0.8720359673599201',
            'searchList': 'http://222.143.24.157/searchList.jspx',
            'businessPublicity':
            'http://222.143.24.157/businessPublicity.jspx?',
            'enterprisePublicity':
            'http://222.143.24.157/enterprisePublicity.jspx?',
            'otherDepartment': 'http://222.143.24.157/otherDepartment.jspx?',
            'justiceAssistance':
            'http://222.143.24.157/justiceAssistance.jspx?',
            'next_head': 'http://gxqyxygs.gov.cn/Query'
        }

        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
        self.result_json_dict = {}