def initConfigValidateCode(self):
    """
    Build the "fetch captcha" module (id ``init_validate_code``) for
    gsxt.hljaic.gov.cn and append it to ``self.module_manager``.

    The module is bound to ``self.visitValidateCode``. An ASSERT_FAILED
    event (retry_times=100) is registered whose assert function fails
    when no captcha value (``yzm``) is available.
    :return: None
    """
    captcha_module = Module(self.visitValidateCode, u"获取验证码")
    captcha_module.module_id = "init_validate_code"

    # A fresh random value is appended to the query string on every build.
    captcha_url = ("http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0&id="
                   + str(random.random()))
    captcha_module.appendUrl(captcha_url)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }
    captcha_module.appendHeaders(request_headers)
    captcha_module.addSleep(Sleep(3))
    captcha_module.appendEncoding("utf-8")

    def checkValidatecode(yzm):
        # Passes only when a captcha value is present; logs otherwise.
        if yzm:
            return True
        self.holder.logging.warning(u"获取验证码失败")
        return False

    assert_event = Event(EventType.ASSERT_FAILED,
                         retry_times=100,
                         assert_function=checkValidatecode)
    captcha_module.addEvent(assert_event)
    self.module_manager.appendSubModule(captcha_module, True)
def initToken(self):
    """
    Build the "token acquisition" module (id ``module_token``) for
    www.sgs.gov.cn and append it to ``self.module_manager``.

    The module is bound to ``self.getWebHtml`` and exposes one
    function-typed output named ``token``, extracted from the fetched
    HTML by the nested ``getToken``. EXCEPTION_OCCURED and
    OUTPUT_NOT_SATISFIED events are registered with retry_times=100.
    :return: None
    """
    module = Module(self.getWebHtml, u"令牌获取")
    module.module_id = "module_token"
    module.appendUrl('https://www.sgs.gov.cn/notice/search/popup_captcha')
    module.appendHeaders({
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.sgs.gov.cn/notice/home'
    })
    module.appendEncoding("utf-8")

    def getToken(html):
        # Returns the session token string, or None when it cannot be
        # located or extracted.
        if not html or '\"session.token\": \"' not in html:
            self.holder.logging.error(u'获取session.token失败!')
            return None
        # The prefix check above does not guarantee a closing quote on the
        # same line, so the regex may still fail to match; guard against
        # a None match instead of raising AttributeError.
        match = re.search(r'\"session\.token\": \"(.*?)\"', html)
        token = match.group(1) if match else None
        if not token:
            self.holder.logging.error(u'提取token失败!')
            # Stop here: an empty token must not be logged and returned
            # as if extraction had succeeded.
            return None
        self.holder.logging.info('token: %s' % token)
        return token

    module.appendOutput(name="token",
                        type=OutputType.FUNCTION,
                        function=getToken)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
    self.module_manager.appendSubModule(module)
def initConfigValidateCode(self):
    """
    Build the "fetch captcha" module (id ``init_validate_code``) for
    www.nmgs.gov.cn:7001 and append it to ``self.module_manager``.

    The module is bound to ``self.visitValidateCode``. An ASSERT_FAILED
    event (retry_times=100) is registered whose assert function fails
    when the captcha value ``yzm`` is falsy.
    :return: None
    """
    captcha_module = Module(self.visitValidateCode, u"获取验证码")
    captcha_module.module_id = "init_validate_code"

    # A fresh random value is appended to the query string on every build.
    verify_url = ("http://www.nmgs.gov.cn:7001/aiccips/verify.html?random="
                  + str(random.random()))
    captcha_module.appendUrl(verify_url)

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Accept': 'image/webp,*/*;q=0.8',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    captcha_module.appendHeaders(request_headers)
    captcha_module.addSleep(Sleep(3))
    captcha_module.appendEncoding("utf-8")

    def checkValidatecode(yzm):
        # Passes only when a captcha value is present.
        return True if yzm else False

    assert_event = Event(EventType.ASSERT_FAILED,
                         retry_times=100,
                         assert_function=checkValidatecode)
    captcha_module.addEvent(assert_event)
    self.module_manager.appendSubModule(captcha_module, True)
def getCmpnySereachList(self):
    """
    Build the "fetch company list" module (id ``get_search_list``) for
    www.nmgs.gov.cn:7001 and append it to ``self.module_manager``.

    The module POSTs ``code`` (from ``yzm``) and ``textfield`` and
    declares three outputs: ``url_list``, ``name_list`` and the zipped
    ``search_list``.
    :return: None
    """
    search_module = Module(self.visitSearchList, u"抓取公司列表")
    search_module.module_id = "get_search_list"
    search_module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html")

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    search_module.appendHeaders(request_headers)
    search_module.appendWebMethod("post")
    search_module.appendEncoding("utf-8")

    def buildPostData(yzm, textfield):
        # Removes every literal backslash-n sequence (raw string, not a
        # newline character) from the text field before posting.
        return {
            "code": yzm,
            "textfield": textfield.replace(r"\n", "")
        }

    search_module.appendPostData(buildPostData)
    search_module.addSleep(Sleep(3))

    search_module.appendOutput("url_list",
                               ".//*[@class='list']/ul/li/a/@href",
                               OutputType.LIST)
    search_module.appendOutput("name_list",
                               ".//*[@class='list']/ul/li/a/text()",
                               OutputType.LIST)
    search_module.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # The assertion fails (with no retries) when the report's access type
    # is NON_COMPANY.
    search_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))

    # Both failure kinds share the same settings: retry_times=100 and
    # redo_module pointing at the captcha module.
    for failure_type in (EventType.EXCEPTION_OCCURED,
                         EventType.OUTPUT_NOT_SATISFIED):
        search_module.addEvent(
            Event(failure_type,
                  retry_times=100,
                  redo_module="init_validate_code"))

    self.module_manager.appendSubModule(search_module)
def checkValidateCode(self):
    """
    Build the "verify captcha" module (id ``check_validate_code``) for
    www.nmgs.gov.cn:7001 and append it to ``self.module_manager``.

    The module POSTs ``code`` (from ``yzm``) and ``textfield`` (from
    ``company_key``). The nested assert function inspects the quoted
    substrings of the response body and, on success, stores the decoded
    fourth one in ``self.value_dict["textfield"]``.
    :return: None
    """
    verify_module = Module(self.getJson, u"检验验证码")
    verify_module.module_id = "check_validate_code"
    verify_module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html")

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    verify_module.appendHeaders(request_headers)
    verify_module.appendWebMethod("post")
    verify_module.addSleep(Sleep(3))
    verify_module.appendEncoding("utf-8")

    def buildPostData(yzm, company_key):
        return {
            "code": yzm,
            "textfield": company_key
        }

    verify_module.appendPostData(buildPostData)

    def checkValidatecode(web=None):
        if not web:
            return False

        # Collect every double-quoted substring of the response body.
        quoted = re.compile(r'\"([\s\S]*?)\"').findall(str(web.body))

        # Exactly four quoted pieces are expected, the third being the
        # literal key 'textfield'; when the first is 'flag', the second
        # must be '1'.
        malformed = len(quoted) != 4 or quoted[2] != 'textfield'
        if malformed or (quoted[0] == 'flag' and quoted[1] != str(1)):
            self.holder.logging.warning(u"验证码校验失败!")
            return False

        self.value_dict["textfield"] = quoted[3].decode('raw_unicode_escape')
        return True

    verify_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(verify_module, True)
def getCmpnySereachList(self):
    """
    Build the "fetch company list" module (id ``get_search_list``) for
    218.95.241.36 and append it to ``self.module_manager``.

    The module POSTs ``checkNo`` (from ``yzm``) and ``entName`` (from
    ``company_key``).
    :output: url_list, name_list, search_list
    :return: None
    """
    search_module = Module(self.visitSearchList, u"抓取公司列表")
    search_module.module_id = "get_search_list"
    search_module.appendUrl("http://218.95.241.36/searchList.jspx")

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Origin": "http://218.95.241.36",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://218.95.241.36/search.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    search_module.appendHeaders(request_headers)
    search_module.appendWebMethod("post")
    search_module.appendEncoding("utf-8")

    def buildPostData(yzm, company_key):
        return {
            "checkNo": yzm,
            "entName": company_key
        }

    search_module.appendPostData(buildPostData)
    search_module.addSleep(Sleep(3))

    search_module.appendOutput("url_list",
                               ".//*[@class='list']/ul/li/a/@href",
                               OutputType.LIST)
    search_module.appendOutput("name_list",
                               ".//*[@class='list']/ul/li/a/text()",
                               OutputType.LIST)
    search_module.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # The assertion fails (with no retries) when the report's access type
    # is NON_COMPANY.
    search_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))

    # Both failure kinds share the same settings: retry_times=100 and
    # redo_module pointing at the captcha module.
    for failure_type in (EventType.EXCEPTION_OCCURED,
                         EventType.OUTPUT_NOT_SATISFIED):
        search_module.addEvent(
            Event(failure_type,
                  retry_times=100,
                  redo_module="init_validate_code"))

    self.module_manager.appendSubModule(search_module)
def getStockholderInfo(self, module_super):
    """
    Build the paged "fetch shareholder info" module (id
    ``get_stockholder_info``) for 218.95.241.36 and append it to
    ``module_super``.

    The URL and the Referer header are derived from ``pno`` and
    ``company_id``. A function-typed optional output returns a dict with
    key ``gdxq_list`` (the ``onclick`` attributes found in the page), or
    an empty dict when parsing fails.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    holder_module = Module(self.visitGdxx, u"抓取股东信息")
    holder_module.module_id = "get_stockholder_info"
    holder_module.appendUrl(
        lambda pno, company_id:
        "http://218.95.241.36/QueryInvList.jspx?pno=%s&mainId=%s" % (
            pno, company_id))

    def buildHeaders(company_id):
        return {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    holder_module.appendHeaders(buildHeaders)
    holder_module.appendEncoding("utf-8")
    holder_module.addSleep(Sleep(3))

    def getGdxqList(html):
        # Any parsing failure is logged and reported as an empty dict.
        try:
            onclick_values = etree.HTML(html).xpath(
                ".//*[@class='detailsList']/tr/td/a/@onclick")
        except Exception as e:
            self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
            return dict()
        return {"gdxq_list": onclick_values}

    holder_module.appendOutput(type=OutputType.FUNCTION,
                               function=getGdxqList,
                               show_up=OutputParameterShowUpType.OPTIONAL)
    holder_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    module_super.appendSubModule(holder_module, True)
def getAnnalsInfo(self, module_super):
    """
    Build the "iterate annual report list" module and its "fetch annual
    report detail" sub-module (id ``get_annals_info``) for 218.95.241.36.

    The outer module iterates the ``annals_list`` seeds under the
    parameter name ``annals`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitQynb``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    annals_iter = Iterator("annals_list", "annals")
    loop_module = Module(None, u"遍历年报列表", annals_iter)
    module_super.appendSubModule(loop_module, True)

    detail_module = Module(self.visitQynb, u"抓取年报详情")
    detail_module.module_id = "get_annals_info"

    def prepareParams(annals):
        # An entry needs at least two items: [0] -> nb_url, [1] -> nb_name.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    detail_module.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        # Prefix the report path with the site root; None when no path.
        return u'http://218.95.241.36' + nb_url if nb_url else None

    detail_module.appendUrl(getURL)

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    detail_module.appendHeaders(request_headers)
    detail_module.appendEncoding("utf-8")
    detail_module.addSleep(Sleep(3))
    detail_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_info"))
    loop_module.appendSubModule(detail_module)
def getStockholderInfo(self, module_super):
    """
    Build the paged "fetch shareholder info" module (id
    ``get_stockholder_info``) for gsxt.hljaic.gov.cn and append it to
    ``module_super``.

    The URL and the Referer header are derived from ``pno`` and
    ``company_id``. A function-typed optional output returns a dict with
    key ``gdxq_list`` (the ``onclick`` attributes found in the page), or
    an empty dict when parsing fails.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    holder_module = Module(self.visitGdxx, u"抓取股东信息")
    holder_module.module_id = "get_stockholder_info"
    holder_module.appendUrl(
        lambda pno, company_id:
        "http://gsxt.hljaic.gov.cn/QueryInvList.jspx?pno=%s&mainId=%s" % (
            pno, company_id))

    def buildHeaders(company_id):
        return {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
        }

    holder_module.appendHeaders(buildHeaders)
    holder_module.appendEncoding("utf-8")
    holder_module.addSleep(Sleep(3))

    def getGdxqList(html):
        # Any parsing failure is logged and reported as an empty dict.
        try:
            onclick_values = etree.HTML(html).xpath(
                ".//*[@class='detailsList']/tr/td/a/@onclick")
        except Exception as e:
            self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
            return dict()
        return {"gdxq_list": onclick_values}

    holder_module.appendOutput(type=OutputType.FUNCTION,
                               function=getGdxqList,
                               show_up=OutputParameterShowUpType.OPTIONAL)
    holder_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    module_super.appendSubModule(holder_module, True)
def getCmpnySereachList(self):
    """
    Build the "fetch company list" module (id ``get_search_list``) for
    gsxt.hljaic.gov.cn and append it to ``self.module_manager``.

    The module POSTs ``checkNo`` (from ``yzm``) and ``entName`` (from
    ``company_key``) and declares three outputs: ``url_list``,
    ``name_list`` and the zipped ``search_list``.
    :return: None
    """
    search_module = Module(self.visitSearchList, u"抓取公司列表")
    search_module.module_id = "get_search_list"
    search_module.appendUrl("http://gsxt.hljaic.gov.cn/searchList.jspx")

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/plain, */*; q=0.01",
        "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }
    search_module.appendHeaders(request_headers)
    search_module.appendWebMethod("post")
    search_module.appendEncoding("utf-8")

    def buildPostData(yzm, company_key):
        return {
            "checkNo": yzm,
            "entName": company_key
        }

    search_module.appendPostData(buildPostData)
    search_module.addSleep(Sleep(3))

    search_module.appendOutput("url_list",
                               ".//*[@class='list']/ul/li/a/@href",
                               OutputType.LIST)
    search_module.appendOutput("name_list",
                               ".//*[@class='list']/ul/li/a/text()",
                               OutputType.LIST)
    search_module.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # The assertion fails (with no retries) when the report's access type
    # is NON_COMPANY.
    search_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))

    # Both failure kinds share the same settings: retry_times=100 and
    # redo_module pointing at the captcha module.
    for failure_type in (EventType.EXCEPTION_OCCURED,
                         EventType.OUTPUT_NOT_SATISFIED):
        search_module.addEvent(
            Event(failure_type,
                  retry_times=100,
                  redo_module="init_validate_code"))

    self.module_manager.appendSubModule(search_module)
def getAnnalsInfo(self, module_super):
    """
    Build the "iterate annual report list" module and its "fetch annual
    report detail" sub-module (id ``get_annals_info``) for
    gsxt.hljaic.gov.cn.

    The outer module iterates the ``annals_list`` seeds under the
    parameter name ``annals`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitQynb``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    annals_iter = Iterator("annals_list", "annals")
    loop_module = Module(None, u"遍历年报列表", annals_iter)
    module_super.appendSubModule(loop_module, True)

    detail_module = Module(self.visitQynb, u"抓取年报详情")
    detail_module.module_id = "get_annals_info"

    def prepareParams(annals):
        # An entry needs at least two items: [0] -> nb_url, [1] -> nb_name.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    detail_module.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        # Prefix the report path with the site root; None when no path.
        return u'http://gsxt.hljaic.gov.cn' + nb_url if nb_url else None

    detail_module.appendUrl(getURL)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }
    detail_module.appendHeaders(request_headers)
    detail_module.appendEncoding("utf-8")
    detail_module.addSleep(Sleep(3))
    detail_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_info"))
    loop_module.appendSubModule(detail_module)
def checkValidateCode(self):
    """
    Build the "verify captcha" module (id ``check_validate_code``) for
    218.95.241.36, which checks the captcha produced by the previous
    module, and append it to ``self.module_manager``.

    The module POSTs ``checkNo`` (from ``yzm``). The assertion passes
    only when the response contains ``{success:true}``; both registered
    events name ``init_validate_code`` as their redo module.
    :return: None
    """
    verify_module = Module(self.getJson, u"校验验证码")
    verify_module.module_id = "check_validate_code"
    verify_module.appendUrl('http://218.95.241.36/checkCheckNo.jspx')

    request_headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Proxy-Connection": "keep-alive",
        "Host": "218.95.241.36",
        "Origin": "http://218.95.241.36",
        "Referer": "http://218.95.241.36/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    verify_module.appendHeaders(request_headers)
    verify_module.appendWebMethod("post")
    verify_module.appendEncoding("utf-8")
    verify_module.appendPostData(lambda yzm: {"checkNo": yzm})

    def checkValidatecode(json=None):
        # Success requires a non-empty response containing the marker.
        if json and "{success:true}" in json:
            return True
        self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
        return False

    verify_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    verify_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(verify_module, True)
def fetchCmpnyGdxq(self, module_super):
    """
    Build the "iterate shareholder details" module and its "fetch
    shareholder detail" sub-module for www.nmgs.gov.cn:7001.

    The outer module iterates the ``gdxq_list`` seeds under the
    parameter name ``gdxq`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitGdxq``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    gdxq_iter = Iterator(seeds="gdxq_list", param_name="gdxq")
    loop_module = Module(iterator=gdxq_iter, name=u"遍历股东详情")
    module_super.appendSubModule(loop_module)

    detail_module = Module(self.visitGdxq, u"抓取股东详情")

    # TODO: add try/except handling
    def getURL(gdxq):
        # Take the text between "('" and "')" of the seed value and use
        # it unchanged as the URL; None when nothing matches.
        if not gdxq:
            return None
        quoted_args = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        return quoted_args[0] if quoted_args else None

    detail_module.appendUrl(getURL)

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    detail_module.appendHeaders(request_headers)
    detail_module.appendEncoding("utf-8")
    detail_module.addSleep(Sleep(3))
    detail_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    loop_module.appendSubModule(detail_module)
def fetchStockholderDetail(self, module_super):
    """
    Build the "iterate shareholder details" module and its "fetch
    shareholder detail" sub-module for gsxt.hljaic.gov.cn.

    The outer module iterates the ``gdxq_list`` seeds under the
    parameter name ``gdxq`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitGdxq``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    gdxq_iter = Iterator(seeds="gdxq_list", param_name="gdxq")
    loop_module = Module(iterator=gdxq_iter, name=u"遍历股东详情")
    module_super.appendSubModule(loop_module)

    detail_module = Module(self.visitGdxq, u"抓取股东详情")

    def getURL(gdxq):
        # Take the text between "('" and "')" of the seed value and
        # prefix it with the site root; None when nothing matches.
        if not gdxq:
            return None
        quoted_args = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        if not quoted_args:
            return None
        return u"http://gsxt.hljaic.gov.cn" + quoted_args[0]

    detail_module.appendUrl(getURL)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }
    detail_module.appendHeaders(request_headers)
    detail_module.appendEncoding("utf-8")
    detail_module.addSleep(Sleep(3))
    detail_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    loop_module.appendSubModule(detail_module)
def getCompanyRecordInfo(self, module_super):
    """
    Build the "fetch record (filing) info" module (id
    ``get_record_info``) for www.nmgs.gov.cn:7001 and append it to
    ``module_super``.

    The module POSTs ``entNo``, ``entType`` and ``regOrg``, taken from
    indexes 1, 2 and 3 of ``params_list``.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    record_module = Module(self.visitBaxx, u"抓取备案信息")
    record_module.module_id = "get_record_info"
    record_module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entCheckInfo")

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entInfo",
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    record_module.appendHeaders(request_headers)
    record_module.appendEncoding("utf-8")
    record_module.addSleep(Sleep(3))
    record_module.appendWebMethod("post")

    def buildPostData(params_list):
        return {
            "entNo": str(params_list[1]),
            "entType": str(params_list[2]),
            "regOrg": str(params_list[3])
        }

    record_module.appendPostData(buildPostData)
    record_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module_super.appendSubModule(record_module, True)
def initConfigValidateCode(self):
    """
    Build the "fetch captcha" module (id ``init_validate_code``) for
    218.95.241.36 and check that a captcha result was produced.

    The module is bound to ``self.visitValidateCode`` and appended to
    ``self.module_manager``. An ASSERT_FAILED event (retry_times=100) is
    registered whose assert function fails when ``yzm`` is falsy.
    :return: None
    """
    captcha_module = Module(self.visitValidateCode, u"获取验证码")
    captcha_module.module_id = "init_validate_code"
    captcha_module.appendUrl('http://218.95.241.36/validateCode.jspx?type=0')

    request_headers = {
        "Accept": "image/webp,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Proxy-Connection": "keep-alive",
        "Host": '218.95.241.36',
        "Referer": "http://218.95.241.36/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
    }
    captcha_module.appendHeaders(request_headers)
    captcha_module.addSleep(Sleep(3))
    captcha_module.appendEncoding("utf-8")

    def checkYZM(yzm=None):
        # Passes only when a captcha value is present; logs otherwise.
        if yzm:
            return True
        self.holder.logging.warning(u"获取验证码失败")
        return False

    assert_event = Event(EventType.ASSERT_FAILED,
                         retry_times=100,
                         assert_function=checkYZM)
    captcha_module.addEvent(assert_event)
    self.module_manager.appendSubModule(captcha_module, True)
def fetchPunishInfo(self, module_super):
    """
    Build the "iterate administrative-penalty pages" module and its
    "fetch administrative-penalty info" sub-module for 218.95.241.36.

    The outer module iterates the ``xzcf_page_range`` seeds under the
    parameter name ``pno`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitXzcf``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    page_iter = Iterator(seeds="xzcf_page_range", param_name="pno")
    loop_module = Module(iterator=page_iter, name=u"遍历行政处罚翻页")
    module_super.appendSubModule(loop_module)

    page_module = Module(self.visitXzcf, u"抓取行政处罚信息")
    # A fresh random value is placed in the ``ran`` query parameter each
    # time the URL is built.
    page_module.appendUrl(
        lambda pno, company_id:
        "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random())))

    def buildHeaders(company_id):
        return {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate, sdch',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    page_module.appendHeaders(buildHeaders)
    page_module.appendEncoding("utf-8")
    page_module.addSleep(Sleep(3))
    page_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    loop_module.appendSubModule(page_module)
def fetchStockholderDetail(self, module_super):
    """
    Build the "iterate shareholder details" module and its "fetch
    shareholder detail" sub-module for 218.95.241.36.

    The outer module iterates the ``gdxq_list`` seeds under the
    parameter name ``gdxq`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitGdxq``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    gdxq_iter = Iterator(seeds="gdxq_list", param_name="gdxq")
    loop_module = Module(iterator=gdxq_iter, name=u"遍历股东详情")
    module_super.appendSubModule(loop_module)

    detail_module = Module(self.visitGdxq, u"抓取股东详情")

    def getURL(gdxq):
        # Take the text between "('" and "')" of the seed value and
        # prefix it with the site root; None when nothing matches.
        if not gdxq:
            return None
        quoted_args = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        if not quoted_args:
            return None
        return "http://218.95.241.36" + quoted_args[0]

    detail_module.appendUrl(getURL)

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    detail_module.appendHeaders(request_headers)
    detail_module.appendEncoding("utf-8")
    detail_module.addSleep(Sleep(3))
    detail_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    loop_module.appendSubModule(detail_module)
def initConfigBaseInfo(self, module_super):
    """
    Build the "basic info" module for www.sgs.gov.cn and append it to
    ``module_super``.

    The module is bound to ``self.visitJbxx`` and declares one optional
    list output, ``gdxq_list``, taken from the link hrefs under the
    element with id ``investorTable``.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    base_module = Module(self.visitJbxx, u"基本信息")
    # NOTE(review): the string "company_url" is passed as-is; presumably
    # the framework resolves it as a parameter name - confirm.
    base_module.appendUrl("company_url")

    request_headers = {
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.sgs.gov.cn/notice/search/ent_info_list'
    }
    base_module.appendHeaders(request_headers)
    base_module.appendEncoding("utf-8")

    investor_links_xpath = ".//*[@id='investorTable']//td/a/@href"
    base_module.appendOutput(name="gdxq_list",
                             xpath=investor_links_xpath,
                             type=OutputType.LIST,
                             show_up=OutputParameterShowUpType.OPTIONAL)
    module_super.appendSubModule(base_module, True)
def fetchPunishInfo(self, module_super):
    """
    Build the "iterate administrative-penalty pages" module and its
    "fetch administrative-penalty info" sub-module for
    gsxt.hljaic.gov.cn.

    The outer module iterates the ``xzcf_page_range`` seeds under the
    parameter name ``pno`` and is appended to ``module_super``; the
    inner module is bound to ``self.visitXzcf``.
    :param module_super: parent module that receives the iterating module
    :return: None
    """
    page_iter = Iterator(seeds="xzcf_page_range", param_name="pno")
    loop_module = Module(iterator=page_iter, name=u"遍历行政处罚翻页")
    module_super.appendSubModule(loop_module)

    page_module = Module(self.visitXzcf, u"抓取行政处罚信息")
    # A fresh random value is placed in the ``ran`` query parameter each
    # time the URL is built.
    page_module.appendUrl(
        lambda pno, company_id:
        "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random())))

    def buildHeaders(company_id):
        return {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
        }

    page_module.appendHeaders(buildHeaders)
    page_module.appendEncoding("utf-8")
    page_module.addSleep(Sleep(3))
    page_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    loop_module.appendSubModule(page_module)
def checkValidateCode(self):
    """
    Build the "verify captcha" module (id ``check_validate_code``) for
    gsxt.hljaic.gov.cn and append it to ``self.module_manager``.

    The module POSTs ``checkNo`` (from ``yzm``). The assertion passes
    only when the response contains ``{success:true}``; both registered
    events name ``init_validate_code`` as their redo module.
    :return: None
    """
    verify_module = Module(self.getJson, u"校验验证码")
    verify_module.module_id = "check_validate_code"
    verify_module.appendUrl('http://gsxt.hljaic.gov.cn/checkCheckNo.jspx')

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }
    verify_module.appendHeaders(request_headers)
    verify_module.appendWebMethod("post")
    verify_module.appendEncoding("utf-8")
    verify_module.appendPostData(lambda yzm: {"checkNo": yzm})

    def checkValidatecode(json=None):
        # Success requires a non-empty response containing the marker.
        if json and "{success:true}" in json:
            return True
        self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
        return False

    verify_module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    verify_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(verify_module, True)
def getAnnalsList(self, module_super):
    """
    Build the "fetch annual report list" module (id ``get_annals_list``)
    for 218.95.241.36 and append it to ``module_super``.

    ``url_id`` is taken from ``company_url`` (the part after the first
    ``=``) and used to build the page URL. The optional output
    ``annals_list`` is a list of ``[url, name]`` pairs.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    list_module = Module(self.getWebHtml, u"抓取年报列表")
    list_module.module_id = "get_annals_list"

    def prepareParams(company_url):
        # Second "="-separated piece of the company URL becomes url_id.
        if not company_url:
            return {}
        return {"url_id": company_url.split("=")[1]}

    list_module.appendInput(InputType.FUNCTION, prepareParams)

    def getURL(url_id=None):
        if not url_id:
            return None
        return u'http://218.95.241.36/enterprisePublicity.jspx?id=' + url_id

    list_module.appendUrl(getURL)

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    list_module.appendHeaders(request_headers)
    list_module.appendEncoding("utf-8")
    list_module.addSleep(Sleep(3))

    def getAnnalsList(html=None):
        # Collect [href, title] for each report link, skipping links whose
        # text is exactly u'详情'. Any failure is logged and yields [].
        try:
            entries = []
            report_links = etree.HTML(html).xpath(
                ".//*[@id='qiyenianbao']/table/tr/td/a")
            for link in report_links:
                href = ''.join(link.xpath('@href')).strip()
                title = ''.join(link.xpath('text()')).replace(u'年度报告', '')
                if title == u'详情':
                    continue
                entries.append([href, title])
            return entries
        except Exception as e:
            self.holder.logging.warning(u"获取annals_list失败: %s" % e)
            return []

    list_module.appendOutput(name='annals_list',
                             type=OutputType.FUNCTION,
                             function=getAnnalsList,
                             show_up=OutputParameterShowUpType.OPTIONAL)
    list_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module_super.appendSubModule(list_module, True)
def getAnnalsList(self, module_super):
    """
    Build the "fetch annual report list" module (id ``get_annals_list``)
    for gsxt.hljaic.gov.cn and append it to ``module_super``.

    ``url_id`` is taken from ``company_url`` (the part after the first
    ``=``) and used to build the page URL. The optional output
    ``annals_list`` is a list of ``[url, name]`` pairs; it is empty when
    the page cannot be parsed.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    module = Module(self.getWebHtml, u"抓取年报列表")
    module.module_id = "get_annals_list"

    def prepareParams(company_url):
        query_dict = {}
        if company_url:
            # Second "="-separated piece of the company URL.
            query_dict["url_id"] = company_url.split("=")[1]
        return query_dict

    module.appendInput(InputType.FUNCTION, prepareParams)

    def getURL(url_id=None):
        if url_id:
            return u'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=' + url_id
        return None

    module.appendUrl(getURL)
    module.appendHeaders({
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    })
    module.appendEncoding("utf-8")
    module.addSleep(Sleep(3))

    def getAnnalsList(html=None):
        qynb_list = []
        try:
            tree = etree.HTML(html)
            _list = tree.xpath(".//*[@id='qiyenianbao']/table/tr/td/a")
            for ll in _list:
                url = ''.join(ll.xpath('@href')).strip()
                name = ''.join(ll.xpath('text()')).replace(u'年度报告', '')
                # Links whose text is exactly u'详情' are skipped.
                if name != u'详情':
                    qynb_list.append([url, name])
        except Exception as e:
            # Was a bare ``except:`` that also swallowed
            # KeyboardInterrupt/SystemExit and left no trace; still
            # best-effort, but the failure is now logged.
            self.holder.logging.warning(u"获取annals_list失败: %s" % e)
            qynb_list = []
        return qynb_list

    module.appendOutput(name='annals_list',
                        type=OutputType.FUNCTION,
                        function=getAnnalsList,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_list"))
    module_super.appendSubModule(module, True)
def getCompanyInfo(self, module_super):
    """
    Build the "fetch company info" module (id ``get_cmpny_info``) for
    www.nmgs.gov.cn:7001 and append it to ``module_super``.

    Two optional list outputs are declared: ``gdxq_list`` and
    ``params_list`` (the values of hidden inputs). After the module is
    appended, two bypasses are attached for ``get_record_info`` and
    ``get_punish_info``; their condition is true when ``params_list`` is
    empty or missing.
    :param module_super: parent module that receives the new sub-module
    :return: None
    """
    info_module = Module(self.visitJbxx, u"抓取公司信息")
    info_module.module_id = "get_cmpny_info"

    def getURL(company_url):
        # URLs containing "http" are used unchanged; anything else has
        # every ".." removed and is appended to the site's base path.
        if "http" in company_url:
            return company_url
        relative_path = company_url.replace('..', '')
        return u"http://www.nmgs.gov.cn:7001/aiccips" + relative_path

    info_module.appendUrl(getURL)

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    info_module.appendHeaders(request_headers)
    info_module.appendEncoding("utf-8")
    info_module.addSleep(Sleep(3))

    info_module.appendOutput(name="gdxq_list",
                             xpath=".//*[@id='invInfo']/table/tr/td/a/@onclick",
                             type=OutputType.LIST,
                             show_up=OutputParameterShowUpType.OPTIONAL)
    info_module.appendOutput(name="params_list",
                             xpath=".//input[@type='hidden']/@value",
                             type=OutputType.LIST,
                             show_up=OutputParameterShowUpType.OPTIONAL)

    info_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    info_module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
    module_super.appendSubModule(info_module, True)

    def checkParmsList(params_list=None):
        # True when there are no hidden-input values.
        return False if params_list else True

    for bypassed_id in ("get_record_info", "get_punish_info"):
        info_module.appendBypass(
            Bypass(condition_fuc=checkParmsList,
                   module_id=bypassed_id,
                   range_global=True))
def getCompanyInfo(self, module_super):
    """
    Build the "fetch company info" module (id ``get_cmpny_info``) for
    218.95.241.36, including the page ranges of the paged sections, and
    append it to ``module_super``.

    Each ``*_page_range`` output is the list ``range(2, total + 1)``
    computed from the total page count that ``self.parse_total_pg``
    returns for the section's pagination table; it is ``[]`` when the
    table is missing, the total is <= 1, or parsing fails.
    :param module_super: parent module that receives the new sub-module
    :output: gdxq_list, company_id, gdxx_page_range, bgxx_page_range,
             baxx_page_range, fzjg_page_range, xzcf_page_range
    :return: None
    """
    module = Module(self.visitJbxx, u"抓取公司信息")
    module.module_id = "get_cmpny_info"

    def getURL(company_url=None):
        # The parameter defaults to None, and `"http" in None` raises
        # TypeError; return None instead, as the other URL builders do.
        if company_url is None:
            return None
        if "http" in company_url:
            return company_url
        return u'http://218.95.241.36' + company_url

    module.appendUrl(getURL)
    module.appendHeaders({
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    })
    module.appendEncoding("utf-8")
    module.addSleep(Sleep(3))

    def getCompanyID(company_url):
        # Returns {"company_id": ...} taken from the "?id=" part of the
        # URL, or an empty dict (after logging) when extraction fails.
        query_dict = dict()
        try:
            query_dict["company_id"] = re.search(
                r'\?id\=([\s\S]*)', company_url).group().split("=")[1]
        except Exception as e:
            self.holder.logging.warning(u"获取company_id失败: %s" % e)
            query_dict = dict()
        return query_dict

    module.appendOutput(name="gdxq_list",
                        xpath=".//*[@id='invDiv']/table/tr/td/a/@onclick",
                        type=OutputType.LIST,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.appendOutput(type=OutputType.FUNCTION, function=getCompanyID)

    def parsePageRange(html, fenye_xpath, info_prefix, warn_format):
        # Shared implementation of the five page-range outputs below:
        # find the pagination table, read its total page count and return
        # pages 2..total. Every failure path yields an empty list.
        try:
            fenye_table = etree.HTML(html).xpath(fenye_xpath)
            if not fenye_table:
                return []
            pageno = self.parse_total_pg(fenye_table[0])
            if pageno <= 1:
                return []
            self.holder.logging.info(info_prefix + str(pageno) +
                                     u"---------------------")
            return range(2, int(pageno) + 1)
        except Exception as e:
            self.holder.logging.warning(warn_format % e)
            return []

    # Shareholder info page range
    def getGdxxPageno(html):
        return parsePageRange(
            html,
            ".//div[@id='invDiv']/following-sibling::table[1]|.//div[@id='invPagination']/table[1]",
            u"------------------股东信息页码: ",
            u"获取股东信息页码失败: %s")

    module.appendOutput(name="gdxx_page_range",
                        type=OutputType.FUNCTION,
                        function=getGdxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    # Change info page range
    def getBgxxPageno(html):
        return parsePageRange(
            html,
            ".//div[@id='altDiv']/following-sibling::table[1]|.//div[@id='altPagination']/table[1]",
            u"------------------变更信息页码: ",
            u"获取变更信息页码失败: %s")

    module.appendOutput(name="bgxx_page_range",
                        type=OutputType.FUNCTION,
                        function=getBgxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    # Record (filing) info page range
    def getBaxxPageno(html):
        return parsePageRange(
            html,
            ".//div[@id='memDiv']/following-sibling::table[1]|.//*[@id='beian']/table[2]",
            u"------------------备案信息页码: ",
            u"获取备案信息页码失败: %s")

    module.appendOutput(name="baxx_page_range",
                        type=OutputType.FUNCTION,
                        function=getBaxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    # Branch office page range
    def getFzjgPageno(html):
        return parsePageRange(
            html,
            ".//div[@id='childPagination']/table[1]|.//div[@id='childDiv']/following-sibling::table[1]",
            u"------------------分支机构信息页码:",
            u"获取分支机构页码失败: %s")

    module.appendOutput(name="fzjg_page_range",
                        type=OutputType.FUNCTION,
                        function=getFzjgPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    # Administrative penalty page range
    def getXzcfPageno(html):
        return parsePageRange(
            html,
            ".//*[@id='xingzhengchufa']/table[2]",
            u"------------------行政处罚信息页码:",
            u"获取行政处罚页码失败: %s")

    module.appendOutput(name="xzcf_page_range",
                        type=OutputType.FUNCTION,
                        function=getXzcfPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def checkCompnayID(company_id=None):
        # The assertion fails (and logs) when no company id was extracted.
        if not company_id:
            self.holder.logging.warning(u"company_id无效")
            return False
        return True

    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=10,
              assert_function=checkCompnayID))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_cmpny_info"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="get_cmpny_info"))
    module_super.appendSubModule(module, True)