def initBasicInfo(self, module_super):
    """Build the basic-info module and attach it to ``module_super``.

    The module is created around ``self.visitJbxx``. Its input is the
    query string of ``company_url`` (see ``getparams``) and its URL is the
    ``businessPublicity.jspx`` address formatted with ``id``.

    Four optional outputs are registered: ``gdxx_page_range``,
    ``bgxx_page_range``, ``baxx_page_range`` and ``fzjg_page_range``. Each
    one is the list of page numbers from 2 up to the page count returned
    by ``self.parse_pageno`` for the matching pagination node, or ``[]``.
    For every output a ``Bypass`` targeting the corresponding ``*_pages``
    module is registered; its condition is true when the range is empty.

    NOTE(review): the nested callbacks keep their original names and
    parameter names (``company_url``, ``id``, ``html``, ``*_page_range``)
    because the framework presumably binds arguments by name -- confirm
    before renaming any of them.

    :param module_super: parent module; must expose ``appendSubModule``.
    """
    module = Module(self.visitJbxx, u"第四步_获取基本信息")

    def getparams(company_url):
        """Return the query-string parameters of ``company_url`` as a dict.

        Values are kept exactly as they appear in the URL (no
        percent-decoding), since they are substituted back into a URL.
        """
        query = {}
        for pair in urlparse.urlparse(company_url).query.split("&"):
            # partition() keeps everything after the first "=", so values
            # that themselves contain "=" are no longer truncated.
            key, sep, value = pair.partition("=")
            if not sep:
                # Empty or "=".less segment: previously an IndexError.
                continue
            query[key] = value
        # Replaces a leftover debug ``print`` statement.
        self.holder.logging.info("query params: " + str(query))
        return query

    module.appendInput(InputType.FUNCTION, getparams)
    # The parameter is named ``id`` on purpose; see the NOTE in the docstring.
    module.appendUrl(lambda id: "http://aic.hainan.gov.cn:1888/businessPublicity.jspx?id=%s" % id)
    module.appendHeaders(
        {
            "Host": "aic.hainan.gov.cn:1888",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://aic.hainan.gov.cn:1888/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
    )
    module.addSleep(Sleep(2))
    module_super.appendSubModule(module, True)

    def extra_pages(html, fenye_xpath, log_prefix=None):
        """Return page numbers ``2..N`` for one paginated section.

        :param html: page source handed to ``etree.HTML``.
        :param fenye_xpath: XPath selecting the pagination node(s); only
            the first match is used.
        :param log_prefix: when given, the page count is logged with this
            prefix.
        :returns: ``[]`` when no pagination node matches or the page count
            is ``<= 1``; otherwise ``range(2, N + 1)``.
        """
        fenye_nodes = etree.HTML(html).xpath(fenye_xpath)
        if not fenye_nodes:
            return []
        pageno = self.parse_pageno(fenye_nodes[0])
        if pageno <= 1:
            return []
        if log_prefix:
            self.holder.logging.info(log_prefix + str(pageno) + "---------------------")
        # int() was only applied by the shareholder extractor before; it is
        # applied everywhere now so all four sections behave the same.
        # NOTE(review): the return type of parse_pageno is not visible here.
        return range(2, int(pageno) + 1)

    # Shareholder info: extra page numbers.
    def getGdxxPageno(html):
        """Return the extra page numbers of the shareholder section."""
        return extra_pages(
            html,
            ".//div[@id='invDiv']/following-sibling::table[1]|.//div[@id='invDiv']/following-sibling::div[1]",
            log_prefix="------------------股东信息页码:")

    module.appendOutput(name="gdxx_page_range", type=OutputType.FUNCTION, function=getGdxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_gdxx(gdxx_page_range=None):
        """Return True when there are no extra shareholder pages."""
        return not gdxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_gdxx, module_id="gdxx_pages"))

    # Change records: extra page numbers.
    def getBgxxPageno(html):
        """Return the extra page numbers of the change-record section."""
        return extra_pages(
            html,
            ".//div[@id='altPagination']/table[1]|.//div[@id='altDiv']/following-sibling::table[1]")

    module.appendOutput(name="bgxx_page_range", type=OutputType.FUNCTION, function=getBgxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_bgxx(bgxx_page_range=None):
        """Return True when there are no extra change-record pages."""
        return not bgxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_bgxx, module_id="bgxx_pages"))

    # Filing info: extra page numbers.
    def getbaxxPageno(html):
        """Return the extra page numbers of the filing-info section."""
        return extra_pages(
            html,
            ".//div[@id='memDiv']/following-sibling::table[1]|.//*[@id='beian']/table[2]")

    module.appendOutput(name="baxx_page_range", type=OutputType.FUNCTION, function=getbaxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_baxx(baxx_page_range=None):
        """Return True when there are no extra filing-info pages."""
        return not baxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_baxx, module_id="baxx_pages"))

    # NOTE: a disabled variant (getbaxx_url) used to bypass "baxx_pages"
    # whenever ".//div[@id='memDiv']" matched nothing, i.e. it skipped the
    # paging request when the page carried no filing data at all.

    # Branch offices: extra page numbers.
    def getfzjgPageno(html):
        """Return the extra page numbers of the branch-office section."""
        return extra_pages(
            html,
            ".//div[@id='childPagination']/table[1]|.//div[@id='childDiv']/following-sibling::table[1]")

    module.appendOutput(name="fzjg_page_range", type=OutputType.FUNCTION, function=getfzjgPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_fzjg(fzjg_page_range=None):
        """Return True when there are no extra branch-office pages."""
        return not fzjg_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_fzjg, module_id="fzjg_pages"))
def getCompanyInfo(self, module_super):
    """Build the "get_cmpny_info" module and attach it to ``module_super``.

    The module is created around ``self.visitJbxx``. It resolves the
    company URL through ``getURL``, sets request headers, the "utf-8"
    encoding and a ``Sleep(3)``, and registers two optional list outputs
    (``gdxq_list`` and ``params_list``). Two events are added, each with
    ``retry_times=100``. After the module is appended to ``module_super``,
    two global-range ``Bypass`` entries are registered for
    "get_record_info" and "get_punish_info"; their shared condition is
    true when ``params_list`` is empty or missing.

    :param module_super: parent module; must expose ``appendSubModule``.
    """
    company_module = Module(self.visitJbxx, u"抓取公司信息")
    company_module.module_id = "get_cmpny_info"

    def getURL(company_url):
        """Return ``company_url`` unchanged if it contains "http".

        Otherwise strip every ".." from it and prefix the site root.
        """
        if "http" not in company_url:
            return u"http://www.nmgs.gov.cn:7001/aiccips" + company_url.replace(
                '..', '')
        return company_url

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    company_module.appendUrl(getURL)
    company_module.appendHeaders(request_headers)
    company_module.appendEncoding("utf-8")
    company_module.addSleep(Sleep(3))

    company_module.appendOutput(
        name="gdxq_list",
        xpath=".//*[@id='invInfo']/table/tr/td/a/@onclick",
        type=OutputType.LIST,
        show_up=OutputParameterShowUpType.OPTIONAL)
    company_module.appendOutput(
        name="params_list",
        xpath='.//input[@type=\'hidden\']/@value',
        type=OutputType.LIST,
        show_up=OutputParameterShowUpType.OPTIONAL)

    # Both event types get the same retry budget.
    for event_type in (EventType.EXCEPTION_OCCURED, EventType.OUTPUT_NOT_SATISFIED):
        company_module.addEvent(Event(event_type, retry_times=100))

    module_super.appendSubModule(company_module, True)

    def checkParmsList(params_list=None):
        """Return True when ``params_list`` is empty or missing."""
        return not params_list

    # The same condition is used for both target modules.
    for target_module_id in ("get_record_info", "get_punish_info"):
        company_module.appendBypass(
            Bypass(condition_fuc=checkParmsList, module_id=target_module_id,
                   range_global=True))