def precheckCmpyKey(self):
    """Register a sub-module that validates the search keyword.

    The assert function attached to the module rejects a keyword when:
      1. its length is shorter than 2 or longer than 60 characters;
      2. it contains one of the forbidden punctuation characters;
      3. after removing parentheses, it contains anything other than
         word characters or CJK ideographs (U+4E00..U+9FA5).

    :return: None
    """
    module = Module(None, u"查询条件检查")

    # Characters that are rejected wherever they appear in the keyword.
    forbidden_chars = (",", "'", '"', "<", ">", ";", "_")

    def checkCmpyKey(company_key):
        """Return True when *company_key* is an acceptable search keyword."""
        try:
            if not 2 <= len(company_key) <= 60:
                self.holder.logging.warning(u"查询条件长度不能小于2个字符且不能大于60个字符!")
                return False
            # Membership test instead of ``find(x) > 0``: the old comparison
            # skipped a forbidden character sitting at index 0, and the
            # ``filter`` object it was wrapped in is always truthy on Python 3.
            if any(ch in company_key for ch in forbidden_chars):
                return False
            company_key = company_key.replace(u"(", "").replace(u")", "")
            # "\\w" keeps the pattern text identical while avoiding an
            # invalid-escape warning in a non-raw string literal.
            if not re.match(u"^(\\w|[\u4E00-\u9FA5])*$", company_key):
                self.holder.logging.warning(u"查询条件中含有非法字符!")
                return False
            return True
        except Exception:
            # Any failure (e.g. a non-string keyword) counts as invalid.
            return False

    module.addEvent(
        Event(EventType.ASSERT_FAILED, retry_times=2, assert_function=checkCmpyKey))
    self.module_manager.appendSubModule(module)
def run():
    """Run one interactive diary session on the console.

    Reads the installation path from ``$HOME/.caffers.conf`` (section
    ``installation``, option ``path``), then asks the user whether to open
    the diary and whether to write an entry. When the user does not write
    an entry, a menu is shown and exactly one valid menu action is
    executed; invalid input is asked again until ``0`` or a valid item is
    entered.

    :return: False when the user declines to open the diary, otherwise None.
    """
    cfg = ConfigParser()
    cfg.read(os.environ['HOME'] + '/.caffers.conf')
    path = cfg.get('installation', 'path')
    print("[LOG] Begin to run")

    interface = Interface(path)
    statistic = Statistic(path, "StatisticFile")
    module = Module(path, "ModuleFile")
    diary = Diary(path + "Diary/")

    if input("是否打开日记本?[y/n]") != 'y':
        print("放弃本次回答,日记关闭")
        return False

    if input("是否开始写日记?[y/n]") == 'y':
        interface.add_diary()
    else:
        print("请选择其他功能:")
        show_menu()
        select_item = input("请输入序号:")
        # Each valid branch ends with ``break``: one action per session,
        # exactly as before. Only invalid input loops back to the prompt.
        while select_item != '0':
            if select_item == '1':
                # Add at least one question, then keep going while the
                # user answers 'y'.
                while True:
                    question = input("请输入需要添加的问题:")
                    module.add_question(question)
                    if input("继续添加?[y/n]") != 'y':
                        break
                break
            elif select_item == '2':
                module.show_all_questions()
                break
            elif select_item == '3':
                item_number = input("请输入要修改的问题序号:")
                question = input("请输入要修改为的内容:")
                module.change_question(item_number, question)
                break
            elif select_item == '4':
                # This branch used to test '3' a second time and was
                # therefore unreachable.
                # NOTE(review): confirm that show_menu() lists this as item 4.
                recommanded_diary = statistic.get_random_diary()
                print("推荐日记:" + recommanded_diary)
                diary.show(recommanded_diary)
                break
            else:
                print("输入序号不合法")
                select_item = input("请输入序号:")

    # statistic.update_statistic_file(path + "Diary/")  (left disabled, as in the original)
    print("[LOG] End of run")
def initAnnalsInfo(self, module_super):
    """Attach the annual-report iteration module and fill in its details.

    Creates a Module driven by ``Iterator("annals_list", "annal")``, gives
    it the id ``"get_annals_info"``, appends it to ``module_super`` and
    passes it to ``initAnnalsDetails``.

    :param module_super: parent module that receives the new sub-module.
    """
    annals_module = Module(None, u"获取年报信息", Iterator("annals_list", "annal"))
    annals_module.module_id = "get_annals_info"
    # The second argument (True) is forwarded unchanged; its meaning is
    # defined by the framework and is not visible in this file section.
    module_super.appendSubModule(annals_module, True)
    self.initAnnalsDetails(annals_module)
def initAnnualReportPre(self, module_super):
    """Configure the module that fetches the list of annual-report years.

    The module calls ``self.getWebHtml`` on the ``qygs_ViewQynb.pt`` page
    and exposes the extracted report links as the optional output
    ``qynb_param_list``.

    :param module_super: parent module that receives the new sub-module.
    """

    def getRightUrl(html=None):
        """Return de-duplicated absolute URLs of links carrying ``nbnd=<4 digits>``."""
        if not html:
            return []
        # A set removes duplicates; the resulting order is therefore not
        # guaranteed, as in the original list(set(...)) form.
        unique_hrefs = set(
            re.findall(r'<a\s+href="(.*?\?.*?nbnd=\d{4}.*?)"', html, re.S))
        return ['http://gsxt.jxaic.gov.cn' + href for href in unique_hrefs]

    year_list_module = Module(self.getWebHtml, u"获取年报年份列表")
    year_list_module.module_id = "fetch_qynb_list"
    # The lambda parameter names are kept verbatim: presumably the framework
    # fills them by name from earlier outputs (verify against Module).
    year_list_module.appendUrl(
        lambda qyid, company_zch, qylx:
        "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/qygs_ViewQynb.pt?qyid=%s&zch=%s&qylx=%s&num=0"
        % (qyid, company_zch, qylx))
    year_list_module.appendHeaders(
        lambda ua, qylx, qyid, company_zch: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.jxaic.gov.cn",
            "Connection": "keep-alive",
        })
    year_list_module.appendOutput(
        name="qynb_param_list",
        type=OutputType.FUNCTION,
        function=getRightUrl,
        show_up=OutputParameterShowUpType.OPTIONAL)
    module_super.appendSubModule(year_list_module, True)
def initUrlParams(self, module_super):
    """Attach a module that extracts the company-detail link parameters.

    :param module_super: parent module that receives the new sub-module.
    """

    def initReady(com):
        """Map ``com`` to ``company_url`` / ``search_company``.

        Returns an empty dict when ``com`` is falsy or has fewer than two
        items.
        """
        if not com or len(com) < 2:
            return {}
        return {"company_url": com[0], "search_company": com[1]}

    param_module = Module(None, u"公司详情链接参数值提取")
    param_module.appendOutput(type=OutputType.FUNCTION, function=initReady)
    module_super.appendSubModule(param_module, True)
def prepareCompnanyParms(self, module_super):
    """Attach the pre-processing module that runs before a company is crawled.

    :param module_super: parent module that receives the new sub-module.
    """

    def prepareParams(com):
        """Build the query dict from the first two items of ``com``.

        An empty dict is returned when ``com`` is falsy or too short.
        """
        usable = bool(com) and len(com) >= 2
        if usable:
            return {"company_url": com[0], "search_company": com[1]}
        return {}

    prepare_module = Module(None, u"抓取公司前的预处理")
    prepare_module.appendOutput(type=OutputType.FUNCTION, function=prepareParams)
    module_super.appendSubModule(prepare_module, True)
def initCompanyInfoPrepare(self, module_super):
    """Attach the pre-processing module that runs before a company is crawled.

    :param module_super: parent module that receives the new sub-module.
    """

    def prepare(com):
        """Return ``company_url`` / ``search_company`` taken from ``com[0]`` / ``com[1]``.

        Yields an empty dict for a falsy ``com`` or one with fewer than
        two items.
        """
        if not com:
            return {}
        if len(com) < 2:
            return {}
        return {"company_url": com[0], "search_company": com[1]}

    pre_module = Module(None, "抓取公司前的预处理")
    pre_module.appendOutput(type=OutputType.FUNCTION, function=prepare)
    module_super.appendSubModule(pre_module, True)
def initCompanyInfoPrepare(self, module_super):
    """Attach the pre-processing module that runs before a company is crawled.

    :param module_super: parent module that receives the new sub-module.
    """

    def prepare(info):
        """Read the first link's href and text from ``info`` via XPath.

        Raises IndexError when ``info`` contains no matching ``<a>`` node.
        """
        company_url = info.xpath('.//a/@href')[0].strip()
        search_company = info.xpath('.//a/text()')[0].strip()
        # A 'zch' field read from ".//*[@class='profile']/span[1]/text()"
        # was commented out in the original and remains disabled.
        return {"company_url": company_url, "search_company": search_company}

    pre_module = Module(None, u"抓取公司前的预处理")
    pre_module.appendOutput(type=OutputType.FUNCTION, function=prepare)
    module_super.appendSubModule(pre_module, True)
def fetchStockholderInfo(self, module_super):
    """Iterate over the shareholder pages, then over the shareholder details.

    Creates a module driven by an iterator over ``gdxx_page_range`` (bound
    to the parameter ``pno``), appends it to ``module_super`` and lets
    ``getStockholderInfo`` and ``fetchStockholderDetail`` extend it.

    :param module_super: parent module that receives the new sub-module.
    :return: None
    """
    page_iterator = Iterator(seeds="gdxx_page_range", param_name="pno")
    pages_module = Module(iterator=page_iterator, name=u"遍历股东翻页")
    pages_module.module_id = "gdxx_pages"
    module_super.appendSubModule(pages_module)

    for attach in (self.getStockholderInfo, self.fetchStockholderDetail):
        attach(pages_module)
def initYzmPic(self):
    """Configure the module that fetches the captcha image.

    The module calls ``self.visitValidateCode`` against
    ``http://218.57.139.24/securitycode``, has a ``Sleep(2)`` attached, and
    registers an EXCEPTION_OCCURED event with 5 retries and redo module
    ``home_page``.
    """
    captcha_module = Module(self.visitValidateCode, u'获取验证码图片')
    captcha_module.appendUrl('http://218.57.139.24/securitycode')
    # ``ua`` keeps its name: presumably the framework supplies it by
    # parameter name (verify against Module.appendHeaders).
    captcha_module.appendHeaders(
        lambda ua: {
            "Host": "218.57.139.24",
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        })
    captcha_module.addSleep(Sleep(2))
    on_exception = Event(
        EventType.EXCEPTION_OCCURED, retry_times=5, redo_module='home_page')
    captcha_module.addEvent(on_exception)
    self.module_manager.appendSubModule(captcha_module, True)
def initNianBao(self, module_super):
    """Configure the module that fetches a company's annual-report list.

    The request URL is the company URL with ``tab=01`` replaced by
    ``tab=02``. The parsed ``[url, name]`` pairs are exposed as the
    optional output ``qynb_list``.

    :param module_super: parent module that receives the new sub-module.
    """

    def xpaths(html):
        """Return ``[href, name]`` pairs for the report links found in *html*.

        The suffix u'年度报告' is stripped from each link text, and links
        whose remaining text equals u'详情' are skipped.
        """
        anchors = etree.HTML(html).xpath(
            './/*[@class="info m-bottom m-top"]/tr/td/a')
        reports = []
        for anchor in anchors:
            url = ''.join(anchor.xpath('@href')).strip()
            name = ''.join(anchor.xpath('text()')).replace(u'年度报告', '')
            if name == u'详情':
                continue
            reports.append([url, name])
        return reports

    report_module = Module(self.getWebHtml, u"抓取公司的年报信息")
    report_module.appendUrl(
        lambda company_url: company_url.replace('tab=01', 'tab=02'))
    report_module.appendHeaders({
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    })
    report_module.appendOutput(
        name='qynb_list',
        type=OutputType.FUNCTION,
        function=xpaths,
        show_up=OutputParameterShowUpType.OPTIONAL)
    module_super.appendSubModule(report_module, True)
def initConfigBaseInfo(self, module_super):
    """Configure the module that fetches the company's basic information.

    An input function splits the query string of ``company_url`` into a
    dict; the URL lambda then builds the ``gsgs_viewDjxx.pt`` address from
    ``qyid``, ``zch`` and ``qylx``.

    :param module_super: parent module that receives the new sub-module.
    """
    module = Module(self.visitJbxx, "基本信息")

    def prepare(company_url):
        """Return the query parameters of *company_url* as a dict.

        Each ``key=value`` segment is split on the first ``=`` only, so a
        value containing ``=`` is kept whole. Empty segments (for example
        when the URL has no query string) are skipped, and a segment
        without ``=`` maps to an empty string instead of raising
        IndexError.
        """
        query_ = {}
        for segment in urlparse.urlparse(company_url).query.split("&"):
            if not segment:
                continue
            key, _, value = segment.partition("=")
            query_[key] = value
        # Debug trace kept from the original; parenthesised so the
        # statement parses on both Python 2 and Python 3.
        print(query_)
        return query_

    module.appendInput(InputType.FUNCTION, prepare)
    module.appendUrl(
        lambda qyid, zch, qylx:
        "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxx.pt?qyid=%s&zch=%s&qylx=%s&num=undefined&showgdxx=true"
        % (qyid, zch, qylx))
    module.appendHeaders({
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
    })
    module_super.appendSubModule(module, True)
def initConfigValidateCode(self):
    """Configure the captcha-download module for gsxt.gdgs.gov.cn.

    The module calls ``self.visitValidateCode``, carries the id
    ``module_validate_code`` and registers an EXCEPTION_OCCURED event with
    100 retries.
    """
    captcha_module = Module(self.visitValidateCode, "验证码")
    captcha_module.module_id = "module_validate_code"

    # The random suffix is computed once, here, when the module is
    # configured; the URL string itself is static afterwards.
    captcha_url = "http://gsxt.gdgs.gov.cn/aiccips/verify.html?random=" + str(random.random())
    captcha_module.appendUrl(captcha_url)

    request_headers = {
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Host': 'gsxt.gdgs.gov.cn',
        'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/',
    }
    captcha_module.appendHeaders(request_headers)

    captcha_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    self.module_manager.appendSubModule(captcha_module)
def fetchCompanyInfo(self):
    """Create the branch module that iterates over the company list.

    The module is driven by ``Iterator("search_list", "com")``, gets
    ``self.yzmSave`` as an extra function, is appended to the module
    manager and is then extended by the per-company configuration methods
    in the order listed below.
    """
    company_module = Module(None, u"处理公司列表", Iterator("search_list", "com"))
    # Save the captcha image.
    company_module.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_module, True)

    # getAnnalsList / getAnnalsInfo were commented out in the original and
    # remain disabled here.
    steps = (
        self.prepareCompnanyParms,
        self.getCompanyInfo,
        self.fetchCmpnyGdxq,
        self.getCompanyRecordInfo,
        self.getCompanyPunishInfo,
        self.initResultCollect,
    )
    for configure in steps:
        configure(company_module)
def initConfigCompanyInfo(self):
    """Create the module that iterates ``search_list`` and gathers company data.

    Each item is bound to the parameter ``com``. ``self.yzmSave`` is
    attached as an extra function, and the per-company configuration
    methods are applied in the order listed below.
    """
    company_module = Module(None, u"获取公司信息", Iterator("search_list", "com"))
    # Save the captcha image.
    company_module.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_module, True)

    builders = (
        self.initConfigBaseInfo,
        self.initConfigShareHolderInfo,
        self.initConfigChangeInfo,
        self.initArchiveInfo,
        self.initBranchInfo,
        self.initPenaltyInfo,
        self.initAnnalsList,
        self.initAnnalsInfo,
        self.initResultCollect,
    )
    for build in builders:
        build(company_module)
def initConfigCompanyInfo(self):
    """Create the module that iterates ``search_list`` (bound as ``info``).

    After the module is appended to the module manager it is passed to
    ``initCompanyPrepare`` and then ``initRouter``.
    """
    info_iterator = Iterator("search_list", "info")
    company_module = Module(None, "获取公司信息", info_iterator)
    self.module_manager.appendSubModule(company_module, True)

    for configure in (self.initCompanyPrepare, self.initRouter):
        configure(company_module)
def initShareholderInfoDetail(self, module_super):
    """Configure the iteration over shareholder records and the detail fetch.

    An outer module iterates ``gdxx_list`` (each item bound as
    ``gdxx_rcd``); its sub-module calls ``self.visitGdxq`` on the URL
    derived from the record's ``onclick`` handler.

    :param module_super: parent module that receives the iterating module.
    """
    # Local import so the block does not depend on a file-level import.
    import ast

    iterator = Iterator("gdxx_list", "gdxx_rcd")
    module = Module(None, u"开始获取股东详情", iterator)
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitGdxq, u"获取股东详情信息")

    def getGdxqUrl(gdxx_rcd):
        """Return the detail URL taken from the first field holding an ``onclick``.

        Returns None when no field of ``gdxx_rcd`` contains ``'onclick'``.
        """
        for key in gdxx_rcd:
            value = gdxx_rcd[key]
            if 'onclick' in value:
                # NOTE(review): the original called eval() on this string,
                # which comes from crawled page data. ast.literal_eval
                # accepts only Python literals, so a dict repr still parses
                # but arbitrary expressions are refused. Confirm that no
                # record relies on non-literal content.
                onclick_dict = ast.literal_eval(value) if isinstance(value, basestring) else value
                onclick = onclick_dict["onclick"]
                # Take the text between the first '(' and the first ')'
                # and drop the quote characters around the argument.
                xq_link = onclick[onclick.find('(') + 1:onclick.find(')')].replace("'", "")
                xq_url = "http://aic.hainan.gov.cn:1888" + xq_link
                return xq_url
        return None

    sub_module.appendUrl(getGdxqUrl)
    sub_module.addSleep(Sleep(2))
    module.appendSubModule(sub_module)
def initPenaltyInfo(self, module_super):
    """Configure the module that fetches administrative-penalty information.

    The module POSTs to ``/aiccips/pub/gsxzcfxx`` with ``encrpripid`` set
    to ``params[0]`` and the last element of ``csrf`` as the
    ``X-CSRF-TOKEN`` header.

    :param module_super: parent module that receives the new sub-module.
    """
    penalty_module = Module(self.visitXzcf, u"获取行政处罚信息")
    # ``params`` is unused in the URL lambda but kept in its signature,
    # exactly as in the original.
    penalty_module.appendUrl(
        lambda params: 'http://211.141.74.198:8081/aiccips/pub/gsxzcfxx')
    penalty_module.appendHeaders(
        lambda csrf: {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': '211.141.74.198:8081',
            'X-CSRF-TOKEN': csrf[-1],
        })
    penalty_module.appendWebMethod("post")
    penalty_module.appendPostData(lambda params: {'encrpripid': params[0]})
    module_super.appendSubModule(penalty_module)
def initShareHolderDetail(self, module_super):
    """Configure the iteration over shareholder records and the detail fetch.

    The outer module iterates ``gdxx_list`` (items bound as ``gdxx_rcd``);
    the inner module calls ``self.visitGdxq`` on the URL produced by
    ``self.getGdxqUrl``.

    :param module_super: parent module that receives the iterating module.
    """
    records_module = Module(None, "进入股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(records_module, True)

    detail_headers = {
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
        'Cache-Control': 'max-age=0',
    }
    detail_module = Module(self.visitGdxq, "获取股东详情信息")
    detail_module.appendUrl(self.getGdxqUrl)
    detail_module.appendHeaders(detail_headers)
    records_module.appendSubModule(detail_module)
def initGdxq(self, module_super):
    """Configure the iteration over shareholder-detail links and their fetch.

    The outer module iterates ``gdxq_list`` (items bound as ``gdxq``); the
    inner module calls ``self.visitGdxq`` on each ``gdxq`` URL and sends
    it as the ``Referer`` header as well.

    :param module_super: parent module that receives the iterating module.
    """
    iterator = Iterator(seeds="gdxq_list", param_name="gdxq")
    module = Module(iterator=iterator, name=u"遍历股东详情")
    module_super.appendSubModule(module)

    sub_module = Module(self.visitGdxq, u"抓取股东详情")
    sub_module.appendUrl("gdxq")
    # The headers used to be appended to the iterating wrapper ``module``,
    # which has no visit function of its own. They belong on the module
    # that issues the request, as the other shareholder-detail builders do.
    # NOTE(review): confirm the framework does not inherit headers from the
    # parent module.
    sub_module.appendHeaders(
        lambda gdxq: {
            'Host': 'www.sgs.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': gdxq,
        })
    module.appendSubModule(sub_module)
def initAnnualReportList(self, module_super):
    """Configure the module that fetches the enterprise annual-report list.

    The module calls ``self.visitQynbList`` on ``enterprisePublicity.jspx``
    and exposes the matched ``<a>`` nodes as the optional list output
    ``nb_list``. A ``Sleep(2)`` is attached.

    :param module_super: parent module that receives the new sub-module.
    """
    list_module = Module(self.visitQynbList, u"第九步_获取企业年报列表")
    # The parameter is named ``id`` in the original and is kept as is,
    # because the framework presumably binds lambda parameters by name.
    list_module.appendUrl(
        lambda id: "http://aic.hainan.gov.cn:1888/enterprisePublicity.jspx?id=%s" % id)
    list_module.appendHeaders(
        lambda company_url: {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "aic.hainan.gov.cn:1888",
            "Referer": company_url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        })
    list_module.appendOutput(
        name="nb_list",
        xpath=".//*[@id='qiyenianbao']/table//td/a",
        type=OutputType.LIST,
        show_up=OutputParameterShowUpType.OPTIONAL)
    list_module.addSleep(Sleep(2))
    module_super.appendSubModule(list_module)
def initConfigCompanyInfo(self):
    """Create the module that iterates ``search_list`` (bound as ``info``).

    After being appended to the module manager, the module is extended by
    the per-company configuration methods in the order listed below.
    """
    company_module = Module(None, u"获取公司信息", Iterator("search_list", "info"))
    self.module_manager.appendSubModule(company_module, True)

    stages = (
        self.initCompanyInfoPrepare,
        self.initConfigBaseInfo,
        self.initGdxq,
        self.initNianBao,
        self.initNbiter,
        self.initResultCollect,
    )
    for stage in stages:
        stage(company_module)
def initCompanyPrepare(self, module_super):
    """Attach the pre-processing module that runs before a company is crawled.

    :param module_super: parent module that receives the new sub-module.
    """
    module = Module(name="抓取公司前的预处理")

    def company_info_prepare(info):
        """Extract the company name and absolute detail URL from ``info``.

        :return: dict with ``company_url``, ``company_name`` and
            ``search_company``. Name and URL fall back to ``''`` when the
            link text or the ``href`` attribute is missing.
        """
        try:
            company_list_name = info.xpath('.//a/text()')[0].strip()
        except Exception:
            # Best effort: an entry without link text keeps an empty name.
            company_list_name = ''

        company_url = ''
        company_url_list = info.xpath(".//a")
        if company_url_list and isinstance(company_url_list, list):
            # Element.get() returns None for a missing attribute, and the
            # substring test below used to raise TypeError in that case.
            company_url = company_url_list[0].get("href") or ''
            if '../GSpublicity/' in company_url:
                # Drop the leading ".." and prefix the site root.
                company_url = 'http://gsxt.gdgs.gov.cn/aiccips' + company_url[2:]

        return {
            "company_url": company_url,
            "company_name": company_list_name,
            "search_company": company_list_name,
        }

    module.appendOutput(type=OutputType.FUNCTION, function=company_info_prepare)
    module_super.appendSubModule(module)
def initConfigCompanyInfo(self):
    """Create the module that iterates ``tag_alist`` (bound as ``tag_a``).

    After being appended to the module manager, the module is extended by
    the per-company configuration methods in the order listed below.
    """
    company_module = Module(None, u"获取公司信息", Iterator("tag_alist", "tag_a"))
    self.module_manager.appendSubModule(company_module, True)

    stages = (
        self.initConfigCompanyInfoPre,
        self.initConfigBaseInfo,
        self.initArchiveInfo,
        self.initPenaltyInfo,
        self.initShareHolderDetail,
        self.initAnnualReportPre,
        self.initAnnualReport,
        self.initResultCollect,
    )
    for stage in stages:
        stage(company_module)
def initConfigCompanyInfo(self):
    """Create the module that iterates ``search_list`` (bound as ``com``).

    After being appended to the module manager, the module is extended by
    the per-company configuration methods in the order listed below.
    """
    com_iterator = Iterator("search_list", "com")
    company_module = Module(None, u"获取公司信息", com_iterator)
    self.module_manager.appendSubModule(company_module, True)

    stages = (
        self.initCompanyInfo,
        self.initBaxxInfo,
        self.initFzhgInfo,
        self.initXzcfxxInfo,
        self.initGdxqInfoPrepare,
        self.initAnnualReportPre,
        self.initAnnualReport,
        self.initResultCollect,
    )
    for stage in stages:
        stage(company_module)
def initPenaltyInfo(self, module_super):
    """Configure the module that fetches administrative-penalty information.

    The module calls ``self.visitXzcf`` on ``gsgs_viewXzcfxx.pt``; both
    the URL and the ``Referer`` header are built from ``qyid``,
    ``company_zch`` and ``qylx``.

    :param module_super: parent module that receives the new sub-module.
    """
    penalty_module = Module(self.visitXzcf, u"获取行政处罚信息")
    penalty_module.appendUrl(
        lambda qyid, company_zch, qylx:
        "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewXzcfxx.pt?qyid=%s&zch=%s&qylx=%s&num=1&showgdxx=true"
        % (qyid, company_zch, qylx))
    penalty_module.appendHeaders(
        lambda ua, qylx, qyid, company_zch: {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': ua,
            'Referer': 'http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/ccjcgs_ccjcgsIndexDetail.pt?qylx=%s&qyid=%s&zch=%s&tabName=1'
                       % (qylx, qyid, company_zch),
            'Host': 'gsxt.jxaic.gov.cn',
        })
    module_super.appendSubModule(penalty_module, True)
def initConfigValidateCode(self):
    """Configure the captcha-download module for gsxt.jxaic.gov.cn.

    The module calls ``self.visitValidateCode``, carries the id
    ``module_validate_code`` and registers an EXCEPTION_OCCURED event with
    100 retries.
    """
    captcha_module = Module(self.visitValidateCode, u"验证码")
    captcha_module.module_id = "module_validate_code"

    # The ``t`` value is drawn once, at configuration time; the URL is a
    # plain string from then on.
    captcha_url = (
        'http://gsxt.jxaic.gov.cn/ECPS/common/common_getJjYzmImg.pt?yzmName=searchYzm&imgWidth=180&t='
        + str(random.random()))
    captcha_module.appendUrl(captcha_url)

    request_headers = {
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
        'Referer': 'http://gsxt.jxaic.gov.cn/',
    }
    captcha_module.appendHeaders(request_headers)

    captcha_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    self.module_manager.appendSubModule(captcha_module, True)
def fetchCompanyInfo(self):
    """Create the module branch that iterates over the company list.

    The module is driven by ``Iterator("search_list", "com")``, gets
    ``self.yzmSave`` as an extra function, is appended to the module
    manager and is then extended by the per-company configuration methods
    in the order listed below.

    :return: None
    """
    company_module = Module(None, u"处理公司列表", Iterator("search_list", "com"))
    # Save the captcha image.
    company_module.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_module, True)

    steps = (
        self.prepareCompnanyParms,
        self.getCompanyInfo,
        self.fetchStockholderDetail,
        self.fetchStockholderInfo,
        self.fetchChangeInfo,
        self.fetchRecordInfo,
        self.fetchBranchInfo,
        self.fetchPunishInfo,
        self.getAnnalsList,
        self.getAnnalsInfo,
        self.initResultCollect,
    )
    for configure in steps:
        configure(company_module)
def initConfigBaseInfo(self, module_super):
    """Configure the module that fetches the company's basic information.

    The module calls ``self.visitJbxx`` with the URL reference
    ``"company_url"``, sets the encoding to utf-8 and exposes the hrefs
    found in the ``investorTable`` as the optional list output
    ``gdxq_list``.

    :param module_super: parent module that receives the new sub-module.
    """
    request_headers = {
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.sgs.gov.cn/notice/search/ent_info_list',
    }

    base_module = Module(self.visitJbxx, u"基本信息")
    base_module.appendUrl("company_url")
    base_module.appendHeaders(request_headers)
    base_module.appendEncoding("utf-8")
    base_module.appendOutput(
        name="gdxq_list",
        xpath=".//*[@id='investorTable']//td/a/@href",
        type=OutputType.LIST,
        show_up=OutputParameterShowUpType.OPTIONAL)
    module_super.appendSubModule(base_module, True)
def initConfigYzm(self):
    """Configure the captcha-download module for xyjg.egs.gov.cn.

    The module calls ``self.visitValidateCode``, carries the id
    ``module_yzm`` and registers an EXCEPTION_OCCURED event with 10000
    retries.
    """
    module = Module(self.visitValidateCode, u"验证码")
    module.module_id = "module_yzm"
    # ``int(random.random())`` truncates a value in [0, 1) and is therefore
    # always 0, which made the ``_`` parameter a constant. Scaling before
    # truncating yields a varying integer each time the lambda is called.
    # ``radom_val`` is unused but keeps its name, because the framework
    # presumably binds lambda parameters by name.
    module.appendUrl(
        lambda radom_val:
        "http://xyjg.egs.gov.cn/ECPS_HB/validateCode.jspx?type=1&_=%s"
        % str(int(random.random() * 10 ** 13)))
    module.appendHeaders(
        lambda ua: {
            "Host": "xyjg.egs.gov.cn",
            "User-Agent": ua,
            "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://xyjg.egs.gov.cn/ECPS_HB/search.jspx",
        })
    # TODO: no redo module is specified here -- does the event then retry
    # this module itself?
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=10000))  # redo_module
    self.module_manager.appendSubModule(module)