def get_index(code):
    """Fetch the detail payload for a company *code* via its province API.

    Returns the detail response body, or None when the listing request
    does not succeed.
    """
    prov = deal_html_code.judge_province(code)
    listing, status = Send_Request().send_requests(
        config.url_list[prov].format(code))
    if status != 200:
        return None
    # The listing response carries a uuid that keys the detail endpoint.
    entry = json.loads(listing)["info"][0]
    detail_url = config.detail_list[prov].format(entry["uuid"])
    return Send_Request().send_requests(detail_url)[0]
def name(url):
    """Scrape the paged outbound-investment table at *url*.

    Returns (info, flag): info maps row index -> parsed record, flag is 1
    on success or 100000004 when the page cannot be fetched/parsed.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    # BUG FIX: info must exist before the status branches — the shared
    # `return info, flag` hit an unbound name when the request failed.
    info = {}
    if status_code == 200:
        flag = 1
        # BUG FIX: was etree.xpath(content, ...) — lxml has no module-level
        # xpath(); the class predicate was also missing its opening quote.
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class='viewBox']//dl")[0]
        if "企业名称" in content:
            datalist = etree.tostring(dl).split(
                '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
            )
            if len(datalist) > 0:
                datalist.remove(datalist[-1])
            # Total page count is embedded in the HTML as "共N页".
            pattern = re.compile(u".*共(.*?)页.*")
            number = re.findall(pattern, content)
            totalpage = int(number[0]) if len(number) == 1 else 0
            # The first page is always parsed starting at offset 0.
            deal_single_info(datalist, info, 0)
            if totalpage > 1:
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                for k in xrange(2, totalpage + 1):
                    # BUG FIX: the paging URL must embed the page number and
                    # be rebuilt each iteration (it was built once, without
                    # k, so every request re-fetched the same page); the
                    # sibling pager also sends the detail headers.
                    href = out_invest_url.format(entid, cid, k)
                    content, status_code = Send_Request().send_request(
                        href, headers)
                    if status_code == 200:
                        # BUG FIX: the offset of page k's first row is
                        # (k-1)*5+1 with 5 rows per page (matches the
                        # sibling pager); k*5+1 skipped a page of slots.
                        start = (k - 1) * 5 + 1
                        result = etree.HTML(
                            content,
                            parser=etree.HTMLParser(encoding='utf-8'))
                        dl = result.xpath("//div[@class='viewBox']/dl")[0]
                        datalist = etree.tostring(dl).split(
                            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
                        )
                        if len(datalist) > 0:
                            datalist.remove(datalist[-1])
                            deal_single_info(datalist, info, start)
        else:
            flag = 100000004
    else:
        flag = 100000004
    if flag == 1:
        info = deal_html_code.remove_repeat(info)
    return info, flag
def name(url):
    """Scrape paged shareholder/contribution records from *url*.

    Returns (info, flag): info maps row index -> record; flag is 1 on
    success, 100000004 when the first request fails.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        page = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        first_dl = page.xpath("//div[@class='viewBox']//dl")[0]
        rows = etree.tostring(first_dl).split(
            '<dt style="color:#333;margin-bottom:10px;"/>')
        # The split leaves a fragment before the first <dt>; drop it.
        rows.remove(rows[0])
        if rows:
            # Total page count is embedded in the HTML as "共N页".
            matches = re.findall(re.compile(".*共(.*?)页.*"), content)
            totalpage = int(matches[0]) if len(matches) == 1 else 0
            # Page one is always parsed from offset 0.
            deal_single_info(rows, info, 0)
            if totalpage != 1:
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                for page_no in xrange(2, totalpage + 1):
                    href = share_url.format(entid, cid, page_no)
                    content, status_code = Send_Request().send_request(
                        href, headers)
                    if status_code != 200:
                        continue
                    start = page_no * 5 + 1
                    sub = etree.HTML(
                        content, parser=etree.HTMLParser(encoding='utf-8'))
                    sub_dl = sub.xpath("//div[@class='viewBox']//dl")[0]
                    sub_rows = etree.tostring(sub_dl).split(
                        '<dt style="color:#333;margin-bottom:10px;"/>')
                    sub_rows.remove(sub_rows[0])
                    if sub_rows:
                        deal_single_info(sub_rows, info, start)
        else:
            logging.info("无股东及出资信息")
    else:
        flag = 100000004
    info = deal_html_code.remove_repeat(info)
    return info, flag
def get_info(self):
    """Parse the abnormal-operation ("经营异常") table for this company.

    Returns a dict mapping row index -> field dict with entry/exit
    reasons, dates and the authorities involved.
    """
    headers = config.headers
    url = self._url.format(self._pripid)
    result, status_code = Send_Request().send_requests(url, headers)
    info = {}
    if status_code == 200:
        # BUG FIX: was etree.xpath(result, ...) — lxml has no module-level
        # xpath(); the document must be parsed with etree.HTML.
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        # BUG FIX: the attribute test needs '@' — [id= 'table_jyyc']
        # matched child *elements* named "id", selecting nothing.
        tr_list = data.xpath(
            "//table[@id= 'table_jyyc']//tr[@name = 'jyyc']")
        for i, singledata in enumerate(tr_list):
            temp = {}
            td_list = singledata.xpath("./td")
            temp["types"] = '经营异常'
            temp["in_reason"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))
            in_date = deal_html_code.remove_symbol(
                td_list[2].xpath("string(.)"))
            temp["in_date"] = deal_html_code.change_chinese_date(in_date)
            temp["out_reason"] = deal_html_code.remove_symbol(
                td_list[4].xpath("string(.)"))
            out_date = deal_html_code.remove_symbol(
                td_list[5].xpath("string(.)"))
            temp["out_date"] = deal_html_code.change_chinese_date(out_date)
            temp["gov_dept"] = deal_html_code.remove_symbol(
                td_list[6].xpath("string(.)"))
            temp["out_gov"] = deal_html_code.remove_symbol(
                td_list[7].xpath("string(.)"))
            info[i] = temp
    return info
def get_year_href(self):
    """Collect annual-report entries from the listing endpoint.

    Returns (information, flag): information maps index ->
    [anCheId, anCheYear, province, entType, annRepFrom]; flag is 1 on
    success, 100000004 on failure.
    """
    information = {}
    body, status_code = Send_Request().send_requests(self.url)
    bad_page = re.findall(
        re.compile('.*/index/invalidLink.*|.*页面不存在.*'), body)
    flag = 1
    if status_code == 200 and not bad_page:
        # The payload is a JSON-ish array embedded in the page: pull out
        # the bracketed section, then each {...} record inside it.
        array_text = re.findall(re.compile(r'\[(.*?)\]'), body)[0]
        records = re.findall(re.compile(u'{.*?}'), array_text)
        for idx in xrange(len(records)):
            record = json.loads(records[idx])
            an_che_id = record["anCheId"]
            # Characters 15-16 of anCheId encode the province code.
            prov = config.province[an_che_id[15:17]]
            information[idx] = [
                an_che_id, record["anCheYear"], prov,
                record["entType"], record["annRepFrom"]
            ]
    else:
        flag = 100000004
        logging.info("report url fail")
    return information, flag
def name(self, url):
    """Parse spot-check ("抽查/检查") records from *url*.

    Returns (info, flag): info maps row index ->
    [check_date, gov_dept, result, types]; flag is 1 on success,
    100000004 on failure.
    """
    info = {}
    content, status_code = Send_Request().send_request(url)
    if status_code == 200:
        flag = 1
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class='viewBox']//dl")[0]
        dlcontent = etree.tostring(dl)
        string = '<dd style="border-top:1px dashed #ccc;">'
        dllist = dlcontent.split(string)
        # The trailing split fragment carries no record data.
        dllist.remove(dllist[-1])
        for i, single in enumerate(dllist):
            single = etree.HTML(single,
                                parser=etree.HTMLParser(encoding='utf-8'))
            string = u"抽查检查日期"
            check_date = self.deal_dd_content(string, single)
            string = u"检查实施机关"
            gov_dept = self.deal_dd_content(string, single)
            string = u"抽查检查结果"
            result = self.deal_dd_content(string, single)
            # BUG FIX: `types` was left unbound (NameError on the first
            # row) when the URL matched neither marker; default it.
            if u"抽查信息" in url:
                types = "抽查"
            elif u"检查信息" in url:
                types = "检查"
            else:
                types = None
            info[i] = [check_date, gov_dept, result, types]
    else:
        flag = 100000004
    return info, flag
def name(self, url):
    """Parse licence records separated by <br/> markers at *url*.

    Returns (info, flag): info maps row index ->
    [name, types, license_type, license_code]; flag is 1 on success,
    100000004 on failure.
    """
    info = {}
    page, status = Send_Request().send_request(url)
    if status != 200:
        return info, 100000004
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding="utf-8"))
    container = tree.xpath("//div[@class ='viewBox']//dl")[0]
    fragments = etree.tostring(container).split("<br/>")
    # Drop the trailing fragment, which carries no record data.
    fragments.remove(fragments[-1])
    for idx, fragment in enumerate(fragments):
        node = etree.HTML(fragment,
                          parser=etree.HTMLParser(encoding="utf-8"))
        # The holder's name sits in the <dt>; the three <dd> cells hold
        # the category, licence type and licence number.
        holder = deal_html_code.remove_symbol(
            node.xpath(".//dt")[0].xpath("string(.)"))
        cells = node.xpath('.//dd')
        kind = deal_html_code.remove_symbol(cells[0].xpath("string(.)"))
        lic_type = deal_html_code.remove_symbol(cells[1].xpath('string(.)'))
        lic_code = deal_html_code.remove_symbol(cells[2].xpath('string(.)'))
        info[idx] = [holder, kind, lic_type, lic_code]
    return info, 1
def update_all_info(url, gs_basic_id):
    """Crawl the basic-info page at *url* and fan out to every
    sub-section updater (black/branch/brand/.../report) for gs_basic_id.
    """
    result, status_code = Send_Request().send_requests(url)
    # BUG FIX: scan for the failure marker only after a successful
    # request — on failure `result` may be None and re.findall would
    # raise TypeError instead of logging the error.
    fail = []
    if status_code == 200:
        pattern = re.compile(".*返回首页.*")
        fail = re.findall(pattern, result)
    if status_code == 200 and len(fail) == 0:
        urllist, flag = QGGS_basic.main(url, gs_basic_id)
        if flag < 100000001:
            QGGS_black.main(gs_basic_id, urllist["black"])
            QGGS_branch.main(gs_basic_id, urllist["branch"])
            QGGS_brand.main(gs_basic_id, urllist["brand"])
            QGGS_change.main(gs_basic_id, urllist["change"])
            QGGS_change.main(gs_basic_id, urllist["change2"])
            QGGS_check.main(gs_basic_id, urllist["check"])
            QGGS_clear.main(gs_basic_id, urllist["clear"])
            QGGS_except.main(gs_basic_id, urllist["except"])
            QGGS_freeze.main(gs_basic_id, urllist["freeze"])
            QGGS_mort.main(gs_basic_id, urllist["mort"])
            QGGS_permit.main(gs_basic_id, urllist["permit"])
            QGGS_permit2.main(gs_basic_id, urllist["permit2"])
            QGGS_person.main(gs_basic_id, urllist["person"])
            QGGS_punish.main(gs_basic_id, urllist["punish"])
            QGGS_punish2.main(gs_basic_id, urllist["punish2"])
            QGGS_shareholder.main(gs_basic_id, urllist["shareholder"])
            QGGS_stock.main(gs_basic_id, urllist["stock"])
            QGGS_report.main(gs_basic_id, urllist["report"])
        else:
            logging.error('基本信息页访问失败!')
    else:
        logging.error('网页打开过程出错!')
def name(url):
    """Fetch a company's social-insurance figures from the JSON API.

    Returns info: {0: [uuid, flags..., per-insurance payment/base/owe
    figures...]} on success, or an empty dict when the request fails or
    carries no data.
    """
    body, status = Send_Request().send_requests(url)
    info = {}
    if status != 200:
        return info
    payload = json.loads(body)["data"]
    if not payload:
        return info
    record = payload[0]
    # Field order matters downstream: id + three display flags, then the
    # maternity (So510), pension (So110), unemployment (So210),
    # medical (So310) and injury (So410) groups.
    info[0] = [
        record["soseId"],
        int(record["unpaidSocialInsDis"]),
        int(record["totalWagesDis"]),
        int(record["totalPaymentDis"]),
        record["unpaidSocialInsSo510"],
        record["so510"],
        record["totalPaymentSo510"],
        record["totalWagesSo510"],
        record["so110"],
        record["unpaidSocialInsSo110"],
        record["totalPaymentSo110"],
        record["totalWagesSo110"],
        record["totalPaymentSo210"],
        record["totalWagesSo210"],
        record["unpaidSocialInsSo210"],
        record["so210"],
        record["totalPaymentSo310"],
        record["totalWagesSo310"],
        record["unpaidSocialInsSo310"],
        record["so310"],
        record["totalPaymentSo410"],
        record["unpaidSocialInsSo410"],
        record["so410"],
    ]
    return info
def name(url):
    """Parse equity-change ("股权变更") rows from *url*.

    Returns (info, flag): info maps row index ->
    [name, percent_pre, percent_after, dates]; flag is 1 on success,
    100000004 on failure.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        # BUG FIX: was result.path(...) — the lxml element method is
        # xpath().
        dl = result.xpath("//div[@class= 'viewBox']/dl")[0]
        datalist = etree.tostring(dl).split(
            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
        )
        datalist.remove(datalist[-1])
        for i, single in enumerate(datalist):
            # BUG FIX: each fragment must be parsed itself; the old code
            # re-parsed the whole page via the nonexistent etree.xpath(),
            # so every row would have yielded identical values.
            single = etree.HTML(single,
                                parser=etree.HTMLParser(encoding='utf-8'))
            string = u"股东"
            name = deal_dd_content(string, single)
            string = u"变更前"
            percent_pre = deal_dd_content(string, single)
            string = u"变更后"
            percent_after = deal_dd_content(string, single)
            string = u"变更日期"
            dates = deal_dd_content(string, single)
            info[i] = [name, percent_pre, percent_after, dates]
    else:
        flag = 100000004
    if flag == 1:
        deal_html_code.remove_repeat(info)
    return info, flag
def name(self, url):
    """Parse famous-trademark rows from *url*.

    Returns (info, flag): info maps row index ->
    [ia_name, ia_zch, ia_type]; flag is 1 on success, 100000004 on
    failure.
    """
    info = {}
    page, status = Send_Request().send_request(url)
    if status != 200:
        return info, 100000004
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding='utf-8'))
    block = tree.xpath('//div[@class= "viewBox"]//dl')[0]
    # Strip the <dl> wrapper, then split into one fragment per record.
    raw = etree.tostring(block).replace("<dl>", '').replace("</dl>", '')
    pieces = raw.split('<dd style="border-top:1px dashed #ccc;">')
    pieces.remove(pieces[-1])
    for idx, piece in enumerate(pieces):
        node = etree.HTML(piece, parser=etree.HTMLParser(encoding='utf-8'))
        brand = self.deal_dd_content(u'商标名称', node)
        reg_no = self.deal_dd_content(u'商标注册号', node)
        category = self.deal_dd_content(u'认定类别', node)
        info[idx] = [brand, reg_no, category]
    return info, 1
def get_info(self):
    """Search trademarks by self._name across all result pages.

    Returns the accumulated info dict (index -> record) filled by
    deal_single_page.
    """
    info = {}
    name = urllib.quote(self._name)
    url = self._url.format(name, 1)
    result, status_code = Send_Request().send_requests(url, config.headers)
    if status_code == 200:
        start = 0
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        self.deal_single_page(info, data, start)
        totalpage = data.xpath("//input[@id = 'totalPage_sbxx']/@value")[0]
        for i in xrange(2, int(totalpage) + 1):
            start = (i - 1) * 6  # offset of this page's first record
            url = self._url.format(name, i)
            # BUG FIX: the status was bound to a misspelled `start_code`
            # and never checked; skip pages that fail to download instead
            # of parsing a failed response.
            result, status_code = Send_Request().send_requests(
                url, config.headers)
            if status_code != 200:
                continue
            data = etree.HTML(result,
                              parser=etree.HTMLParser(encoding='utf-8'))
            self.deal_single_page(info, data, start)
    # BUG FIX: the collected results were built but never returned.
    return info
def name(self, url):
    """Parse administrative-punishment rows from *url*.

    Returns (info, flag): info maps row index -> [name, number, types,
    basis, result, date, gov_dept]; flag is 1 on success, 100000004 on
    failure.
    """
    # One label per output column, in output order.
    labels = (u"主体名称", u"行政处罚决定书文号", u"处罚事由", u"处罚依据",
              u"处罚结果", u"处罚决定日期", u"处罚机构")
    info = {}
    page, status = Send_Request().send_request(url)
    if status != 200:
        return info, 100000004
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding="utf-8"))
    block = tree.xpath("//div[@class = 'viewBox']//dl")[0]
    separator = '<dd style="border-top:1px dashed #ccc;">'
    fragments = etree.tostring(block).split(separator)
    # The trailing fragment carries no record data.
    fragments.remove(fragments[-1])
    for idx, fragment in enumerate(fragments):
        node = etree.HTML(fragment,
                          parser=etree.HTMLParser(encoding="utf-8"))
        info[idx] = [self.deal_dd_content(label, node) for label in labels]
    return info, 1
def name(self, url):
    """Parse the tableIdStyle permit table at *url*.

    Returns (info, flag): info maps row index -> [name, code, gov_dept];
    flag is 1 on success, 100000004 on failure.
    """
    info = {}
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    if status_code == 200:
        flag = 1
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        trlist = result.xpath("//table[@id = 'tableIdStyle']//tr")
        for i, single in enumerate(trlist):
            tdlist = single.xpath("./td")
            # BUG FIX: the row is read up to td[5], so anything shorter
            # than 6 cells must be skipped (the old guard only rejected
            # < 4 cells and raised IndexError on 4-5 cell rows; its
            # `len == 0` clause was also redundant with `< 4`).
            if len(tdlist) < 6:
                continue
            name = deal_html_code.remove_symbol(
                tdlist[1].xpath("string(.)"))
            code = deal_html_code.remove_symbol(
                tdlist[2].xpath("string(.)"))
            gov_dept = deal_html_code.remove_symbol(
                tdlist[5].xpath("string(.)"))
            info[i] = [name, code, gov_dept]
    else:
        flag = 100000004
    return info, flag
def get_report_branch_href(self, url, cookies):
    """Collect the annual-report section links from the report index page.

    Returns (branch_list, flag): branch_list maps a section key
    ("basic", "share", "invest", ...) to its absolute URL; flag is 1 on
    success, 100000004 when the page could not be fetched.
    """
    branch_list = {}
    # NOTE(review): `headers` is not defined in this function — it is
    # presumably a module-level global (cf. config.headers_detail used
    # elsewhere in this file); confirm it exists at module scope.
    content, status_code = Send_Request().send_request3(
        url, cookies, headers)
    if status_code == 200:
        flag = 1
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        div_list = result.xpath("//div[@class='categ_info_title_wz']")
        for i, single in enumerate(div_list):
            # Links on the page are relative; prefix the configured host.
            href = config.host + single.xpath('./a/@href')[0]
            text = single.xpath('./a/text()')[0]
            if '企业基本信息' in text:
                branch_list["basic"] = str(href)
            elif "股东及出资信息" in text:
                branch_list["share"] = str(href)
            elif "对外投资信息" in text:
                branch_list["invest"] = str(href)
            elif "企业资产状况信息" in text:
                branch_list["run"] = str(href)
            elif "生产经营情况" in text:
                # Both asset-status and operating-status sections share
                # the "run" slot; whichever appears later wins.
                branch_list["run"] = str(href)
            elif "担保信息" in text:
                branch_list["assure"] = str(href)
            elif "股权变更信息" in text:
                branch_list["schange"] = str(href)
            elif "网站或网店信息" in text:
                branch_list["web"] = str(href)
    else:
        flag = 100000004
    return branch_list, flag
def name(self, url):
    """Parse shareholder-contribution details split on <br/> from *url*.

    Returns (info, flag): info maps row index -> [name, types,
    reg_amount, ra_ways, ra_date, true_amount, ta_ways, ta_date]; flag
    is 1 on success, 100000004 on failure.
    """
    headers = config.headers_detail
    page, status = Send_Request().send_request(url, headers)
    info = {}
    if status != 200:
        return info, 100000004
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding='utf-8'))
    block = tree.xpath("//div[@class='viewBox']//dl")[0]
    fragments = deal_html_code.remove_space(
        etree.tostring(block)).split('<br/>')
    # The trailing fragment carries no record data.
    fragments.remove(fragments[-1])
    # One label per output column after the name, in output order.
    labels = (u"投资人类型", u"认缴出资金额", u"认缴出资方式", u"认缴出资时间",
              u"实缴出资金额", u"实缴出资方式", u"实缴出资时间")
    for idx, fragment in enumerate(fragments):
        node = etree.HTML(fragment,
                          parser=etree.HTMLParser(encoding='utf-8'))
        # The shareholder name sits in the styled <dt> heading.
        row = [node.xpath(
            "//dt[@style='color:#333;margin-bottom:10px;']/text()")]
        for label in labels:
            row.append(self.deal_dd_content(label, node))
        info[idx] = row
    return info, 1
def name(self, url):
    """Parse a judicial equity-freeze ("冻结") detail page at *url*.

    Returns (info, flag): info holds a single row (key 0) with eleven
    fields; flag is 1 on success, 100000004 on failure.
    """
    # One label per output field, in output order.
    labels = (u'执行法院', u'被执行人', u'执行文书文号', u'执行事项',
              u'冻结开始日期', u'冻结结束日期', u'公示日期',
              u"被执行人持有股权", u'被执行人证件种类', u'被执行人证件号码',
              u"解冻日期")
    info = {}
    page, status = Send_Request().send_request(url)
    if status != 200:
        return info, 100000004
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding='utf-8'))
    block = tree.xpath("//div[@class='viewBox']//dl")[0]
    info[0] = [self.deal_dd_content(label, block) for label in labels]
    return info, 1
def deal_detail_content(detail_url):
    """Fetch one contribution-detail record and extract its fields.

    Returns (ra_date, ra_ways, true_amount, reg_amount, ta_ways,
    ta_date); any field missing from the payload — or the whole tuple on
    a failed request — is None.
    """
    # BUG FIX: every returned name (and content1) was unbound when the
    # request failed or both data lists were empty, raising NameError at
    # the final return; default them all up front.
    ra_date = ra_ways = reg_amount = true_amount = ta_ways = ta_date = None
    content1 = {}
    detail_code, status_code = Send_Request().send_requests(detail_url)
    if status_code == 200:
        detail_code = json.loads(detail_code)["data"]
        # Prefer the second list's first record, fall back to the first.
        if len(detail_code[1]) != 0:
            content1 = detail_code[1][0]
        elif len(detail_code[0]) != 0:
            content1 = detail_code[0][0]
        if len(content1) != 0:
            if "conDate" in content1:
                # Subscribed and paid-in dates share the same source field.
                ra_date = change_date_style(content1["conDate"])
                ta_date = ra_date
            if "conForm_CN" in content1:
                ra_ways = content1["conForm_CN"]
                ta_ways = ra_ways
            if "subConAm" in content1:
                reg_amount = content1["subConAm"]
            if "acConAm" in content1:
                true_amount = content1["acConAm"]
    return ra_date, ra_ways, true_amount, reg_amount, ta_ways, ta_date
def get_detail_info(self, detail_url, info):
    """Fill mortgage ("担保") detail fields into *info* and return the
    mortgagee / collateral summary nodes.

    Mutates *info* in place; returns (person_info, goods_info) — lxml
    elements on success, empty dicts on failure.
    """
    # Renamed from `dict` to avoid shadowing the builtin.
    field_map = {
        u"种类": "cates",
        u"范围": "ranges",
        u"期限": "period",
        u"备注": "remark",
    }
    headers = config.headers
    result, status_code = Send_Request().send_requests(detail_url, headers)
    if status_code == 200:
        # BUG FIX: was etree.xpath(result, ...) — lxml has no module-level
        # xpath(); the document must be parsed with etree.HTML.
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"被担保债权概况信息"
        table = data.xpath("//*[contains(.,'%s')]" % string)[0]
        for key, value in field_map.iteritems():
            info[value] = deal_html_code.get_match_info(key, table)
        string = u"抵押权人概况信息"
        person_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
        string = u"抵押权物概况信息"
        goods_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
    else:
        # Keep downstream consumers supplied with empty-but-present keys.
        info["cates"] = ''
        info["ranges"] = ''
        info["period"] = ''
        info["remark"] = ''
        person_info = {}
        goods_info = {}
    return person_info, goods_info
def get_info(self):
    """Parse the administrative-licence ("行政许可") table for this company.

    Returns a dict mapping row index -> field dict.
    """
    url = self._url.format(self._pripid)
    headers = config.headers
    result, status_code = Send_Request().send_requests(url, headers=headers)
    info = {}
    # Robustness: only parse a successfully fetched body (the old code
    # parsed unconditionally and would crash on a failed request).
    if status_code != 200:
        return info
    data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
    # BUG FIX: the attribute test needs '@' — [name = 'xzxk'] matched
    # child *elements* named "name", so no rows were ever selected.
    tr_list = data.xpath("//table[@id ='table_xzxk']//tr[@name = 'xzxk']")
    for i, singledata in enumerate(tr_list):
        td_list = singledata.xpath("./td")
        if len(td_list) == 0:
            continue
        temp = {}
        temp["name"] = ''
        temp["code"] = deal_html_code.remove_symbol(
            td_list[1].xpath("string(.)"))
        temp["filename"] = deal_html_code.remove_symbol(
            td_list[2].xpath("string(.)"))
        start_date = deal_html_code.remove_symbol(
            td_list[3].xpath("string(.)"))
        temp["start_date"] = deal_html_code.change_chinese_date(start_date)
        end_date = deal_html_code.remove_symbol(
            td_list[4].xpath("string(.)"))
        temp["end_date"] = deal_html_code.change_chinese_date(end_date)
        # Column order on the page: [5] licence content, [6] authority.
        temp["gov_dept"] = deal_html_code.remove_symbol(
            td_list[6].xpath("string(.)"))
        temp["content"] = deal_html_code.remove_symbol(
            td_list[5].xpath("string(.)"))
        info[i] = temp
    return info
def get_list(string): info = {} flag = 0 try: headers = config.headers_index content, status_code = Send_Request().send_request( config.index_url, headers) if status_code == 200: result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8')) id = result.xpath('//span[@class = "shouButton"]/@onclick')[0] pattern = re.compile(".*QueryIndex\('','(.*?)'\).*") match_id = re.findall(pattern, id)[0] url = config.list_url.format(match_id) #随机生成UA a = random.randrange(1, 1001) # 1-1000中生成随机数 headers = config.headers params = config.list_parmas.format(string) theline = linecache.getline(r'user-agent.txt', a) theline = theline.replace("\n", '') headers["User-Agent"] = theline result = requests.post(url, params, headers=headers) status_code = result.status_code s = chardet.detect(result.content)["encoding"] if status_code == 200 and s == 'utf-8': pattern = re.compile(u".*无查询结果.*|.*访问频繁.*|.*访问异常.*") match = re.findall(pattern, result.content) if len(match) == 0: content = etree.HTML( result.content, parser=etree.HTMLParser(encoding='utf-8')) list = content.xpath("//li") for i, single in enumerate(list): item = single.xpath(".//a/@href")[0] url = config.host + item info[i] = url flag = 1 else: flag = 100000003 else: flag = 100000004 else: flag = 10000004 except Exception, e: logging.error("search error:%s" % e) flag = 100000004
def get_info(self, url, url_pattern):
    """Fetch *url* and return its JSON "data" payload.

    Returns the payload list on success, or None when the request fails
    or the payload is empty (logging "暂无 <url_pattern>" in that case).
    """
    result, status_code = Send_Request().send_requests(url)
    data = None
    if status_code == 200:
        # BUG FIX: the body was parsed before the status check, so a
        # failed request crashed json.loads (or returned garbage).
        data = json.loads(result)["data"]
        if len(data) == 0:
            data = None
            logging.info('暂无 %s' % url_pattern)
    return data
def get_detail(info):
    """Fetch each URL in *info* and parse it.

    Returns {key: parsed detail} for the pages that downloaded
    successfully.
    """
    details = {}
    for key, target in info.items():
        body, status = Send_Request().send_request(target)
        if status == 200:
            details[key] = deal_single_info(body)
            # Throttle successful fetches to stay under the rate limit.
            time.sleep(0.5)
    return details
def name(url):
    """Scrape the paged outbound-investment <dl> at *url*.

    Returns (info, flag): info maps row index -> record; flag is 1 on
    success, 100000004 on failure.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    # BUG FIX: info must exist before the status branches — on a failed
    # request the old code reached `return info, flag` with info unbound
    # and raised NameError.
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class='viewBox']/dl")[0]
        if "企业名称" in content:
            # Total page count is embedded in the HTML as "共N页".
            pattern = re.compile(".*共(.*?)页.*")
            number = re.findall(pattern, content)
            if len(number) == 1:
                totalpage = int(number[0])
            else:
                totalpage = 0
            # The first page is always parsed from offset 0.
            deal_single_info(dl, info, 0)
            if totalpage > 1:
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                for k in xrange(2, totalpage + 1):
                    href = out_invest_url.format(entid, cid, k)
                    content, status_code = Send_Request().send_request(
                        href, headers)
                    if status_code == 200:
                        start = (k - 1) * 5 + 1  # 5 records per page
                        result = etree.HTML(
                            content,
                            parser=etree.HTMLParser(encoding='utf-8'))
                        dl = result.xpath(
                            "//div[@class='viewBox']/dl")[0]
                        deal_single_info(dl, info, start)
        else:
            flag = 100000004
    else:
        flag = 100000004
    if flag == 1:
        info = deal_html_code.remove_repeat(info)
    return info, flag
def get_detail(info):
    """Fetch each URL in *info* with the detail headers and parse it.

    Returns {key: parsed detail} for the pages that downloaded
    successfully.
    """
    details = {}
    request_headers = config.headers_detail
    for key, target in info.items():
        body, status = Send_Request().send_request(target, request_headers)
        if status == 200:
            details[key] = deal_single_info(body)
            # Throttle successful fetches to stay under the rate limit.
            time.sleep(0.5)
    return details
def get_deatail_info(self, detail_url, info):
    """Fill judicial-assistance ("司法协助") detail fields into *info*.

    Mutates *info* in place; logs and leaves it untouched when the
    detail page cannot be fetched. (Name keeps the historical
    "deatail" spelling for caller compatibility.)
    """
    # Renamed from `dict` to avoid shadowing the builtin.
    field_map = {
        u"执行事项": "items",
        u"裁定书文号": "rule_no",
        u"证照种类": "cert_cate",
        u"证照号码": "cert_code",
        u"冻结期限自": "start_date",
        u"冻结期限至": "end_date",
        u"冻结期限": "period",
        u"公示日期": "pub_date"
    }
    headers = config.headers
    result, status_code = Send_Request().send_requests(detail_url, headers)
    if status_code == 200:
        # BUG FIX: was result.xpath(result, ...) on the response *string*;
        # the body must first be parsed with etree.HTML.
        data = etree.HTML(result,
                          parser=etree.HTMLParser(encoding='utf-8'))
        # BUG FIX: iterating the dict directly yields keys only, which
        # cannot unpack into (key, value); use iteritems().
        for key, value in field_map.iteritems():
            info[value] = deal_html_code.get_match_info(key, data)
    else:
        logging.info("获取司法协助详情信息失败!")
def get_url_list(url):
    """Fetch the company index page and derive its section URLs.

    Returns (information, flag, url): the parsed basic info, a status
    flag (100000004 on failure), and the per-section URL dict (empty on
    failure).
    """
    result, status_code = Send_Request().send_requests(url)
    information = None
    flag = 100000004
    url_map = {}
    # BUG FIX: scan for failure markers only after a successful request —
    # on failure `result` may be None and re.findall would raise.
    if status_code == 200:
        pattern = re.compile(".*返回首页.*|.*'/index/invalidLink'.*")
        fail = re.findall(pattern, result)
        if len(fail) == 0:
            information, flag = get_basic_info(result, status_code)
            url_map = get_singleinfo_url(result)
    return information, flag, url_map
def get_html_data(url, print_url):
    """Fetch the basic-info page and the print page for one company.

    Returns (info, flag): info holds the matched basic fields plus
    person/branch text and the parsed print page under "report1"; flag
    is 1 on success, 100000004 when the basic page fails.
    """
    headers = config.headers
    info = {}
    result, status_code = Send_Request().send_requests(url, headers)
    if status_code == 200:
        flag = 1
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        # NOTE(review): `dict` here must be a module-level label->field
        # mapping defined elsewhere in this file (the builtin dict type
        # has no iteritems) — confirm it exists at module scope.
        for key, value in dict.iteritems():
            info[value] = deal_html_code.match_info(key, data)
        # The shareholder field may appear under any of three labels;
        # keep the first non-empty one.
        if info["shareholder1"] != '':
            info["shareholder"] = info["shareholder1"]
        elif info["shareholder2"] != '':
            info["shareholder"] = info["shareholder2"]
        elif info["shareholder3"] != '':
            info["shareholder"] = info["shareholder3"]
        # Finally drop the unwanted keys so later loops over info's
        # key/value pairs don't trip over them.
        del info["shareholder1"]
        del info["shareholder2"]
        del info["shareholder3"]
    else:
        flag = 100000004
        print '获取基本信息失败!'
    print_info, status_code = Send_Request().send_requests(print_url,
                                                           headers)
    if status_code == 200:
        print_data = etree.HTML(print_info,
                                parser=etree.HTMLParser(encoding='utf-8'))
        string = u'人员信息'
        info["person"] = deal_html_code.match_info(string, print_data)
        if info["person"] == '':
            # Fall back to the alternate section heading.
            string = u'成员信息'
            info["person"] = deal_html_code.match_info(string, print_data)
        string = u'分支机构'
        info["branch"] = deal_html_code.match_info(string, print_data)
        # Stash the whole parsed print page under info["report1"]; the
        # Report class later looks up the matching year's data from it.
        info["report1"] = print_data
    return info, flag
def get_preport_url(self, anCheId):
    """Build the sub-section URLs for one annual report (*anCheId*).

    Returns a dict with keys "permit", "web" and "base"; empty when the
    lookup request fails.
    """
    info = {}
    url = host + (
        '/corp-query-entprise-info-vAnnualPbReportBaseInfoForJs-%s.html'
        % anCheId)
    body, status = Send_Request().send_requests(url)
    if status == 200:
        payload = json.loads(body)
        # entType=17 selects the individual-business variant of the page.
        info["base"] = host + payload["vAnnPbAssetUrl"]
        info["web"] = host + payload["webSiteInfoUrl"] + "?entType=17"
        info["permit"] = host + payload["annulLicenceUrl"] + "?entType=17"
    return info
def name(self, url):
    """Parse mortgage-registration ("抵押") rows from *url*.

    Returns info: row index -> [code, dates, dept, person_name, number,
    cates, amount, ranges, period]; empty on request failure.
    """
    info = {}
    page, status = Send_Request().send_request(url)
    if status != 200:
        return info
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding='utf-8'))
    block = tree.xpath("//div[@class ='viewBox']//dl")[0]
    fragments = etree.tostring(block).split(
        '<dd style="border-top:1px dashed #ccc;">')
    # The trailing fragment carries no record data.
    fragments.remove(fragments[-1])
    for idx, fragment in enumerate(fragments):
        node = etree.HTML(fragment,
                          parser=etree.HTMLParser(encoding="utf-8"))
        # Registration number/date/authority are optional on the page;
        # test the raw HTML of the whole page before extracting each.
        code = (self.deal_dd_content(u'登记编号', node)
                if u"登记编号" in page else None)
        if u"登记日期" in page:
            dates = self.deal_dd_content(u"登记日期", node)
        else:
            dates = '0000-00-00'
        dept = (self.deal_dd_content(u"登记机关", node)
                if u"登记机关" in page else None)
        person_name = self.deal_dd_content(u"抵押权人名称", node)
        number = self.deal_dd_content(u"抵押权人注册号", node)
        cates = self.deal_dd_content(u"被担保债权种类", node)
        amount = self.deal_dd_content(u"被担保债权数额", node)
        ranges = self.deal_dd_content(u"担保范围", node)
        start_date = self.deal_dd_content(u"履行债务开始日期", node)
        end_date = self.deal_dd_content(u"履行债务结束日期", node)
        # The obligation period is reported as "start至end".
        period = start_date + '至' + end_date
        info[idx] = [code, dates, dept, person_name, number, cates,
                     amount, ranges, period]
    return info