async def req_detail(self, session, docid, _retries=3):
    """Fetch the detail page for one case document asynchronously.

    Args:
        session: aiohttp.ClientSession used for the request.
        docid: document id of the case to fetch.
        _retries: internal retry budget. The original bare ``except``
            recursed without bound on persistent failures; this caps it.

    Returns:
        The parsed item from ``self.parse_detail`` on success, or the
        string "expect" on a non-200 response / exhausted retries.
    """
    import asyncio  # local import: only needed for the timeout exception

    if _retries <= 0:
        return "expect"
    try:
        detail_url = ('http://wenshu.court.gov.cn/CreateContentJS/'
                      'CreateContentJS.aspx?DocID=' + str(docid))
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
            "Cookie": self.new_cookie,
            "Referer": "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=%E6%A0%A1%E5%9B%AD%E8%B4%B7"
            .format(docid)
        }
        # The timeout is essential: without it a stalled connection hangs forever.
        async with session.get(
                detail_url,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=5)) as response:
            if response.status != 200:
                return "expect"
            detail_resp = await response.text()
            if "请开启JavaScript并刷新该页" in detail_resp:
                # Anti-bot challenge page: refresh the cookie and retry with
                # a fresh session.
                # NOTE(review): the replaced session is never closed here —
                # pre-existing leak, left as-is to preserve caller behavior.
                session = aiohttp.ClientSession(cookies=get_cookie_dict())
                self.new_cookie = get_cookie()
                return await self.req_detail(session, docid, _retries - 1)
            return self.parse_detail(detail_resp)
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Narrowed from the original bare ``except:``; retry a bounded
        # number of times instead of recursing forever.
        return await self.req_detail(session, docid, _retries - 1)
def req_detail(self, docid, _retries=3):
    """Fetch and parse the detail page for one case document (blocking).

    Args:
        docid: document id of the case to fetch.
        _retries: internal retry budget. The original bare ``except``
            recursed without bound on persistent failures; this caps it.

    Returns:
        The parsed item from ``self.parse_detail``, or None once the
        retry budget is exhausted.
    """
    if _retries <= 0:
        return None  # give up instead of recursing forever
    try:
        detail_url = ('http://wenshu.court.gov.cn/CreateContentJS/'
                      'CreateContentJS.aspx?DocID=' + str(docid))
        response = requests.get(
            url=detail_url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
                "Cookie": self.new_cookie,
                "Referer": "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=%E6%A0%A1%E5%9B%AD%E8%B4%B7"
                .format(docid)
            },
            allow_redirects=False
            # proxies=proxies
        )
        detail_resp = response.text
        if "请开启JavaScript并刷新该页" in detail_resp:
            # Anti-bot challenge page: refresh the cookie and retry.
            self.new_cookie = get_cookie()
            return self.req_detail(docid, _retries - 1)
        return self.parse_detail(detail_resp)
    except requests.RequestException:
        # Narrowed from the original bare ``except:`` so programming errors
        # (e.g. inside parse_detail) surface instead of looping silently.
        return self.req_detail(docid, _retries - 1)
def get_list(self, index, _retries=5):
    """Fetch one page of the search-result list.

    Args:
        index: 1-based page number to fetch.
        _retries: internal retry budget. The original handler recursed
            without bound on persistent failures; this caps it.

    Returns:
        The decoded JSON payload for the page, or None once the retry
        budget is exhausted.
    """
    if _retries <= 0:
        return None
    res_list = None  # so the except-handler can tell whether the POST ran
    try:
        headers = {
            'Connection': 'keep-alive',
            'Cookie': self.new_cookie,
            'Host': 'wenshu.court.gov.cn',
            'Origin': 'http://wenshu.court.gov.cn',
            'Referer': "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=T648SFX6",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        }
        vjkl5 = re.search('vjkl5=(.*?);', self.new_cookie).group(1)
        url = 'http://wenshu.court.gov.cn/List/ListContent'
        data = {
            'Param': '全文检索:{}'.format(self.search_word),
            'Index': str(index),
            'Page': '20',  # server-side maximum is 20 per page
            'Order': '法院层级',
            'Direction': 'asc',
            'vl5x': self.vl5x_js.call('getKey', vjkl5),
            'number': 'T648',  # random.random(),
            'guid': self.guid
        }
        res_list = requests.post(url=url, headers=headers, data=data)
        if "请开启JavaScript并刷新该页" in res_list.text:
            # Anti-bot challenge page: refresh the cookie and retry.
            self.new_cookie = get_cookie()
            return self.get_list(index, _retries - 1)
        # HACK: eval() on server-supplied content is dangerous (arbitrary
        # code execution if the endpoint is hostile) — kept because the
        # payload is a Python-style literal, but should move to a safe
        # parser such as ast.literal_eval.
        if len(eval(json.loads(res_list.text))[0]) < 2:
            # Empty/placeholder payload: retry.
            return self.get_list(index, _retries - 1)
        json_str = json.loads(res_list.text.strip('"').replace('\\', ''))
        # print(json_str)
        return json_str
    except Exception:
        # res_list is None when the POST itself failed; the original code
        # raised NameError here and masked the real error.
        if res_list is not None:
            print(res_list.text)
        print(traceback.format_exc())
        return self.get_list(index, _retries - 1)
def get_count(self, _retries=5):
    """Fetch the total number of matching cases for ``self.search_word``.

    Posts a one-row query to the ListContent endpoint and reads the
    ``Count`` field of the first element. Stores the value on
    ``self.count`` and returns it.

    Args:
        _retries: internal retry budget. The original bare ``except``
            recursed without bound on persistent failures; this caps it.

    Returns:
        int: total case count, or None once the retry budget is exhausted.
    """
    if _retries <= 0:
        return None
    try:
        headers = {
            'Connection': 'keep-alive',
            # headers require the cookie as a single string; a separate
            # ``cookies=`` argument would need a dict instead.
            'Cookie': self.new_cookie,
            'Host': 'wenshu.court.gov.cn',
            'Origin': 'http://wenshu.court.gov.cn',
            'Referer': "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=T648SFX6",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        }
        # vjkl5 is extracted from the cookie string; with a cookie dict it
        # would be self.new_cookie['vjkl5'].
        vjkl5 = re.search('vjkl5=(.*?);', self.new_cookie).group(1)
        url = 'http://wenshu.court.gov.cn/List/ListContent'
        vl5x = self.vl5x_js.call('getKey', vjkl5)
        data = {
            'Param': '全文检索:{}'.format(self.search_word),
            'Index': '1',
            'Page': '1',
            'Order': '法院层级',
            'Direction': 'asc',
            'vl5x': vl5x,
            'number': 'T648',  # random.random(),
            'guid': self.guid
        }
        res = requests.post(url=url, headers=headers, data=data)
        json_str = json.loads(res.text.strip('"').replace('\\', ''))
        print(json_str)
        self.count = int(json_str[0][u'Count'])
        # print(self.count)
        return self.count
    except Exception:
        # Request or parse failed — refresh the cookie and retry, a
        # bounded number of times (original bare except looped forever).
        print("get_count expect")
        self.new_cookie = get_cookie()
        return self.get_count(_retries - 1)
def __init__(self, search_word, js_dir='D:/File/Wenshu_bs/spider'):
    """Initialize the spider: compile helper JS and fetch the first cookie.

    Args:
        search_word: full-text search keyword to query for.
        js_dir: directory holding the helper JavaScript files. Defaults
            to the original hard-coded path, so existing callers are
            unaffected, but other machines can now pass their own path.
    """
    def _compile(filename):
        # One JS file -> one compiled execjs context.
        with open('{}/{}'.format(js_dir, filename), encoding='utf-8') as f:
            return execjs.compile(f.read())

    self.vl5x_js = _compile('get_vl5x.js')    # computes the vl5x token
    self.docid_js = _compile('get_docid.js')  # decrypts document ids
    self.guid_js = _compile('get_guid.js')    # generates the request guid
    self.guid = self.guid_js.call('getGuid')
    self.search_word = search_word
    # self.mt = mongoStore(self.search_word)
    self.new_cookie = get_cookie()
    self.count = self.get_count()
# NOTE(review): a fully commented-out experiment (list-page request and
# homepage script scraping) was removed here; it never executed.

# Probe an obfuscated (WZWS-encoded) content URL and inspect the redirect.
url3 = "http://wenshu.court.gov.cn/WZWSREL2NvbnRlbnQvY29udGVudD9Eb2NJRD0yNDU0MTc4MC0xY2Q4LTRjNGItYTk0Mi1hODUxMDBhYzYwNWQmS2V5V29yZD0lRTYlQTAlQTElRTUlOUIlQUQlRTglQjQlQjc="
headers = {
    'Connection': 'keep-alive',
    'Cookie': get_cookie(),
    'Host': 'wenshu.court.gov.cn',
    'Origin': 'http://wenshu.court.gov.cn',
    'Referer': "http://wenshu.court.gov.cn/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
}
# allow_redirects=False so the 302's Location header is observable; with
# the default (True) requests follows the redirect and the final response
# carries no Location header.
res3 = requests.get(url=url3, headers=headers, allow_redirects=False)
print(res3.headers)
# Original ``print(location = res3.headers['Location'])`` raised
# TypeError: 'location' is not a valid keyword for print().
location = res3.headers.get('Location')
print(location)