Пример #1
0
 async def req_detail(self, session, docid):
     '''Fetch and parse the detail page of one case document (async).

     :param session: aiohttp.ClientSession used for the HTTP request.
     :param docid: case document id appended to the detail URL.
     :return: parsed item from ``self.parse_detail`` on success, the
         string ``"expect"`` on a non-200 response; otherwise retries
         itself after refreshing the anti-crawler cookie.
     '''
     try:
         detail_url = 'http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=' + str(
             docid)
         headers = {
             "User-Agent":
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
             "Cookie":
             self.new_cookie,
             "Referer":
             "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=%E6%A0%A1%E5%9B%AD%E8%B4%B7"
             .format(docid)
         }
         async with session.get(
                 # the timeout here is essential: without it a stalled
                 # response would hang this coroutine forever
                 detail_url,
                 headers=headers,
                 timeout=aiohttp.ClientTimeout(total=5)) as response:
             if response.status == 200:
                 detail_resp = await response.text()
                 if "请开启JavaScript并刷新该页" in detail_resp:
                     # Anti-crawler challenge page: refresh the cookie and
                     # retry with a fresh session.
                     # NOTE(review): the previous session is abandoned
                     # without being closed (connection leak); it belongs
                     # to the caller, so it is not closed here either.
                     session = aiohttp.ClientSession(
                         cookies=get_cookie_dict())
                     self.new_cookie = get_cookie()
                     return await self.req_detail(session, docid)
                 else:
                     item = self.parse_detail(detail_resp)
                     return item
             else:
                 return "expect"
     except Exception:
         # BUG FIX: was a bare ``except:`` which also swallowed
         # asyncio.CancelledError / KeyboardInterrupt, making the task
         # uncancellable. Narrowed to Exception; retry behavior kept.
         # NOTE(review): retry recursion is unbounded — a permanently
         # failing docid eventually raises RecursionError.
         return await self.req_detail(session, docid)
Пример #2
0
 def req_detail(self, docid):
     '''Fetch and parse the detail page of one case document (blocking).

     :param docid: case document id appended to the detail URL.
     :return: parsed item from ``self.parse_detail``; retries itself with
         a refreshed cookie when the anti-crawler page is returned, and
         on any request/parse error.
     '''
     try:
         detail_url = 'http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=' + str(
             docid)
         response = requests.get(
             url=detail_url,
             headers={
                 "User-Agent":
                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
                 "Cookie":
                 self.new_cookie,
                 "Referer":
                 "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=%E6%A0%A1%E5%9B%AD%E8%B4%B7"
                 .format(docid)
             },
             # redirects are the anti-crawler bounce — do not follow them
             allow_redirects=False
             # proxies=proxies
         )
         detail_resp = response.text
         if "请开启JavaScript并刷新该页" in detail_resp:
             # anti-crawler challenge page: refresh the cookie and retry
             self.new_cookie = get_cookie()
             return self.req_detail(docid)
         else:
             item = self.parse_detail(detail_resp)
             return item
     except Exception:
         # BUG FIX: was a bare ``except:`` which also caught
         # KeyboardInterrupt/SystemExit, making the crawler unstoppable.
         # NOTE(review): retry recursion is unbounded — a permanently
         # failing docid eventually raises RecursionError.
         return self.req_detail(docid)
Пример #3
0
    def get_list(self, index):
        '''Fetch one page (up to 20 rows) of the search-result list.

        :param index: page number to fetch.
        :return: the decoded JSON payload of the page; retries itself
            when the anti-crawler page comes back, when the payload is
            empty, or on any request/decode error.
        '''
        # Keep res_list visible in the except handler even when
        # requests.post itself raises (it used to be unbound there).
        res_list = None
        try:
            headers = {
                'Connection':
                'keep-alive',
                'Cookie':
                self.new_cookie,
                'Host':
                'wenshu.court.gov.cn',
                'Origin':
                'http://wenshu.court.gov.cn',
                'Referer':
                "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=T648SFX6",
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
            }
            # vjkl5 cookie value feeds the vl5x anti-crawler token below
            vjkl5 = re.search('vjkl5=(.*?);', self.new_cookie).group(1)
            url = 'http://wenshu.court.gov.cn/List/ListContent'

            data = {
                'Param': '全文检索:{}'.format(self.search_word),
                'Index': str(index),
                'Page': '20',  # 20 is the server-side maximum
                'Order': '法院层级',
                'Direction': 'asc',
                'vl5x': self.vl5x_js.call('getKey', vjkl5),
                'number': 'T648',  # random.random(),
                'guid': self.guid
            }

            res_list = requests.post(url=url, headers=headers, data=data)
            if "请开启JavaScript并刷新该页" in res_list.text:
                # anti-crawler challenge page: refresh cookie and retry
                self.new_cookie = get_cookie()
                return self.get_list(index)
            # SECURITY: ``eval`` on server-controlled text can execute
            # arbitrary code — this should be a JSON parse instead.
            elif len(eval(json.loads(res_list.text))[0]) < 2:
                # empty payload — retry the same page
                return self.get_list(index)
            else:
                json_str = json.loads(
                    res_list.text.strip('"').replace('\\', ''))
                return json_str
        except Exception:
            # BUG FIX: printing res_list.text unguarded raised a second
            # NameError here whenever the POST itself had failed.
            if res_list is not None:
                print(res_list.text)
            print(traceback.format_exc())
            # NOTE(review): unbounded retry recursion on persistent errors
            return self.get_list(index)
Пример #4
0
    def get_count(self):
        '''Fetch the total number of cases matching ``self.search_word``.

        Issues a 1-row list request and reads the ``Count`` field from
        the first element of the decoded payload.

        :return: total case count as int; stored in ``self.count``.
            Retries itself with a refreshed cookie on any error.
        '''
        try:
            headers = {
                'Connection':
                'keep-alive',
                'Cookie':
                self.new_cookie,  # headers require the cookie as a string;
                # a dict form would only work via the ``cookies=`` kwarg
                'Host':
                'wenshu.court.gov.cn',
                'Origin':
                'http://wenshu.court.gov.cn',
                'Referer':
                "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=T648SFX6",
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
            }
            # vjkl5 cookie value feeds the vl5x anti-crawler token
            vjkl5 = re.search('vjkl5=(.*?);',
                              self.new_cookie).group(1)
            # vjkl5 = self.new_cookie['vjkl5']  # dict-form cookie variant
            url = 'http://wenshu.court.gov.cn/List/ListContent'
            vl5x = self.vl5x_js.call('getKey', vjkl5)

            data = {
                'Param': '全文检索:{}'.format(self.search_word),
                'Index': '1',
                'Page': '1',
                'Order': '法院层级',
                'Direction': 'asc',
                'vl5x': vl5x,
                'number': 'T648',  # random.random(),
                'guid': self.guid
            }
            res = requests.post(url=url, headers=headers, data=data)
            json_str = json.loads(res.text.strip('"').replace('\\', ''))
            print(json_str)
            self.count = int(json_str[0][u'Count'])
            return self.count
        except Exception:
            # BUG FIX: was a bare ``except:`` (also caught
            # KeyboardInterrupt). On failure, refresh the cookie and retry.
            # NOTE(review): unbounded retry recursion on persistent errors.
            print("get_count expect")
            self.new_cookie = get_cookie()
            return self.get_count()
Пример #5
0
    def __init__(self, search_word, js_dir='D:/File/Wenshu_bs/spider'):
        '''Compile the anti-crawler JS helpers and prime the session state.

        :param search_word: full-text search keyword for all requests.
        :param js_dir: directory holding the helper JS files; defaults to
            the original hard-coded path for backward compatibility.
        '''
        def _load_js(name):
            # one place for the repeated open/read boilerplate
            with open('{}/{}'.format(js_dir, name), encoding='utf-8') as f:
                return f.read()

        self.vl5x_js = execjs.compile(_load_js('get_vl5x.js'))
        self.docid_js = execjs.compile(_load_js('get_docid.js'))
        self.guid_js = execjs.compile(_load_js('get_guid.js'))
        self.guid = self.guid_js.call('getGuid')
        self.search_word = search_word
        # self.mt = mongoStore(self.search_word)
        self.new_cookie = get_cookie()
        # get_count() also caches the total into self.count
        self.count = self.get_count()
Пример #6
0
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
#         }
# url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=UVGSXWVJ&guid=aabc33f0-863f-b511e395-6b23ddeda3f3&conditions=searchWord+QWJS+++%E5%85%A8%E6%96%87%E6%A3%80%E7%B4%A2:%E6%A0%A1%E5%9B%AD%E8%B4%B7"
#
# headers = {
#             'Connection': 'keep-alive',
#             # 'Cookie': self.new_cookie,
#             'Host': 'wenshu.court.gov.cn',
#             'Origin': 'http://wenshu.court.gov.cn',
#             'Referer':"http://wenshu.court.gov.cn/",
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
#         }
# url2 = "http://wenshu.court.gov.cn"
# res2 = requests.get(url2,headers=headers)
# raw_func = re.findall(r'<script type="text/javascript">(.*)</script>',res2.text,re.DOTALL)[0]
# print(res2.cookies)
# print(res2.text)
# print(raw_func)

# Probe a WZWS-encoded content URL and inspect the redirect target.
url3 = "http://wenshu.court.gov.cn/WZWSREL2NvbnRlbnQvY29udGVudD9Eb2NJRD0yNDU0MTc4MC0xY2Q4LTRjNGItYTk0Mi1hODUxMDBhYzYwNWQmS2V5V29yZD0lRTYlQTAlQTElRTUlOUIlQUQlRTglQjQlQjc="
headers = {
    'Connection': 'keep-alive',
    'Cookie': get_cookie(),
    'Host': 'wenshu.court.gov.cn',
    'Origin': 'http://wenshu.court.gov.cn',
    'Referer': "http://wenshu.court.gov.cn/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
}
res3 = requests.get(url=url3, headers=headers)
print(res3.headers)
# BUG FIX: ``print(location = ...)`` passed an invalid keyword argument to
# print() and raised TypeError. Assign first, then print. ``.get`` is used
# because redirects are followed here, so the final response may carry no
# Location header (the old ['Location'] lookup would KeyError).
location = res3.headers.get('Location')
print(location)