def getData(self, docid):
    """Download one judgment document page by its DocID and hand it to handleWsData.

    :param docid: wenshu.court.gov.cn document id
    :return: whatever self.handleWsData returns for the fetched page
    """
    self.resetProxy()
    url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + docid
    referer = "http://wenshu.court.gov.cn/content/content?DocID=" + docid + "&KeyWord="
    header_1 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    # Retry forever, rotating the proxy on every failure, until the page downloads.
    while True:
        try:
            data = curlData(url, referer=referer, header=header_1, proxy_ip=self.proxy_ip, timeout=5)
            break
        except Exception as e:
            debug("数据获取出错,重新获取", True)
            self.resetProxy()
    tmp = data
    # Pull the embedded "dirData = {...};" JS object out of the page source.
    tmp = re.findall("dirData = ([\w\W]*?)};", tmp)
    tmp = tmp[0] + "}"  # restore the closing brace consumed by the regex
    # Delegate parsing of the extracted JS object to a local helper service.
    tmp = curlData("http://127.0.0.1:3000/handleFlfg", {"data": tmp})
    tmp = json.loads(tmp)
    # noinspection PyBroadException
    try:
        # Persist the authority list as a JSON string; fall back to "" when absent.
        tmp['legislative_authority'] = json.dumps(
            tmp['legislative_authority'])
    except:
        tmp['legislative_authority'] = ""
    return self.handleWsData(data, tmp)
def __get_url_curl(self, post, referer):
    """POST to the pelisplay player endpoint and return its payload.

    Retries with a refreshed cookie (bounded by self.cookie_get_num < 3)
    when the response is not valid JSON.

    :param post: form fields for procesar_player
    :param referer: referer URL to send with the request
    :return: the 'data' field of the JSON response, or "" on failure
    """
    headers = {
        "user-agent": getUserAgent(),
        "origin": "https://www.pelisplay.tv",
        "referer": referer
    }
    url = "https://www.pelisplay.tv/entradas/procesar_player"
    data = curlData(url, value=post, cookie=self.cookie, header=headers)
    try:
        data = json.loads(data)
    except Exception as e:
        # A non-JSON body is treated as an expired cookie: refresh it under
        # the lock and retry recursively.
        # NOTE(review): if get_cookie() raises, the lock is never released — confirm.
        lock.acquire()
        self.get_cookie()
        lock.release()
        if self.cookie_get_num < 3:
            return self.__get_url_curl(post, referer=referer)
        else:
            data = {"estado": 500}  # synthetic failure status after repeated attempts
            debug("播放链接获取出错,错误信息:{error}".format(error=e))
    if data['estado'] == 200:
        data = data['data']
    else:
        data = ""
    self.cookie_get_num = 0  # reset the retry counter for the next call
    return data
def getProxyIp(self):
    """
    Fetch a batch of proxy IPs and store them in law_proxy_ip for later use.
    :return: 1 when the daily extraction quota is exhausted, otherwise None
    """
    try:
        raw = curlData(self.config['proxy_ip_url'], timeout=5)
    except:
        raw = ""
    try:
        payload = json.loads(raw)
        if str(payload['ERRORCODE']) == "10032":
            # Daily quota reached: flag it so the caller can stop.
            debug("proxy ip 今日提取量已达上限,结束程序", True)
            self.ip_over = 1
            return 1
        for row_num, record in enumerate(payload['RESULT'], 1):
            record['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
            record['table'] = "proxy_ip"
            record['time_stamp'] = str(getNowTimeStamp())
            try:
                self.ws_db.insert(record, is_close_db=False)
                debug("第" + str(row_num) + "条ip插入成功")
            except Exception as e:
                debug("第" + str(row_num) + "条ip插入失败")
    except Exception as e:
        debug(e)
        debug("proxy ip 获取出错,睡眠5秒")
        sleep(5)
def getConstitutionList(self, cur_page):
    """Fetch one page of the statute list (category 03, zlsxid 12) via POST.

    :param cur_page: 1-based page index for the paginated search
    :return: raw response body
    """
    url = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
    form = {
        "pagesize": "20",
        "pageCount": "500",
        "curPage": cur_page,
        "resultSearch": "false",
        "lastStrWhere": " SFYX:(有效~已被修正~失效) ^(ZLSX:1111 ~ZLSX=01) ^ BMFL:(03) ^ SFFB=Y ",
        "bt": "",
        "flfgnr": "",
        "sxx": "有效,已被修正,失效",
        "zlsxid": "12",
        "bmflid": "",
        "xldj": "",
        "bbrqbegin": "2018-09-01",
        "bbrqend": "2018-12-17",
        "sxrqbegin": "",
        "sxrqend": "",
        "zdjg": "",
        "bbwh": ""
    }
    ua = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    }
    # The site expects the referer to be the action URL itself.
    return curlData(url=url, value=form, referer=url, header=ua)
def get_data(cls):
    """Download the configured domain's landing page.

    :return: raw page source
    """
    return curlData(settings.DOMAIN, open_virtual_ip=True)
def get_data(cls, item):
    """Download the content page for one list item.

    :param item: dict carrying the item's relative 'url'
    :return: raw page source
    """
    target = CommonFunc().generate_content_url(item['url'])
    return curlData(target, open_virtual_ip=True)
def get_data(self, item):
    """Download the page behind item['url'] using the stored session cookie.

    :param item: dict carrying an absolute 'url'
    :return: raw page source
    """
    return curlData(item['url'], cookie=self.cookie, open_virtual_ip=True)
def get_data(cls, category):
    """Download the listing page for one category.

    :param category: category keyword fed to the URL builder
    :return: raw page source
    """
    # NOTE: 'domian' is the (misspelled) keyword argument generate_url expects.
    target = GenerateUrl().generate_url(domian=category)
    return curlData(target, open_virtual_ip=True)
def __handle_category(cls):
    """Return the direct <li> children of the first "sub-menu" list on the index page."""
    page = curlData(CommonFunc().generate_url(), open_virtual_ip=True)
    soup = BeautifulSoup(page, "html.parser")
    menus = soup.find_all("ul", attrs={"class": "sub-menu"})
    # recursive=False keeps only direct children, excluding nested sub-menus.
    return menus[0].find_all("li", recursive=False)
def getNumAndGuid(self):
    """Obtain the guid/number token pair required by wenshu list queries.

    :return: {"guid": ..., "number": ...}; recurses from scratch if the guid
             fetch itself fails.
    """
    try:
        header_1 = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Origin": "http://wenshu.court.gov.cn/"
        }
        # The guid comes from a helper API that mimics the site's JS parameter generator.
        guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
        guid = json.loads(guid)
        guid = guid['guid']
        num_flag = 0
        try:
            number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", value={"guid": guid},
                              referer="http://wenshu.court.gov.cn", header=header_1,
                              proxy_ip=self.proxy_ip, timeout=5)
        except Exception as e:
            number = "remind"  # sentinel value: forces the retry loop below
        # "remind"/"html"/"服务不可用" in the body all mean the request was rejected;
        # keep retrying, rotating the proxy after 4 consecutive failures.
        while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
            if num_flag > 3:
                self.resetProxyIp()
                num_flag = 0
            else:
                num_flag = num_flag + 1
            debug("number获取出错,继续获取", True)
            try:
                number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", value={"guid": guid},
                                  referer="http://wenshu.court.gov.cn", header=header_1,
                                  proxy_ip=self.proxy_ip, timeout=5)
            except Exception as e:
                debug(e, True)
            sleep(0.5)
    except Exception as e:
        debug("guid获取出错")
        return self.getNumAndGuid()
    return {"guid": guid, "number": number}
def getNumber(self, guid, header_1, proxy_ip, referer="http://wenshu.court.gov.cn", cookie=False):
    """Obtain a validation "number" token from the wenshu GetCode endpoint.

    Retries while the response contains a rejection marker, refreshing the
    guid each round and rotating the proxy after 5 consecutive failures.

    :param guid: parameter-generator guid to exchange for a token
    :param header_1: headers to send with each request
    :param proxy_ip: proxy to route requests through
    :param referer: referer header value
    :param cookie: cookie dict, or False to send none
    :return: the token string
    """
    def _fetch(current_guid):
        # cookie=False means "no cookie"; only forward it when one was supplied.
        if not cookie:
            return curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": current_guid},
                            header=header_1, referer=referer, proxy_ip=proxy_ip, timeout=5)
        return curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": current_guid},
                        header=header_1, referer=referer, cookie=cookie,
                        proxy_ip=proxy_ip, timeout=5)

    num_flag = 0
    try:
        number = _fetch(guid)
    except Exception:
        number = "remind"  # sentinel value: forces the retry loop below
    # "remind"/"html"/"服务不可用" in the body all mean the request was rejected.
    while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
        if num_flag > 4:
            proxy_ip = GetProxyIp().getProxyIp()
            num_flag = 0
        else:
            num_flag = num_flag + 1
        debug("number获取出错,继续获取")
        try:
            guidGet = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
            # BUG FIX: the helper API returns a JSON *string*; it must be decoded
            # before indexing. Previously `guidGet['guid']` indexed a str, always
            # raised TypeError, and the guid was silently never refreshed.
            guid = json.loads(guidGet)['guid']
        except Exception:
            pass  # best effort: keep the old guid if the refresh fails
        try:
            number = _fetch(guid)
        except Exception as e:
            debug(e)
        sleep(0.5)
    # BUG FIX: the computed token was previously discarded (no return statement),
    # so callers always received None.
    return number
def moveDocid(start):
    """Migrate docid rows from the remote API into the local ws_docid table.

    :param start: offset passed to the remote export endpoint
    """
    endpoint = "http://api.tan90.club/ws_api/getWsDocid.html?start=" + str(start)
    # Retry until the remote batch downloads and parses cleanly.
    while True:
        try:
            rows = json.loads(curlData(url=endpoint, timeout=5))['data']
            break
        except Exception as err:
            debug(err, True)
            sleep(2)
    for row in rows:
        row['table'] = "ws_docid"
        row['id'] = "NEXT VALUE FOR LAW_WS_DOCID_SEQUENCE"
        ws_db.insert(row, is_close_db=False)
        debug("文书 %s => 迁移成功" % row['docid'], True)
def run(self):
    """Process the next batch of 100 ws_origin_content rows.

    Reads the resume cursor from ws_handle_where, advances it immediately,
    then parses each stored page and records failures in handle_error_id.
    """
    # Read the resume cursor (last processed id).
    last_id_list = self.ws_db.select(
        {
            "table": "ws_handle_where",
            "limit": [0, 1]
        }, is_close_db=False)
    try:
        last_id = last_id_list[0]['ws_id']
        end_id = last_id + 100
    except:
        exit(1)  # no cursor row: cannot proceed safely
    selectArr = {
        "table": "ws_origin_content",
        "condition": ['"id">=%s and "id"<%s' % (str(last_id), str(end_id))],
        "limit": [0, 100]
    }
    data = self.ws_db.select(selectArr, is_close_db=False)
    debug(str(last_id) + " - " + str(end_id))
    # Advance the cursor before processing so a parallel/next run won't repeat this batch.
    try:
        last_id_list[0]['table'] = "ws_handle_where"
        last_id_list[0]['ws_id'] = end_id
        self.ws_db.insert(last_id_list[0], True)
    except:
        exit(2)
    for i, v in enumerate(data):
        tmp = v['content']
        # Extract the embedded "dirData = {...};" JS object from the stored page.
        tmp = re.findall("dirData = ([\w\W]*?)};", tmp)
        tmp = tmp[0] + "}"  # restore the closing brace consumed by the regex
        # Delegate parsing of the JS object to the external helper service.
        tmp = curlData("http://www.wsapi.com/handleFlfg", {"data": tmp})
        tmp = json.loads(tmp)
        # noinspection PyBroadException
        try:
            tmp['legislative_authority'] = json.dumps(
                tmp['legislative_authority'])
        except:
            tmp['legislative_authority'] = ""
        result = self.handleWsData(v['content'], tmp)
        if result == 3:
            # Record unprocessable rows so they can be retried later.
            debug("第" + str(i) + "条数据处理出错,进入待处理数据库", True)
            try:
                errorArr = dict()
                errorArr['table'] = "handle_error_id"
                errorArr['id'] = v['id']
                self.ws_db.insert(errorArr, is_close_db=False)
            except Exception as e:
                debug(e, True)
def __handle_data(self, item):
    """Download one poster image and save it as static/images/<id>.jpg.

    :param item: dict with 'img_src' (site-relative path) and 'id'
    :return: {"code": 0}
    """
    url = "https://www.pelisplay.tv" + item['img_src']
    header = {
        "User-Agent": getUserAgent(),
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
    }
    data = curlData(url, header=header)
    # The file is opened in binary mode, so str payloads must be encoded first.
    # (Previously this was done via try/except around .encode, which only worked
    # by accident when curlData returned bytes.)
    if isinstance(data, str):
        data = data.encode("utf-8")
    with open("static/images/{id}.jpg".format(id=item['id']), "wb") as f:
        f.write(data)
        # Mark the row as downloaded while the file handle is still valid.
        self.__update_data(item)
    # FIX: removed the redundant f.close() — the with-block already closes the file.
    return {"code": 0}
def get_data(self, page):
    """Fetch one listing page for the current category, process it, and
    bump the shared progress counter under the global lock.

    :param page: page number to fetch
    :return: {"code": 0, "page": page}
    """
    global lock
    listing_url = CommonFunc().generate_url(page, self.category['keyword'])
    # Download and immediately hand off for parsing/storage.
    self.handle_data(curlData(listing_url, open_virtual_ip=True))
    with lock:
        self.handle_num += 1
    return {"code": 0, "page": page}
def get_image():
    """Flask view: proxy an image from pelisplay, preserving its extension's MIME type."""
    raw_url = request.values.get("url")
    if raw_url is None:
        # No url parameter: serve an empty jpg response.
        ext = "jpg"
        data = ""
    else:
        decoded = urllib.parse.unquote(raw_url)
        headers = {
            "Referer": "https://www.pelisplay.tv/",
            "User-Agent": getUserAgent(),
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
        }
        data = curlData("https://www.pelisplay.tv" + decoded, header=headers, open_virtual_ip=True)
        # Extension = everything after the last dot of the decoded path.
        ext = re.findall("[\w\W]*?\.([\w\W]*.)", decoded)[0]
    return Response(data, mimetype=get_image_type(ext))
def __get_video_src(self, item):
    """Extract the video file URL embedded in a JSON.parse(...) call on the page.

    :param item: dict carrying the page 'url'
    :return: the first playlist entry's 'file' URL, or "" on any failure
    """
    headers = {
        "User-Agent": getUserAgent(),
    }
    page = curlData(url=item['url'], header=headers, cookie=self.cookie)
    try:
        payload = re.findall("JSON\.parse\('([\w\W]*?)'\)\);", page)[0]
        # The inline JSON is backslash-escaped for the JS string literal.
        playlist = json.loads(payload.replace("\\", ""))
        src = playlist[0]['file']
    except Exception as err:
        src = ""
        debug(err)
    return src
def __get_recipe_page_data(self, url, recipe_category_id):
    """Scrape the pagination numbers of a recipe-category page and persist
    them (as JSON) into the category's row in the `type` table.

    :param url: category listing URL
    :param recipe_category_id: primary key of the category row to update
    """
    page_resource = curlData(url, open_virtual_ip=True)
    bs = BeautifulSoup(page_resource, "html.parser")
    page_ul = bs.find_all("ul", attrs={"class": "page-numbers"})
    # Drop "next page" style links so only numbered anchors remain.
    for v in page_ul[0]('a', attrs={"class": "next"}):
        v.extract()
    page_a = page_ul[0].find_all("a")
    page_span = page_ul[0].find("span")
    # IDIOM FIX: build the comma-separated list with str.join instead of the
    # previous manual k==0/else concatenation loop (identical output,
    # including the leading comma when page_a is empty).
    page_list = ",".join(str(v.get_text()).strip() for v in page_a)
    # The <span> holds the current page number; append it last.
    page_list = page_list + "," + page_span.get_text().strip()
    page_list = {"page_list": page_list}
    update_arr = {
        "table": "type",
        "set": {
            "page_num": json.dumps(page_list)
        },
        "condition": ['id={id}'.format(id=recipe_category_id)]
    }
    result = self.db.update(update_arr, is_close_db=False)
    if result == 1:
        debug("id为{id}的菜谱类型页面数据抓取成功".format(id=recipe_category_id))
    else:
        debug("id为{id}的菜谱类型页面数据抓取失败".format(id=recipe_category_id))
def getUrl(self):
    """Build the wenshu list URL for 最高人民法院 over a 5-day window starting
    at the date recorded in ws_docid_record.

    :return: the assembled list URL; recurses if the guid fetch fails
    """
    start_date = self.ws_db.select({"table": "ws_docid_record"}, is_close_db=False)
    start_date = start_date[0]['start_date']
    start_date = getTimeStamp(start_date, "%Y-%m-%d")
    end_date = start_date + 432000  # 432000 s = 5 days
    end_date = getDateTime(end_date, "%Y-%m-%d")
    court_name = "最高人民法院"
    try:
        guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
        guid = json.loads(guid)
        guid = guid['guid']
        number = ""
    except Exception as e:
        debug(e, True)
        debug("guid获取出错")
        return self.getUrl()
    # NOTE(review): start_date is still the numeric timestamp here — it is never
    # converted back with getDateTime like end_date is, so the CPRQ range quotes
    # a raw epoch value. Confirm whether that is intended.
    url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s&conditions=searchWord++CPRQ++%s%%20TO%%20%s" % (
        number, guid, urllib.parse.quote(
            str(court_name)), urllib.parse.quote(
            "法院名称:%s" % court_name), urllib.parse.quote(
            str(start_date)), urllib.parse.quote(str(end_date)))
    return url
def getProxyIp(self):
    """
    Fetch a batch of proxy IPs and store them into law_proxy_ip
    (threaded variant: serialized by mylock, signals shutdown via globals).
    :return: 0 when the daily quota is exhausted, otherwise None
    """
    mylock.acquire()
    url = self.config['proxy_ip_url']
    try:
        data = curlData(url, timeout=5)
    except:
        data = ""
    try:
        data = json.loads(data)
        if str(data['ERRORCODE']) == "10032":
            # Daily quota reached: release the lock and flag all threads to stop.
            debug("proxy ip 今日提取量已达上限,结束线程", True)
            global stop_thread
            global status
            mylock.release()
            status = 2
            stop_thread = True
            return 0
        for k, v in enumerate(data['RESULT']):
            v['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
            v['table'] = "proxy_ip"
            v['time_stamp'] = str(getNowTimeStamp())
            tmpK = 1 + k  # 1-based row number for log messages
            try:
                self.ws_db.insert(v, is_close_db=False)
                debug("第" + str(tmpK) + "条ip插入成功")
            except Exception as e:
                debug("第" + str(tmpK) + "条ip插入失败")
            # presumably a per-insert throttle — original placement ambiguous, TODO confirm
            sleep(1)
    except Exception as e:
        debug("proxy ip 获取出错,睡眠5秒")
        sleep(5)
    mylock.release()
def getConstitutionData(url):
    """Download and parse one statute detail page into a JSON string.

    The page carries a header table of label/value cells, a title div
    (class "bt") and a content div (id "content"); any extraction failure
    sets is_get_error=1 on the result.

    :param url: detail page URL (flfgByID.action with flfgID/zlsxid params)
    :return: json.dumps of the parsed record
    """
    flag = False  # becomes True on any extraction error
    # Retry the download until it succeeds.
    # NOTE(review): the URL string is also passed as the request body
    # (value=url) — confirm that is intentional.
    while True:
        try:
            data = curlData(url=url, value=url)
            break
        except:
            pass
    try:
        data = data.decode("utf-8")
    except:
        pass  # already a str
    handleDataAll = BeautifulSoup(data, "html.parser")
    handleData = handleDataAll.find_all("table")
    # DB column names, positionally parallel to the Chinese labels below.
    columns_list = [
        'type', "department_type", 'office', 'reference_num', 'issue_date',
        'execute_date', 'timeliness'
    ]
    columns_name_list = [
        '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:', '时 效 性:'
    ]
    # Header table: cells alternate label / value.
    try:
        table_data = handleData[0].find_all("td")
    except:
        table_data = "数据获取出错"
        flag = True
    type_data = dict()
    type_data['url'] = url
    for k, v in enumerate(table_data):
        try:
            if (k + 1) % 2 == 1:
                # Odd cell = label: map it to its column and take the next cell as value.
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = table_data[
                    k + 1].getText().strip()
        except:
            type_data[columns_list[columns_name_list.index(
                table_data[k].getText().strip())]] = "数据获取出错"
    # Title extraction.
    try:
        type_data['title'] = handleDataAll.find_all(
            "div", attrs={"class": "bt"})[0].getText().strip()
    except:
        type_data['title'] = "标题获取出错"
        flag = True
    # Body content extraction (kept as raw HTML).
    try:
        type_data['content'] = str(
            handleDataAll.find_all("div", attrs={"id": "content"})[0])
    except:
        flag = True
    type_data['province'] = ""
    if flag:
        type_data['is_get_error'] = 1
    else:
        type_data['is_get_error'] = 0
    return json.dumps(type_data)
type_data[columns_list[columns_name_list.index( table_data[k].getText().strip())]] = "数据获取出错" # 接下来获取标题和内容 try: type_data['title'] = handleDataAll.find_all( "div", attrs={"class": "bt"})[0].getText().strip() except: type_data['title'] = "标题获取出错" flag = True # 进行内容获取 try: type_data['content'] = str( handleDataAll.find_all("div", attrs={"id": "content"})[0]) except: flag = True type_data['province'] = "" if flag: type_data['is_get_error'] = 1 else: type_data['is_get_error'] = 0 return json.dumps(type_data) if __name__ == "__main__": url = "http://law.npc.gov.cn:8081/FLFG/flfgByID.action?flfgID=37416210&keyword=&showDetailType=QW&zlsxid=01" data = curlData("http://api.tan90.club/ws_api/createParam.html?url=" + urllib.parse.quote(url)) # data = curlData("http://127.0.0.1:8000/ws_api/createParam.html?url=" + urllib.parse.quote(url)) # data = curlData(url, referer=url) debug(data)
def getYear(self, court_name):
    """Build the list URL + cookie for a court and return the years that
    have documents (the 'Child' list of TreeContent entry 4).

    :param court_name: court whose document tree is queried
    :return: list of year nodes; recurses on guid/vjkl5 failures
    """
    # --- obtain guid and validation number ---
    header_1 = {
        "User-Agent": getUserAgent(index=self.user_agent_index),
        "Origin": "http://wenshu.court.gov.cn"
    }
    try:
        guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
        guid = json.loads(guid)
        guid = guid['guid']
        # number = ""
        num_flag = 0
        try:
            number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                              referer="http://wenshu.court.gov.cn",
                              proxy_ip=self.proxy_ip, timeout=5)
        except:
            number = "remind"  # sentinel value: forces the retry loop below
        # Rejection markers in the body mean the token was refused; retry,
        # rotating the proxy after 5 consecutive failures.
        while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
            if num_flag > 4:
                self.setProxyIp()
                num_flag = 0
            else:
                num_flag = num_flag + 1
            debug("number获取出错,继续获取")
            try:
                number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                                  referer="http://wenshu.court.gov.cn",
                                  proxy_ip=self.proxy_ip, timeout=5)
            except Exception as e:
                debug(e)
            sleep(0.5)
    except Exception as e:
        debug("guid获取出错")
        return self.getYear(court_name)
    # --- assemble the list URL and acquire its cookie ---
    url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s" % (
        number, guid, urllib.parse.quote(str(court_name)), urllib.parse.quote("法院名称:%s" % court_name))
    cookie = self.getCookie(url)
    while cookie == 0:
        self.setProxyIp()
        cookie = self.getCookie(url)
    try:
        vjkl5 = cookie['vjkl5']
    except:
        # vjkl5 missing from the cookie: rotate the proxy after 5 failures, then start over.
        debug("vjkl5获取失败,重新获取")
        if self.is_change_proxy > 4:
            self.setProxyIp()
            self.is_change_proxy = 0
        else:
            self.is_change_proxy = self.is_change_proxy + 1
        return self.getYear(court_name)
    # --- derive the query POST parameters from vjkl5 ---
    try:
        post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
    except:
        post = "{}"
        debug("post参数获取出错")
    post = json.loads(post)
    post['number'] = number
    post['guid'] = guid
    header = {
        "User-Agent": getUserAgent(index=self.user_agent_index),
        "Origin": "http://wenshu.court.gov.cn"
    }
    post['Param'] = "法院名称:%s" % court_name
    year = curlData("http://wenshu.court.gov.cn/List/TreeContent", post, url, cookie, header,
                    self.proxy_ip, timeout=5)
    try:
        # The response is double-encoded JSON, hence two loads calls.
        year = json.loads(year)
        year = json.loads(year)
    except:
        pass
    return year[4]['Child']
def get_page_resource(cls, url):
    """Download *url* through the virtual-IP channel and return the raw page."""
    return curlData(url, open_virtual_ip=True)
def getUrlAndCookieCaseType(self, court_name, start_date, end_date, case_type):
    """Assemble the wenshu list URL (court + date range + case type) and
    fetch the matching cookie and query parameters.

    :param court_name: court name filter
    :param start_date: lower bound of the judgment date (裁判日期)
    :param end_date: upper bound of the judgment date
    :param case_type: case-type filter
    :return: param dict with everything a worker thread needs
    """
    # --- obtain guid (the validation-number fetch is currently disabled;
    #     number is deliberately left empty) ---
    header_1 = {
        "User-Agent": getUserAgent(index=self.user_agent_index),
        "Origin": "http://wenshu.court.gov.cn"
    }
    try:
        num_flag = 0
        guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
        guid = json.loads(guid)
        guid = guid['guid']
        number = ""
    except:
        debug("guid获取出错")
        return self.getUrlAndCookieCaseType(court_name, start_date, end_date, case_type)
    # --- assemble the list URL ---
    url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s&conditions=searchWord++CPRQ++%s%%20TO%%20%s&conditions=searchWord+%s+AJLX++%s" % (
        number, guid, urllib.parse.quote(str(court_name)), urllib.parse.quote("法院名称:%s" % court_name),
        urllib.parse.quote(str(start_date)), urllib.parse.quote(str(end_date)),
        str(self.getCaseTypeIndex(case_type)), urllib.parse.quote(str("案件类型:%s" % case_type)))
    cookie = self.getCookie(url)
    while cookie == 0:
        self.setProxyIp()
        cookie = self.getCookie(url)
    try:
        vjkl5 = cookie['vjkl5']
    except:
        # vjkl5 missing from the cookie: rotate the proxy after 5 failures, then start over.
        debug("vjk5获取失败,重新获取")
        if self.is_change_proxy > 4:
            self.setProxyIp()
            self.is_change_proxy = 0
        else:
            self.is_change_proxy = self.is_change_proxy + 1
        return \
            self.getUrlAndCookieCaseType(court_name, start_date, end_date, case_type)
    # --- derive the query POST parameters from vjkl5 ---
    try:
        post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
    except:
        post = "{}"
        debug("post参数获取出错")
    post = json.loads(post)
    post['Order'] = "法院层级"
    post['Page'] = 20
    post['number'] = "wens"
    post['Direction'] = "asc"
    header = {
        "User-Agent": getUserAgent(index=self.user_agent_index),
        "Origin": "http://wenshu.court.gov.cn"
    }
    param = {
        "vjkl5": vjkl5,
        "post": post,
        "header": header,
        "cookie_all": cookie,
        "url": url,
        "court_name": court_name,
        "start_date": start_date,
        "end_date": end_date
    }
    param['post']['Param'] = "法院名称:%s,裁判日期:%s TO %s,案件类型:%s" % (court_name, start_date, end_date, case_type)
    return param
def decrypt(self, result_data):
    """Decode a ListContent response into insert-ready docid rows.

    :param result_data: raw (double-encoded) JSON string from the list API
    :return: list of row dicts on success; 2 on decode/protocol errors,
             4 when the docid string is empty while Count is non-zero
    """
    try:
        # Escape bare newlines, then undo the double JSON encoding.
        result_data = json.loads(result_data.replace("\n", "\/n"))
        result_data = json.loads(result_data)
    except Exception as e:
        if str(result_data).find("remind") != -1:
            debug("接口数据返回出错,返回数据为:" + str(result_data), True)
        result_data = list()
    post = dict()
    try:
        runEval = result_data[0]['RunEval']  # key required to decrypt the docids
        # Total number of documents matching this query.
        try:
            count = int(result_data[0]['Count'])
        except:
            debug(result_data, True)
            return 2
    except Exception as e:
        debug(result_data, True)
        debug(e, True)
        debug("RunEval获取出错", True)
        return 2
    length = len(result_data)
    docId = ""
    insertList = list()
    # Entries 1..n are the documents; collect their metadata and build a
    # comma-separated string of encrypted 文书ID values.
    for i in range(1, length):
        insertArr = dict()
        try:
            try:
                insertArr['title'] = result_data[i]['案件名称']
            except:
                insertArr['title'] = ""
            try:
                insertArr['case_type'] = result_data[i]['案件类型']
            except:
                insertArr['case_type'] = ""
            try:
                insertArr['cp_date'] = result_data[i]['裁判日期']
            except:
                insertArr['cp_date'] = ""
            try:
                insertArr['court_name'] = result_data[i]['法院名称']
            except:
                insertArr['court_name'] = ""
            try:
                insertArr['case_num'] = result_data[i]['案号']
            except:
                insertArr['case_num'] = ""
            try:
                insertArr['content'] = result_data[i]['裁判要旨段原文']
            except:
                insertArr['content'] = ""
            insertList.append(insertArr)
        except:
            pass
        try:
            if i >= length - 1:
                docId = docId + result_data[i]['文书ID']
            else:
                docId = docId + result_data[i]['文书ID'] + ","
        except Exception as e:
            debug("文书ID拼接出错,错误未知,暂时判断为无此条目 :" + e.__str__(), True)
            return 2
    if docId.strip() == "" and count != 0:
        return 4
    # Decrypt the docid string via the configured helper API.
    try:
        post['runEval'] = runEval
        post['docId'] = docId
        result_data = curlData(self.config['get_docid_api_url'], value=post)
        result_data = json.loads(result_data)
        result_data = result_data['data'].split(",")
        # return result_data
    except Exception as e:
        debug(e, True)
        return 2
    # Attach each decrypted docid to its collected metadata row.
    for k, v in enumerate(result_data):
        insertList[k]['table'] = "ws_docid"
        insertList[k]['docid'] = v
    return insertList
rootPath = os.path.split(curPath)[0] sys.path.append(rootPath) from tool import phoenix_db from tool.function import curlData, debug if __name__ == "__main__": ws_db = phoenix_db.DBConfig() start = 153806 table_columns = ws_db.getColumns({"table": "ws_origin_content"}) url = "http://api.tan90.club/ws_api/getWsOriginContent.html?start=" + str(start) # url = "http://127.0.0.1:8000/ws_api/getWsOriginContent.html?start=" + str(start) while True: try: debug("开始获取", True) data = curlData(url=url, timeout=20) data = json.loads(data) data = data['data'] break except Exception as e: debug(e, True) # noinspection PyBroadException try: curlData("http://autostart.tan90.club/", timeout=5) except: pass while True: try: for k, v in enumerate(data): v['table'] = "ws_origin_content" tmp = v['id']
def getData(self):
    """Fetch one page of the wenshu list and return it decrypted.

    :return: a json — whatever self.decrypt yields for the ListContent response
    """
    self.resetProxyIp()
    numAndGuid = self.getNumAndGuid()
    url = "http://wenshu.court.gov.cn/list/list/?sorttype=1" + "&number=" + numAndGuid['number'] + "&guid=" + \
          numAndGuid['guid']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    cookie_get_num = 0
    # Acquire the list cookie, rotating the proxy after 2 consecutive failures.
    while True:
        try:
            cookie = getCookie(url, header=header, referer="http://wenshu.court.gov.cn", proxy_ip=self.proxy_ip,
                               timeout=5)
            break
        except:
            if cookie_get_num > 1:
                self.resetProxyIp()
                cookie_get_num = 0
            cookie_get_num = cookie_get_num + 1
            debug("cookie获取失败,继续获取", True)
    # noinspection PyBroadException
    try:
        vjkl5 = cookie['vjkl5']
    except Exception as e:
        debug("vjk5获取失败,终止", True)
        return self.getData()
    # --- derive the query POST parameters from vjkl5 ---
    try:
        post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
    except Exception as e:
        post = "{}"
        debug("post参数获取出错", True)
    post = json.loads(post)
    post['Order'] = "法院层级"
    post['Page'] = 10
    post['number'] = numAndGuid['number']
    post['Direction'] = "asc"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Origin": "http://wenshu.court.gov.cn"
    }
    page = "1"
    param = {
        "vjkl5": vjkl5,
        "post": post,
        "header": header,
        "cookie_all": cookie,
        "url": url
    }
    param['post']['Param'] = self.param
    # --- fetch the list page, rotating the proxy after repeated failures ---
    is_get_new_proxy_ip = 0
    while True:
        try:
            param['post']['Index'] = page
            resultData = curlData("http://wenshu.court.gov.cn/List/ListContent", value=param['post'],
                                  referer=param['url'], cookie=param['cookie_all'], header=param['header'],
                                  proxy_ip=self.proxy_ip, timeout=5)
            break
        except Exception as e:
            if is_get_new_proxy_ip > 0:
                debug("重新获取代理ip", True)
                self.resetProxyIp()
                is_get_new_proxy_ip = 0
            else:
                debug("第" + str(page) + "页列表获取出错,尝试重新获取,尝试次数:" + str(is_get_new_proxy_ip), True)
                is_get_new_proxy_ip = is_get_new_proxy_ip + 1
            debug(e, True)
            # latin-1 encode errors indicate a malformed param set worth logging.
            if e.__str__().find("latin-1") != -1:
                debug(param, True)
    resultData = \
        self.decrypt(resultData)
    return resultData
def getConstitutionData(flfgID, zlsxid, province):
    """Download one statute detail page, parse it, and insert it into the
    constitutions table.

    :param flfgID: statute id for flfgByID.action
    :param zlsxid: document-attribute id for flfgByID.action
    :param province: province label stored alongside the record
    :return: the DB insert result
    """
    flag = False  # becomes True on any extraction error
    url = "http://law.npc.gov.cn:8081/FLFG/flfgByID.action"
    get = dict()
    get['flfgID'] = flfgID
    get['zlsxid'] = zlsxid
    get['keyword'] = ""
    get = urlencode(get)
    url = url + "?" + get
    data = curlData(url, get, url)
    try:
        data = data.decode("utf-8")
    except:
        pass  # already a str
    handleDataAll = BeautifulSoup(data, "html.parser")
    handleData = handleDataAll.find_all("table")
    # DB column names, positionally parallel to the Chinese labels below.
    columns_list = [
        'type', "department_type", 'office', 'reference_num', 'issue_date',
        'execute_date', 'timeliness'
    ]
    columns_name_list = [
        '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:', '时 效 性:'
    ]
    # Header table: cells alternate label / value.
    try:
        table_data = handleData[0].find_all("td")
    except:
        table_data = "数据获取出错"
        flag = True
    type_data = dict()
    type_data['url'] = url
    for k, v in enumerate(table_data):
        try:
            if (k + 1) % 2 == 1:
                # Odd cell = label: map it to its column and take the next cell as value.
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = table_data[
                    k + 1].getText().strip()
        except:
            type_data[columns_list[columns_name_list.index(
                table_data[k].getText().strip())]] = "数据获取出错"
    # Title extraction.
    try:
        type_data['title'] = handleDataAll.find_all(
            "div", attrs={"class": "bt"})[0].getText().strip()
    except:
        type_data['title'] = "标题获取出错"
        flag = True
    # Body content extraction (kept as raw HTML).
    try:
        type_data['content'] = str(
            handleDataAll.find_all("div", attrs={"id": "content"})[0])
    except:
        flag = True
    type_data['province'] = province
    if flag:
        type_data['is_get_error'] = 1
    else:
        type_data['is_get_error'] = 0
    DB = DBConfig()
    sql = DB.getInsertSql(type_data, "constitutions")
    result = DB.insert(sql)
    return result
def analysis(result_data):
    """Decode a ListContent response and return the decrypted docid list.

    :param result_data: raw JSON string from the list API
    :return: list of decrypted docids on success; 2 on decode/protocol
             errors, 3 when the result count is zero, 4 when the docid
             string is empty while Count is non-zero
    """
    try:
        # Escape bare newlines before decoding.
        result_data = json.loads(result_data.replace("\n", "\/n"))
    except Exception as e:
        result_data = ""
    post = dict()
    try:
        runEval = result_data[0]['RunEval']  # key required to decrypt the docids
        # Total number of documents matching this query.
        try:
            count = int(result_data[0]['Count'])
            # if count > 210:
            #     return 5
            # Zero matches: nothing to do.
            if count == 0:
                debug(result_data)
                return 3
        except:
            debug(result_data)
            return 2
    except Exception as e:
        debug("RunEval获取出错", True)
        return 2
    length = len(result_data)
    docId = ""
    insertList = list()
    # Entries 1..n are the documents; collect their metadata and build a
    # comma-separated string of encrypted 文书ID values.
    for i in range(1, length):
        insertArr = dict()
        insertArr['table'] = "ws_docid"
        insertArr['id'] = "NEXT VALUE FOR LAW_WS_DOCID_SEQUENCE"
        try:
            try:
                insertArr['title'] = result_data[i]['案件名称']
            except:
                insertArr['title'] = ""
            try:
                insertArr['case_type'] = result_data[i]['案件类型']
            except:
                insertArr['case_type'] = ""
            try:
                insertArr['cp_date'] = result_data[i]['裁判日期']
            except:
                insertArr['cp_date'] = ""
            try:
                insertArr['court_name'] = result_data[i]['法院名称']
            except:
                insertArr['court_name'] = ""
            try:
                insertArr['case_num'] = result_data[i]['案号']
            except:
                insertArr['case_num'] = ""
            try:
                insertArr['content'] = result_data[i]['裁判要旨段原文']
            except:
                insertArr['content'] = ""
            insertList.append(insertArr)
        except:
            pass
        try:
            if i >= length - 1:
                docId = docId + result_data[i]['文书ID']
            else:
                docId = docId + result_data[i]['文书ID'] + ","
        except Exception as e:
            debug("文书ID拼接出错,错误未知,暂时判断为无此条目 :" + e.__str__(), True)
            return 2
    if docId.strip() == "" and count != 0:
        return 4
    # Decrypt the docid string via the helper API; note insertList is built
    # but only the decrypted id list is returned here.
    try:
        post['runEval'] = runEval
        post['docId'] = docId
        result_data = curlData("http://www.wsapi.com/getDocId", value=post)
        result_data = json.loads(result_data)
        result_data = result_data['data'].split(",")
    except Exception as e:
        debug(e, True)
        return 2
    return result_data