def getData(self, docid):
     self.resetProxy()
     url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + docid
     referer = "http://wenshu.court.gov.cn/content/content?DocID=" + docid + "&KeyWord="
     header_1 = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
     }
     while True:
         try:
             data = curlData(url,
                             referer=referer,
                             header=header_1,
                             proxy_ip=self.proxy_ip,
                             timeout=5)
             break
         except Exception as e:
             debug("数据获取出错,重新获取", True)
             self.resetProxy()
     # the regex stops before the closing brace of dirData, so restore it
     tmp = re.findall(r"dirData = ([\w\W]*?)};", data)[0] + "}"
     tmp = curlData("http://127.0.0.1:3000/handleFlfg", {"data": tmp})
     tmp = json.loads(tmp)
     # noinspection PyBroadException
     try:
         tmp['legislative_authority'] = json.dumps(
             tmp['legislative_authority'])
     except:
         tmp['legislative_authority'] = ""
     return self.handleWsData(data, tmp)
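The fetch / fail / reset-proxy loop above recurs throughout these examples. A minimal standalone sketch of the same pattern, assuming only a caller-supplied fetch callable and reset callback (both names hypothetical):

def fetch_with_retries(fetch, reset, max_tries=5):
    """Call fetch() until it succeeds, running reset() after each failure."""
    for attempt in range(1, max_tries + 1):
        try:
            return fetch()
        except Exception:
            if attempt == max_tries:
                raise  # give up after max_tries failed attempts
            reset()  # e.g. rotate to a fresh proxy before retrying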
 def __get_url_curl(self, post, referer):
     """
     :param post:
     :param referer:
     :return:
     """
     headers = {
         "user-agent": getUserAgent(),
         "origin": "https://www.pelisplay.tv",
         "referer": referer
     }
     url = "https://www.pelisplay.tv/entradas/procesar_player"
     data = curlData(url, value=post, cookie=self.cookie, header=headers)
     try:
         data = json.loads(data)
     except Exception as e:
         lock.acquire()
         self.get_cookie()
         lock.release()
         if self.cookie_get_num < 3:
             return self.__get_url_curl(post, referer=referer)
         else:
             data = {"estado": 500}
             debug("播放链接获取出错,错误信息:{error}".format(error=e))
     if data['estado'] == 200:
         data = data['data']
     else:
         data = ""
     self.cookie_get_num = 0
     return data
Example #3
 def getProxyIp(self):
     """
     Fetch proxy IPs and store them in the database (law_proxy_ip) for later use
     :return:
     """
     url = self.config['proxy_ip_url']
     # url = "https://nl.tan90.club/test/testHeader.html"
     try:
         data = curlData(url, timeout=5)
     except:
         data = ""
     try:
         data = json.loads(data)
         if str(data['ERRORCODE']) == "10032":
             debug("proxy ip 今日提取量已达上限,结束程序", True)
             self.ip_over = 1
             return 1
         for k, v in enumerate(data['RESULT']):
             v['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
             v['table'] = "proxy_ip"
             v['time_stamp'] = str(getNowTimeStamp())
             tmp = k + 1
             try:
                 # acquire lock
                 self.ws_db.insert(v, is_close_db=False)
                 # release lock
                 debug("IP #" + str(tmp) + " inserted successfully")
             except Exception as e:
                 debug("Failed to insert IP #" + str(tmp))
     except Exception as e:
         debug(e)
         debug("proxy ip 获取出错,睡眠5秒")
         sleep(5)
Example #4
 def getConstitutionList(self, cur_page):
     url = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
     referer = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
     post = {
         "pagesize": "20",
         "pageCount": "500",
         "curPage": cur_page,
         "resultSearch": "false",
         # "lastStrWhere": "+SFYX:(有效)++^+ZLSX:(01~02~03~04~05~06~08~09~10~11~12~23)+NOT+TXTID=bj+^+SFFB=Y+",
         "lastStrWhere": "  SFYX:(有效~已被修正~失效) ^(ZLSX:1111 ~ZLSX=01)  ^ BMFL:(03)  ^ SFFB=Y ",
         "bt": "",
         "flfgnr": "",
         "sxx": "有效,已被修正,失效",
         # "sxx": "有效",
         "zlsxid": "12",
         "bmflid": "",
         "xldj": "",
         "bbrqbegin": "2018-09-01",
         "bbrqend": "2018-12-17",
         "sxrqbegin": "",
         "sxrqend": "",
         "zdjg": "",
         "bbwh": ""
     }
     header = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
     }
     data = curlData(url=url, value=post, referer=referer, header=header)
     return data
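For context, a hedged usage sketch of the method above, paging through the list action (the surrounding Crawler class name is hypothetical):

# Hypothetical usage: fetch the first three list pages and inspect their sizes.
crawler = Crawler()
for cur_page in range(1, 4):
    html = crawler.getConstitutionList(str(cur_page))
    print(cur_page, len(html))  # each response is an HTML fragment to parse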
Example #5
 def get_data(cls):
     """
     :return:
     """
     url = settings.DOMAIN
     data = curlData(url, open_virtual_ip=True)
     return data
Example #6
 def get_data(cls, item):
     """
     :param item:
     :return:
     """
     url = CommonFunc().generate_content_url(item['url'])
     data = curlData(url, open_virtual_ip=True)
     return data
Example #7
 def get_data(self, item):
     """
     :param item:
     :return:
     """
     url = item['url']
     page_resource = curlData(url, cookie=self.cookie, open_virtual_ip=True)
     return page_resource
Example #8
 def get_data(cls, category):
     """
     :param category:
     :return:
     """
     generate_url = GenerateUrl()
     url = generate_url.generate_url(domian=category)
     page_resource = curlData(url, open_virtual_ip=True)
     return page_resource
Example #9
 def __handle_category(cls):
     """
     :return:
     """
     url = CommonFunc().generate_url()
     page_resource = curlData(url, open_virtual_ip=True)
     bs_data = BeautifulSoup(page_resource, "html.parser")
     category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
     # only get the direct child li tags, excluding deeper descendants (hence recursive=False)
     return category_ul[0].find_all("li", recursive=False)
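A self-contained illustration of what recursive=False changes, run on a tiny inline document:

from bs4 import BeautifulSoup

html = "<ul><li>a<ul><li>a1</li></ul></li><li>b</li></ul>"
ul = BeautifulSoup(html, "html.parser").ul
print(len(ul.find_all("li")))                   # 3: includes the nested li
print(len(ul.find_all("li", recursive=False)))  # 2: direct children only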
Example #10
 def getNumAndGuid(self):
     """
     :return: a dict with the guid and the matching validation number
     """
     try:
         header_1 = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
             "Origin": "http://wenshu.court.gov.cn/"
         }
         guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
         guid = json.loads(guid)
         guid = guid['guid']
         num_flag = 0
         try:
             number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", value={"guid": guid},
                               referer="http://wenshu.court.gov.cn", header=header_1,
                               proxy_ip=self.proxy_ip, timeout=5)
         except Exception as e:
             number = "remind"
         while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
             if num_flag > 3:
                 self.resetProxyIp()
                 num_flag = 0
             else:
                 num_flag = num_flag + 1
             debug("number获取出错,继续获取", True)
             try:
                 number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", value={"guid": guid},
                                   referer="http://wenshu.court.gov.cn",
                                   header=header_1,
                                   proxy_ip=self.proxy_ip, timeout=5)
             except Exception as e:
                 debug(e, True)
             sleep(0.5)
     except Exception as e:
         debug("guid获取出错")
         return self.getNumAndGuid()
     return {"guid": guid, "number": number}
Example #11
 def getNumber(self, guid, header_1, proxy_ip, referer="http://wenshu.court.gov.cn", cookie=False):
     num_flag = 0
     try:
         if not cookie:
             number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                               referer=referer,
                               proxy_ip=proxy_ip, timeout=5)
         else:
             number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                               referer=referer, cookie=cookie,
                               proxy_ip=proxy_ip, timeout=5)
     except:
         number = "remind"
     while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
         if num_flag > 4:
             proxy_ip = GetProxyIp().getProxyIp()
             num_flag = 0
         else:
             num_flag = num_flag + 1
         debug("number获取出错,继续获取")
         try:
             guidGet = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
             guid = guidGet['guid']
         except:
             pass
         try:
             if not cookie:
                 number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                                   referer=referer,
                                   proxy_ip=proxy_ip, timeout=5)
             else:
                 number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                                   referer=referer, cookie=cookie,
                                   proxy_ip=proxy_ip, timeout=5)
         except Exception as e:
             debug(e)
         sleep(0.5)
     return number
Example #12
def moveDocid(start):
    url = "http://api.tan90.club/ws_api/getWsDocid.html?start=" + str(start)
    while True:
        try:
            data = curlData(url=url, timeout=5)
            data = json.loads(data)
            data = data['data']
            break
        except Exception as e:
            debug(e, True)
            sleep(2)
    for k, v in enumerate(data):
        v['table'] = "ws_docid"
        v['id'] = "NEXT VALUE FOR LAW_WS_DOCID_SEQUENCE"
        ws_db.insert(v, is_close_db=False)
        debug("文书 %s => 迁移成功" % v['docid'], True)
Example #13
 def run(self):
     last_id_list = self.ws_db.select(
         {
             "table": "ws_handle_where",
             "limit": [0, 1]
         }, is_close_db=False)
     try:
         last_id = last_id_list[0]['ws_id']
         end_id = last_id + 100
     except:
         exit(1)
     selectArr = {
         "table": "ws_origin_content",
         "condition":
         ['"id">=%s and "id"<%s' % (str(last_id), str(end_id))],
         "limit": [0, 100]
     }
     data = self.ws_db.select(selectArr, is_close_db=False)
     debug(str(last_id) + " - " + str(end_id))
     try:
         last_id_list[0]['table'] = "ws_handle_where"
         last_id_list[0]['ws_id'] = end_id
         self.ws_db.insert(last_id_list[0], True)
     except:
         exit(2)
     for i, v in enumerate(data):
         tmp = v['content']
         # the regex stops before the closing brace of dirData, so restore it
         tmp = re.findall(r"dirData = ([\w\W]*?)};", tmp)[0] + "}"
         tmp = curlData("http://www.wsapi.com/handleFlfg", {"data": tmp})
         tmp = json.loads(tmp)
         # noinspection PyBroadException
         try:
             tmp['legislative_authority'] = json.dumps(
                 tmp['legislative_authority'])
         except:
             tmp['legislative_authority'] = ""
         result = self.handleWsData(v['content'], tmp)
         if result == 3:
             debug("第" + str(i) + "条数据处理出错,进入待处理数据库", True)
             try:
                 errorArr = dict()
                 errorArr['table'] = "handle_error_id"
                 errorArr['id'] = v['id']
                 self.ws_db.insert(errorArr, is_close_db=False)
             except Exception as e:
                 debug(e, True)
Example #14
 def __handle_data(self, item):
     url = "https://www.pelisplay.tv" + item['img_src']
     header = {
         # "Referer": "https://www.pelisplay.tv/",
         "User-Agent": getUserAgent(),
         "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
     }
     data = curlData(url, header=header)
     with open("static/images/{id}.jpg".format(id=item['id']), "wb") as f:
         try:
             data = data.encode("utf-8")
         except Exception as e:
             debug(e)
         f.write(data)
         self.__update_data(item)
         f.close()
     return {"code": 0}
Example #15
 def get_data(self, page):
     """
     :param page:
     :return:
     """
     global lock
     url = CommonFunc().generate_url(page, self.category['keyword'])
     # fetch the page data
     page_resource = curlData(url, open_virtual_ip=True)
     # with open("tmp/recipe_list.txt", "rb") as f:
     #     page_resource = f.read().decode("utf-8")
     #     f.close()
     # process and store the data
     self.handle_data(page_resource)
     lock.acquire()
     self.handle_num = self.handle_num + 1
     lock.release()
     return {"code": 0, "page": page}
Example #16
def get_image():
    url = request.values.get("url")
    if url is not None:
        domain = "https://www.pelisplay.tv"
        url = urllib.parse.unquote(url)
        final_url = domain + url
        header = {
            "Referer": "https://www.pelisplay.tv/",
            "User-Agent": getUserAgent(),
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
        }
        data = curlData(final_url, header=header, open_virtual_ip=True)
        ext = re.findall(r"[\w\W]*?\.([\w\W]*.)", url)[0]
    else:
        ext = "jpg"
        data = ""
    # debug(data)
    return Response(data, mimetype=get_image_type(ext))
Example #17
 def __get_video_src(self, item):
     header = {
         # "Referer": "http://www.wyysdsa.com/",
         "User-Agent": getUserAgent(),
         # "Cache-Control": "max-age=0",
         # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
     }
     # url = "http://zeus.pelisplay.tv/embed/vip.php?u=Q1A5NUZJM1VDTWlUTk8wTEFmWGNQZDhnbWRIcmt6UVU0VGIxakpXOUF4Mi9yZW51Zi9yaXZlcXFoYnlwL3picC5hYm1uem4uampqLy86ZmNnZ3U&fondo_requerido="
     # url = "https://nl.tan90.club/test/testHeader.html"
     data = curlData(url=item['url'], header=header, cookie=self.cookie)
     # with open("tmp/content_detail.txt", "rb") as f:
     #     data = f.read().decode("utf-8")
     #     f.close()
     try:
         src = re.findall(r"JSON\.parse\('([\w\W]*?)'\)\);", data)[0]
         src = src.replace("\\", "")
         src = json.loads(src)
         src = src[0]['file']
     except Exception as e:
         src = ""
         debug(e)
     return src
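To make the extraction above concrete, a minimal sketch against a fabricated page snippet (the embedded JSON and URL are invented for illustration):

import json
import re

# fabricated sample of the script fragment this parser targets
page = "player(JSON.parse('[{\\\"file\\\":\\\"https://cdn.example/v.m3u8\\\"}]'));"
raw = re.findall(r"JSON\.parse\('([\w\W]*?)'\)\);", page)[0]
src = json.loads(raw.replace("\\", ""))[0]['file']
print(src)  # https://cdn.example/v.m3u8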
Example #18
 def __get_recipe_page_data(self, url, recipe_category_id):
     """
     :param url:
     :param recipe_category_id:
     :return:
     """
     page_resource = curlData(url, open_virtual_ip=True)
     # with open("tmp/category_page_data.txt", "rb") as f:
     #     page_resource = f.read().decode("utf-8")
     #     f.close()
     bs = BeautifulSoup(page_resource, "html.parser")
     page_ul = bs.find_all("ul", attrs={"class": "page-numbers"})
     # remove the next-page link so only numbered page anchors remain
     for k, v in enumerate(page_ul[0]('a', attrs={"class": "next"})):
         v.extract()
     page_a = page_ul[0].find_all("a")
     page_span = page_ul[0].find("span")
     page_list = ""
     for k, v in enumerate(page_a):
         if k == 0:
             page_list = page_list + str(v.get_text()).strip()
         else:
             page_list = page_list + "," + str(v.get_text()).strip()
     page_list = page_list + "," + page_span.get_text().strip()
     page_list = {"page_list": page_list}
     # update to mysql
     update_arr = {
         "table": "type",
         "set": {
             "page_num": json.dumps(page_list)
         },
         "condition": ['id={id}'.format(id=recipe_category_id)]
     }
     result = self.db.update(update_arr, is_close_db=False)
     if result == 1:
         debug("Page data for recipe category id {id} fetched successfully".format(id=recipe_category_id))
     else:
         debug("Failed to fetch page data for recipe category id {id}".format(id=recipe_category_id))
Example #19
 def getUrl(self):
     start_date = self.ws_db.select({"table": "ws_docid_record"},
                                    is_close_db=False)
     start_date = start_date[0]['start_date']
     # keep start_date as the date string for the url; only the 5-day offset needs a timestamp
     end_date = getTimeStamp(start_date, "%Y-%m-%d") + 432000
     end_date = getDateTime(end_date, "%Y-%m-%d")
     court_name = "最高人民法院"
     try:
         guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
         guid = json.loads(guid)
         guid = guid['guid']
         number = ""
     except Exception as e:
         debug(e, True)
         debug("guid获取出错")
         return self.getUrl()
     url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s&conditions=searchWord++CPRQ++%s%%20TO%%20%s" % (
         number, guid, urllib.parse.quote(
             str(court_name)), urllib.parse.quote(
                 "法院名称:%s" % court_name), urllib.parse.quote(
                     str(start_date)), urllib.parse.quote(str(end_date)))
     return url
Example #20
 def getProxyIp(self):
     """
     Fetch proxy IPs and store them in the database (law_proxy_ip) for later use
     :return:
     """
     mylock.acquire()
     url = self.config['proxy_ip_url']
     try:
         data = curlData(url, timeout=5)
     except:
         data = ""
     try:
         data = json.loads(data)
         if str(data['ERRORCODE']) == "10032":
             debug("proxy ip 今日提取量已达上限,结束线程", True)
             global stop_thread
             global status
             mylock.release()
             status = 2
             stop_thread = True
             return 0
         for k, v in enumerate(data['RESULT']):
             v['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
             v['table'] = "proxy_ip"
             v['time_stamp'] = str(getNowTimeStamp())
             tmpK = 1 + k
             try:
                 self.ws_db.insert(v, is_close_db=False)
                 debug("第" + str(tmpK) + "条ip插入成功")
             except Exception as e:
                 debug("第" + str(tmpK) + "条ip插入失败")
             sleep(1)
     except Exception as e:
         debug("proxy ip 获取出错,睡眠5秒")
         sleep(5)
     mylock.release()
Example #21
File: test.py  Project: guaidashu/getLaw
def getConstitutionData(url):
    # From inspecting the site, the constitution detail data url is the one below; it takes the get parameters flfgID, zlsxid and keyword, of which the first two are required and come from the js data passed by the list page
    flag = False
    # url = "http://210.82.32.100:8081/FLFG/flfgByID.action"
    # get = dict()
    # get['flfgID'] = flfgID
    # get['showDetailType'] = showDetailType
    # get['zlsxid'] = zlsxid
    # get['keyword'] = ""
    # get = urlencode(get)
    # url = url + "?" + get
    while True:
        try:
            data = curlData(url=url, referer=url)
            break
        except:
            pass
    try:
        data = data.decode("utf-8")
    except:
        pass
    # with open("constitution.txt", "wb") as f:
    #     f.write(data.encode("utf-8"))
    #     f.close()
    # with open("constitution.txt", "rb") as f:
    #     data = f.read().decode("utf-8")
    #     f.close()
    handleDataAll = BeautifulSoup(data, "html.parser")
    handleData = handleDataAll.find_all("table")
    columns_list = [
        'type', "department_type", 'office', 'reference_num', 'issue_date',
        'execute_date', 'timeliness'
    ]
    columns_name_list = [
        '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:', '时 效 性:'
    ]
    # Extract the basic header info
    try:
        table_data = handleData[0].find_all("td")
    except:
        table_data = []  # leave nothing to iterate and flag the record
        flag = True
    type_data = dict()
    type_data['url'] = url
    for k, v in enumerate(table_data):
        # cells alternate label/value: even index = label, odd index = value
        try:
            if (k + 1) % 2 == 1:
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = table_data[
                        k + 1].getText().strip()
        except:
            type_data[columns_list[columns_name_list.index(
                table_data[k].getText().strip())]] = "data fetch error"
    # Next, extract the title and content
    try:
        type_data['title'] = handleDataAll.find_all(
            "div", attrs={"class": "bt"})[0].getText().strip()
    except:
        type_data['title'] = "title fetch error"
        flag = True
    # Extract the body content
    try:
        type_data['content'] = str(
            handleDataAll.find_all("div", attrs={"id": "content"})[0])
    except:
        flag = True
    type_data['province'] = ""
    if flag:
        type_data['is_get_error'] = 1
    else:
        type_data['is_get_error'] = 0
    return json.dumps(type_data)
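The label/value pairing above assumes the detail table alternates label cells and value cells. A self-contained sketch of that pairing on a toy table:

from bs4 import BeautifulSoup

html = ("<table><tr><td>制定机关:</td><td>全国人大</td>"
        "<td>时 效 性:</td><td>有效</td></tr></table>")
cells = BeautifulSoup(html, "html.parser").find_all("td")
# even-indexed cells are labels, odd-indexed cells hold the matching values
pairs = {cells[k].getText().strip(): cells[k + 1].getText().strip()
         for k in range(0, len(cells), 2)}
print(pairs)  # {'制定机关:': '全国人大', '时 效 性:': '有效'}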
Example #22
File: test.py  Project: guaidashu/getLaw


if __name__ == "__main__":
    url = "http://law.npc.gov.cn:8081/FLFG/flfgByID.action?flfgID=37416210&keyword=&showDetailType=QW&zlsxid=01"
    data = curlData("http://api.tan90.club/ws_api/createParam.html?url=" +
                    urllib.parse.quote(url))
    # data = curlData("http://127.0.0.1:8000/ws_api/createParam.html?url=" + urllib.parse.quote(url))
    # data = curlData(url, referer=url)
    debug(data)
Example #23
 def getYear(self, court_name):
     """
     Assemble the url and obtain the cookie at the same time
     :param court_name:
     :return: every year that has data
     """
     # fetch the guid
     header_1 = {
         "User-Agent": getUserAgent(index=self.user_agent_index),
         "Origin": "http://wenshu.court.gov.cn"
     }
     try:
         guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
         guid = json.loads(guid)
         guid = guid['guid']
         # number = ""
         num_flag = 0
         try:
             number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                               referer="http://wenshu.court.gov.cn",
                               proxy_ip=self.proxy_ip, timeout=5)
         except:
             number = "remind"
         while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
             if num_flag > 4:
                 self.setProxyIp()
                 num_flag = 0
             else:
                 num_flag = num_flag + 1
             debug("number获取出错,继续获取")
             try:
                 number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
                                   referer="http://wenshu.court.gov.cn",
                                   proxy_ip=self.proxy_ip, timeout=5)
             except Exception as e:
                 debug(e)
             sleep(0.5)
     except Exception as e:
         debug("guid获取出错")
         return self.getYear(court_name)
     # assemble the url
     url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s" % (
         number, guid, urllib.parse.quote(str(court_name)), urllib.parse.quote("法院名称:%s" % court_name))
     cookie = self.getCookie(url)
     while cookie == 0:
         self.setProxyIp()
         cookie = self.getCookie(url)
     try:
         vjkl5 = cookie['vjkl5']
     except:
         debug("vjkl5获取失败,重新获取")
         if self.is_change_proxy > 4:
             self.setProxyIp()
             self.is_change_proxy = 0
         else:
             self.is_change_proxy = self.is_change_proxy + 1
         return self.getYear(court_name)
     try:
         post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
     except:
         post = "{}"
         debug("post参数获取出错")
     post = json.loads(post)
     post['number'] = number
     post['guid'] = guid
     header = {
         "User-Agent": getUserAgent(index=self.user_agent_index),
         "Origin": "http://wenshu.court.gov.cn"
     }
     post['Param'] = "法院名称:%s" % court_name
     year = curlData("http://wenshu.court.gov.cn/List/TreeContent", post, url, cookie, header, self.proxy_ip,
                     timeout=5)
     try:
         # the endpoint returns JSON that itself encodes another JSON string
         year = json.loads(year)
         year = json.loads(year)
     except:
         pass
     return year[4]['Child']
Example #24
 def get_page_resource(cls, url):
     data = curlData(url, open_virtual_ip=True)
     return data
Example #25
 def getUrlAndCookieCaseType(self, court_name, start_date, end_date, case_type):
     """
     Assemble the url and obtain the cookie at the same time
     :param court_name:
     :param start_date:
     :param end_date:
     :param case_type:
     :return: param, a dict holding every parameter the worker thread needs
     """
     # fetch the guid
     header_1 = {
         "User-Agent": getUserAgent(index=self.user_agent_index),
         "Origin": "http://wenshu.court.gov.cn"
     }
     try:
         num_flag = 0
         guid = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=0")
         guid = json.loads(guid)
         guid = guid['guid']
         number = ""
         # try:
         #     number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
         #                       referer="http://wenshu.court.gov.cn",
         #                       proxy_ip=self.proxy_ip, timeout=5)
         # except:
         #     number = "remind"
         # while number.find("remind") != -1 or number.find("html") != -1 or number.find("服务不可用") != -1:
         #     if num_flag > 4:
         #         self.setProxyIp()
         #         num_flag = 0
         #     else:
         #         num_flag = num_flag + 1
         #     debug("number获取出错,继续获取")
         #     try:
         #         number = curlData("http://wenshu.court.gov.cn/ValiCode/GetCode", {"guid": guid}, header=header_1,
         #                           referer="http://wenshu.court.gov.cn",
         #                           proxy_ip=self.proxy_ip, timeout=5)
         #     except Exception as e:
         #         debug(e)
         #     sleep(0.5)
     except:
         debug("guid获取出错")
         return self.getUrlAndCookieCaseType(court_name, start_date, end_date, case_type)
     # assemble the url
     url = "http://wenshu.court.gov.cn/list/list/?sorttype=1&number=%s&guid=%s&conditions=searchWord+%s+SLFY++%s&conditions=searchWord++CPRQ++%s%%20TO%%20%s&conditions=searchWord+%s+AJLX++%s" % (
         number, guid, urllib.parse.quote(str(court_name)), urllib.parse.quote("法院名称:%s" % court_name),
         urllib.parse.quote(str(start_date)), urllib.parse.quote(str(end_date)),
         str(self.getCaseTypeIndex(case_type)), urllib.parse.quote(str("案件类型:%s" % case_type)))
     cookie = self.getCookie(url)
     while cookie == 0:
         self.setProxyIp()
         cookie = self.getCookie(url)
     try:
         vjkl5 = cookie['vjkl5']
     except:
         debug("vjk5获取失败,重新获取")
         if self.is_change_proxy > 4:
             self.setProxyIp()
             self.is_change_proxy = 0
         else:
             self.is_change_proxy = self.is_change_proxy + 1
         return self.getUrlAndCookieCaseType(court_name, start_date, end_date, case_type)
     try:
         post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
     except:
         post = "{}"
         debug("post参数获取出错")
     post = json.loads(post)
     post['Order'] = "法院层级"
     post['Page'] = 20
     post['number'] = "wens"
     post['Direction'] = "asc"
     header = {
         "User-Agent": getUserAgent(index=self.user_agent_index),
         "Origin": "http://wenshu.court.gov.cn"
     }
     param = {
         "vjkl5": vjkl5,
         "post": post,
         "header": header,
         "cookie_all": cookie,
         "url": url,
         "court_name": court_name,
         "start_date": start_date,
         "end_date": end_date
     }
     param['post']['Param'] = "法院名称:%s,裁判日期:%s TO %s,案件类型:%s" % (court_name, start_date, end_date, case_type)
     return param
Example #26
 def decrypt(self, result_data):
     """
     :param result_data: raw response text from the list endpoint
     :return: a list of insert rows on success, or an error code (2 or 4)
     """
     try:
         result_data = json.loads(result_data.replace("\n", "\/n"))
         result_data = json.loads(result_data)
     except Exception as e:
         if str(result_data).find("remind") != -1:
             debug("接口数据返回出错,返回数据为:" + str(result_data), True)
         result_data = list()
     post = dict()
     try:
         runEval = result_data[0]['RunEval']
         # total number of documents matching this condition
         try:
             count = int(result_data[0]['Count'])
         except:
             debug(result_data, True)
             return 2
     except Exception as e:
         debug(result_data, True)
         debug(e, True)
         debug("RunEval获取出错", True)
         return 2
     length = len(result_data)
     docId = ""
     insertList = list()
     for i in range(1, length):
         item = result_data[i]
         # missing fields default to "" instead of aborting the row
         insertArr = {
             "title": item.get('案件名称', ""),
             "case_type": item.get('案件类型', ""),
             "cp_date": item.get('裁判日期', ""),
             "court_name": item.get('法院名称', ""),
             "case_num": item.get('案号', ""),
             "content": item.get('裁判要旨段原文', "")
         }
         insertList.append(insertArr)
         try:
             if i >= length - 1:
                 docId = docId + item['文书ID']
             else:
                 docId = docId + item['文书ID'] + ","
         except Exception as e:
             debug("Failed to join document IDs; unknown error, treating the entry as missing: " + e.__str__(), True)
             return 2
     if docId.strip() == "" and count != 0:
         return 4
     try:
         post['runEval'] = runEval
         post['docId'] = docId
         result_data = curlData(self.config['get_docid_api_url'], value=post)
         result_data = json.loads(result_data)
         result_data = result_data['data'].split(",")
         # return result_data
     except Exception as e:
         debug(e, True)
         return 2
     for k, v in enumerate(result_data):
         insertList[k]['table'] = "ws_docid"
         insertList[k]['docid'] = v
     return insertList
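The incremental docId concatenation above can also be written as a single join; a minimal equivalent sketch, minus the per-row error handling (assumes every row carries the '文书ID' key):

# build the comma-separated id list the decryption helper expects
doc_ids = [row['文书ID'] for row in result_data[1:]]
docId = ",".join(doc_ids)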
Example #27
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

from tool import phoenix_db
from tool.function import curlData, debug

if __name__ == "__main__":
    ws_db = phoenix_db.DBConfig()
    start = 153806
    table_columns = ws_db.getColumns({"table": "ws_origin_content"})
    url = "http://api.tan90.club/ws_api/getWsOriginContent.html?start=" + str(start)
    # url = "http://127.0.0.1:8000/ws_api/getWsOriginContent.html?start=" + str(start)
    while True:
        try:
            debug("开始获取", True)
            data = curlData(url=url, timeout=20)
            data = json.loads(data)
            data = data['data']
            break
        except Exception as e:
            debug(e, True)
    # noinspection PyBroadException
    try:
        curlData("http://autostart.tan90.club/", timeout=5)
    except:
        pass
    while True:
        try:
            for k, v in enumerate(data):
                v['table'] = "ws_origin_content"
                tmp = v['id']
Example #28
 def getData(self):
     """
     :return: the decrypted result from self.decrypt (a list, or an error code)
     """
     self.resetProxyIp()
     numAndGuid = self.getNumAndGuid()
     url = "http://wenshu.court.gov.cn/list/list/?sorttype=1" + "&number=" + numAndGuid['number'] + "&guid=" + \
           numAndGuid['guid']
     header = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
     }
     cookie_get_num = 0
     while True:
         try:
             cookie = getCookie(url, header=header, referer="http://wenshu.court.gov.cn", proxy_ip=self.proxy_ip,
                                timeout=5)
             break
         except:
             if cookie_get_num > 1:
                 self.resetProxyIp()
                 cookie_get_num = 0
             cookie_get_num = cookie_get_num + 1
             debug("cookie获取失败,继续获取", True)
     # noinspection PyBroadException
     try:
         vjkl5 = cookie['vjkl5']
     except Exception as e:
         debug("vjk5获取失败,终止", True)
         return self.getData()
     try:
         post = curlData("http://ws_api.xiezhi.sc.cn/getParam?vjkl5=" + vjkl5)
     except Exception as e:
         post = "{}"
         debug("post参数获取出错", True)
     post = json.loads(post)
     post['Order'] = "法院层级"
     post['Page'] = 10
     post['number'] = numAndGuid['number']
     post['Direction'] = "asc"
     header = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
         "Origin": "http://wenshu.court.gov.cn"
     }
     page = "1"
     param = {
         "vjkl5": vjkl5,
         "post": post,
         "header": header,
         "cookie_all": cookie,
         "url": url
     }
     param['post']['Param'] = self.param
     # fetch the data
     is_get_new_proxy_ip = 0
     while True:
         try:
             param['post']['Index'] = page
             resultData = curlData("http://wenshu.court.gov.cn/List/ListContent", value=param['post'],
                                   referer=param['url'],
                                   cookie=param['cookie_all'], header=param['header'],
                                   proxy_ip=self.proxy_ip, timeout=5)
             break
         except Exception as e:
             if is_get_new_proxy_ip > 0:
                 debug("Fetching a new proxy ip", True)
                 self.resetProxyIp()
                 is_get_new_proxy_ip = 0
             else:
                 debug("Failed to fetch list page " + str(page) + ", retrying, attempt: " + str(is_get_new_proxy_ip), True)
                 is_get_new_proxy_ip = is_get_new_proxy_ip + 1
             debug(e, True)
             if e.__str__().find("latin-1") != -1:
                 debug(param, True)
     resultData = self.decrypt(resultData)
     return resultData
Example #29
def getConstitutionData(flfgID, zlsxid, province):
    # From inspecting the site, the constitution detail data url is the one below; it takes the get parameters flfgID, zlsxid and keyword, of which the first two are required and come from the js data passed by the list page
    flag = False
    url = "http://law.npc.gov.cn:8081/FLFG/flfgByID.action"
    get = dict()
    get['flfgID'] = flfgID
    get['zlsxid'] = zlsxid
    get['keyword'] = ""
    get = urlencode(get)
    url = url + "?" + get
    data = curlData(url, get, url)
    try:
        data = data.decode("utf-8")
    except:
        pass
    # with open("constitution.txt", "wb") as f:
    #     f.write(data.encode("utf-8"))
    #     f.close()
    # with open("constitution.txt", "rb") as f:
    #     data = f.read().decode("utf-8")
    #     f.close()
    handleDataAll = BeautifulSoup(data, "html.parser")
    handleData = handleDataAll.find_all("table")
    columns_list = [
        'type', "department_type", 'office', 'reference_num', 'issue_date',
        'execute_date', 'timeliness'
    ]
    columns_name_list = [
        '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:', '时 效 性:'
    ]
    # Extract the basic header info
    try:
        table_data = handleData[0].find_all("td")
    except:
        table_data = []  # leave nothing to iterate and flag the record
        flag = True
    type_data = dict()
    type_data['url'] = url
    for k, v in enumerate(table_data):
        # cells alternate label/value: even index = label, odd index = value
        try:
            if (k + 1) % 2 == 1:
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = table_data[
                        k + 1].getText().strip()
        except:
            type_data[columns_list[columns_name_list.index(
                table_data[k].getText().strip())]] = "data fetch error"
    # Next, extract the title and content
    try:
        type_data['title'] = handleDataAll.find_all(
            "div", attrs={"class": "bt"})[0].getText().strip()
    except:
        type_data['title'] = "title fetch error"
        flag = True
    # Extract the body content
    try:
        type_data['content'] = str(
            handleDataAll.find_all("div", attrs={"id": "content"})[0])
    except:
        flag = True
    type_data['province'] = province
    if flag:
        type_data['is_get_error'] = 1
    else:
        type_data['is_get_error'] = 0
    DB = DBConfig()
    sql = DB.getInsertSql(type_data, "constitutions")
    result = DB.insert(sql)
    return result
Example #30
def analysis(result_data):
    try:
        result_data = json.loads(result_data.replace("\n", "\/n"))
    except Exception as e:
        result_data = ""
    post = dict()
    try:
        runEval = result_data[0]['RunEval']
        # total number of documents matching this condition
        try:
            count = int(result_data[0]['Count'])
            # if count > 210:
            #     return 5
            # if the count is 0, return immediately
            if count == 0:
                debug(result_data)
                return 3
        except:
            debug(result_data)
            return 2
    except Exception as e:
        debug("RunEval获取出错", True)
        return 2
    length = len(result_data)
    docId = ""
    insertList = list()
    for i in range(1, length):
        item = result_data[i]
        # missing fields default to "" instead of aborting the row
        insertArr = {
            "table": "ws_docid",
            "id": "NEXT VALUE FOR LAW_WS_DOCID_SEQUENCE",
            "title": item.get('案件名称', ""),
            "case_type": item.get('案件类型', ""),
            "cp_date": item.get('裁判日期', ""),
            "court_name": item.get('法院名称', ""),
            "case_num": item.get('案号', ""),
            "content": item.get('裁判要旨段原文', "")
        }
        insertList.append(insertArr)
        try:
            if i >= length - 1:
                docId = docId + item['文书ID']
            else:
                docId = docId + item['文书ID'] + ","
        except Exception as e:
            debug("Failed to join document IDs; unknown error, treating the entry as missing: " + e.__str__(), True)
            return 2
    if docId.strip() == "" and count != 0:
        return 4
    try:
        post['runEval'] = runEval
        post['docId'] = docId
        result_data = curlData("http://www.wsapi.com/getDocId", value=post)
        result_data = json.loads(result_data)
        result_data = result_data['data'].split(",")
    except Exception as e:
        debug(e, True)
        return 2
    return result_data