import pandas as pd
import requests
from bs4 import BeautifulSoup

# Assumed to be defined elsewhere in the module: base request headers
# without a Cookie entry (e.g. a User-Agent for the CBIRC site).
# cbirc_headers_without_cookie_2019 = {"User-Agent": "Mozilla/5.0 ..."}


def bank_page_list(page=5):
    """
    Fetch the specified number of listing pages from the CBIRC website.
    http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/1.html
    :param page: int, fetch pages 1 through `page` (inclusive)
    :return: tuple of (pd.DataFrame, dict), the scraped titles and URLs plus
             the request headers (with cookie) used; the DataFrame can be
             saved as a CSV file
    """
    big_url_list = []
    big_title_list = []
    flag = True
    cbirc_headers = cbirc_headers_without_cookie_2019.copy()
    for i_page in range(1, page + 1):  # pages 1 through `page`, inclusive
        print(i_page)  # progress indicator
        main_url = "http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/{}.html".format(
            i_page)
        if flag:
            # First request: capture the session cookie from the Set-Cookie
            # response header and echo it back on every later request.
            res = requests.get(main_url, headers=cbirc_headers)
            cbirc_headers.update(
                {"Cookie": res.headers["Set-Cookie"].split(";")[0]})
            flag = False
        # Fetch the listing page with the session cookie attached.
        res = requests.get(main_url, headers=cbirc_headers)
        soup = BeautifulSoup(res.text, "lxml")
        url_list = [
            item.find("a")["href"]
            for item in soup.find_all(attrs={"class": "zwbg-2"})
        ]
        title_list = [
            item.find("a").get_text()
            for item in soup.find_all(attrs={"class": "zwbg-2"})
        ]
        big_url_list.extend(url_list)
        big_title_list.extend(title_list)
    # Two-column frame: column 0 holds titles, column 1 holds URLs.
    temp_df = pd.DataFrame([big_title_list, big_url_list]).T
    return temp_df, cbirc_headers
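
A minimal usage sketch, assuming the function is called as-is; the column
names and output file name below are illustrative, not from the original:

# Hypothetical usage: fetch the first 5 listing pages and save them as CSV,
# as the docstring suggests.
bank_df, cbirc_headers = bank_page_list(page=5)
bank_df.columns = ["title", "url"]
bank_df.to_csv("cbirc_bank_penalty_list.csv", index=False, encoding="utf-8-sig")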
Example 2
import execjs  # PyExecJS; requires a JavaScript runtime such as Node.js
import pandas as pd
import requests
from bs4 import BeautifulSoup


def bank_page_list(page=5):
    """
    Fetch the specified number of listing pages from the CBIRC website,
    solving the JavaScript cookie challenge served on the first request.
    http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/1.html
    :param page: int, fetch pages 1 through `page` (inclusive)
    :return: tuple of (pd.DataFrame, dict), the scraped titles and URLs plus
             the request headers (with cookies) used; the DataFrame can be
             saved as a CSV file
    """
    big_url_list = []
    big_title_list = []
    flag = True
    cbirc_headers = cbirc_headers_without_cookie_2019.copy()
    for i_page in range(1, page + 1):  # pages 1 through `page`, inclusive
        print(i_page)  # progress indicator
        main_url = "http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/{}.html".format(
            i_page)
        if flag:
            # First request: capture the plain session cookie from the
            # Set-Cookie response header.
            res = requests.get(main_url, headers=cbirc_headers)
            temp_cookie = res.headers["Set-Cookie"].split(";")[0]
            cbirc_headers.update({"Cookie": temp_cookie})
            res = requests.get(main_url, headers=cbirc_headers)
            soup = BeautifulSoup(res.text, "lxml")
            # The body is a JavaScript anti-bot challenge. Wrap it in a
            # function and replace eval with return so that execjs hands the
            # generated script back to Python instead of running it.
            res_html = ("function getClearance(){" +
                        soup.find_all("script")[0].get_text() + "};")
            res_html = res_html.replace("</script>", "")
            res_html = res_html.replace("eval", "return")
            res_html = res_html.replace("<script>", "")
            ctx = execjs.compile(res_html)
            # The generated script sets the clearance cookie through either
            # firstChild.cookie or document.cookie; cut out the assignment
            # and wrap it so that it returns the cookie string.
            if "firstChild.cookie" in ctx.call("getClearance"):
                over_js = ("function getClearance2(){var a" +
                           ctx.call("getClearance").split("firstChild.cookie")
                           [1].split("Path=/;'")[0] + "Path=/;';return a;};")
            if "document.cookie" in ctx.call("getClearance"):
                over_js = ("function getClearance2(){var a" +
                           ctx.call("getClearance").split("document.cookie")
                           [1].split("Path=/;'")[0] + "Path=/;';return a;};")
            # Stub out the headless-browser probes so the script does not
            # bail out when run under a plain JavaScript runtime.
            over_js = over_js.replace("window.headless", "''")
            over_js = over_js.replace("window['_p'+'hantom']", "''")
            over_js = over_js.replace("window['__p'+'hantom'+'as']", "''")
            over_js = over_js.replace("window['callP'+'hantom']", "''")
            over_js = over_js.replace("return(", "eval(")
            ctx = execjs.compile(over_js)
            cookie_2 = ctx.call("getClearance2").split(";")[0]
            # Send both cookies (session + JS clearance) from now on.
            cbirc_headers.update({"Cookie": temp_cookie + ";" + cookie_2})
            flag = False
        # Fetch the listing page with the accumulated cookies attached.
        res = requests.get(main_url, headers=cbirc_headers)
        soup = BeautifulSoup(res.text, "lxml")
        url_list = [
            item.find("a")["href"]
            for item in soup.find_all(attrs={"class": "zwbg-2"})
        ]
        title_list = [
            item.find("a").get_text()
            for item in soup.find_all(attrs={"class": "zwbg-2"})
        ]
        big_url_list.extend(url_list)
        big_title_list.extend(title_list)
    # Two-column frame: column 0 holds titles, column 1 holds URLs.
    temp_df = pd.DataFrame([big_title_list, big_url_list]).T
    return temp_df, cbirc_headers
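
The core trick above is worth seeing in isolation: the challenge page
eval()s a generated script that assigns a clearance cookie, so swapping
eval for return lets PyExecJS hand the generated source back to Python for
slicing. A minimal sketch with an invented toy challenge (the cookie name
and script body are illustrative, not the site's real challenge):

import execjs

# Invented stand-in for the site's challenge script: it eval()s code that
# would set a clearance cookie in a real browser.
toy_challenge = ("eval(\"document.cookie="
                 "'wzws_cid=abc123; Path=/;'; location.reload();\")")

# Step 1: replace eval with return so calling the wrapper yields the
# generated source instead of executing it.
stage_one = ("function getClearance(){" +
             toy_challenge.replace("eval", "return") + "};")
generated = execjs.compile(stage_one).call("getClearance")

# Step 2: cut out the cookie assignment and wrap it to return the value.
stage_two = ("function getClearance2(){var a" +
             generated.split("document.cookie")[1].split("Path=/;'")[0] +
             "Path=/;';return a;};")
cookie = execjs.compile(stage_two).call("getClearance2").split(";")[0]
print(cookie)  # wzws_cid=abc123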