Python request_url示例，InformationGet.InternetConnect.request_url Python示例

示例#1

0

显示文件

文件： GetFrequentQuestion.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

 def get_answer_text(self, turn_page_url):
     """Return the stripped text of the answer block on a detail page.

     The page at ``turn_page_url`` is fetched with ``request_url`` and the
     text of its ``<div class="question_a">`` element is returned.

     Any exception raised while fetching or parsing is logged on
     ``self.thread_logger`` and an empty string is returned instead.
     """
     try:
         response = request_url(turn_page_url)
         # Decode the body with the charset detected from its content.
         response.encoding = response.apparent_encoding
         answer_div = BeautifulSoup(response.text, "lxml").find(
             "div", class_="question_a")
         return str(answer_div.text).strip()
     except Exception as err:
         self.thread_logger.error("答句%s抓取失败,失败原因%s" % (turn_page_url, err))
         return ""

示例#2

0

显示文件

文件： GetPlanInfo.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_plan_info_ustc():
    """Scrape the USTC enrollment-plan pages and store one table per page.

    The index page ``/12993/list.htm`` is fetched and the ``href`` of every
    ``<area>`` element on it is followed.  For each linked page the rows of
    the table inside ``div.wp_articlecontent`` are read, the header row and
    the total/subtotal rows ("合计"/"小计") are dropped, and the remaining
    rows are written with ``write_table`` under the name ``<year>-<district>``.

    The year and district are sliced out of the page title (first four
    characters, and characters 5 up to the last four).
    NOTE(review): this presumably relies on a fixed title layout — confirm
    against the live pages.
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Rows whose first cell is one of these are totals, not majors.
    summary_labels = ("合计", "小计")
    # For these districts the first two cells are merged into the major name
    # and the head count is read from the third cell.
    merged_name_districts = ("浙江", "上海")

    # Fetch the index page that links to the per-district plan pages.
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    for area in main_page_soup.find_all("area"):
        page_source = request_url(area["href"])
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))

        rows = page_soup.find("div", class_="wp_articlecontent").find_all("tr")
        all_lines = [[cell.text for cell in row] for row in rows]

        table_content = []
        # all_lines[0] is the table's own header row.
        for line in all_lines[1:]:
            if line[0] in summary_labels:
                continue
            if district in merged_name_districts:
                table_content.append(
                    [line[0] + "(" + line[1] + ")", "理工", line[2]])
            else:
                table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + "的招生计划已存入文件")

示例#3

0

显示文件

文件： GetFrequentQuestion.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_undergraduate_university_info():
    """Crawl the chsi.com.cn university listing and pickle the results.

    The number of result pages is read from the pager of the first page
    (the sibling following ``li.lip.dot``).  Every result page is then
    rendered through ``selenium_chrome``, the ``ch-table`` element is parsed
    and each data row becomes a dict keyed by the table's header texts plus
    a ``"url"`` entry pointing at the university's detail page.

    The list of dicts is printed and written with ``pickle`` to
    ``Information/大学/university_info``.
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(
        main_page_soup.find("li", class_="lip dot").next_sibling.text)
    # The "start" query parameter advances by this many entries per page.
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("页面抓取进度(%d,%d)" % (i_page + 1, page_count))
        function_logger.info("页面url%s" % page_url)
        browser = selenium_chrome(page_url)
        # Always shut the browser down, even when the table lookup fails,
        # so a failed page does not leave a browser process behind.
        # NOTE(review): find_element_by_class_name is the Selenium 3 API;
        # confirm the pinned Selenium version before upgrading.
        try:
            page_source = browser.find_element_by_class_name(
                "ch-table").get_attribute("innerHTML")
        finally:
            browser.quit()
        page_soup = BeautifulSoup(page_source, "lxml")
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        print(head)
        # The first <tr> is the header row collected above.
        for tr in page_soup.find_all("tr")[1:]:
            td_list = tr.find_all("td")
            info = {
                "url": "https://gaokao.chsi.com.cn" + td_list[0].find("a")["href"]
            }
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            # Column 5: drop line breaks and plain spaces, then turn the
            # EN SPACE separators into ordinary spaces.
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(
                " ", "").replace("\u2002", " ")
            # Column 6: a private-use glyph (U+E664) marks "yes"; an empty
            # cell means "no".
            flag_text = td_list[6].text.strip()
            info[head[6]] = flag_text.replace("\ue664", "有") if flag_text != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        print(info)
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)

示例#4

0

显示文件

文件： GetFrequentQuestion.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_consultation_forum_id():
    """Add a ``"forum_id"`` entry to every pickled university record.

    The records are loaded from ``Information/大学/university_info``.  For
    each record the page at ``info["url"]`` is fetched and the forum id is
    cut out of the ``href`` of the ``a.ch-btn.zx-question`` link (the last
    ``-``-separated part, minus its final six characters).  When anything
    goes wrong for a record its forum id is stored as ``""``.

    The updated list is pickled back to the same file, and records whose
    ``"院校特性"`` mentions both "985" and "211" are printed.
    """
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for info in university_infos:
        print(info)
        try:
            page_source = request_url(info["url"])
            page_source.encoding = page_source.apparent_encoding
            forum_href = BeautifulSoup(page_source.text, "lxml").find(
                "a", class_="ch-btn zx-question")["href"]
            forum_id = forum_href.split("-")[-1][:-6]
            print(forum_id)
        except Exception:
            # Best effort: a university without a usable consultation link
            # gets an empty id.  Only Exception is caught so Ctrl-C and
            # SystemExit still stop the crawl.
            forum_id = ""
        info["forum_id"] = forum_id
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
    for info in university_infos:
        if "985" in info["院校特性"] and "211" in info["院校特性"]:
            print(info)

示例#5

0

显示文件

def get_question_yggk():
    """Crawl the consultation forums of a fixed list of universities.

    For every ``[name, forum_id]`` pair a CSV file
    ``Information/大学/Test/<name>常用问题集.csv`` is created with a header
    row, then each listing page of the forum is fetched and its
    question/answer pairs (title, asker, time, question, answer) are appended
    to that file.  Rows of the listing table are consumed two at a time: a
    header row followed by a row holding the question and answer text.  When
    a text is truncated with a "[详细]" link, the linked page is fetched to
    obtain the full text.

    Pinned (top) questions are recorded only from the first page.  The
    function sleeps three seconds after each listing page.
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Site root; forum listing and detail URLs are built relative to it.
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", "26232"], ["哈尔滨工业大学", "26617"],
                   ["北京大学医学部", "6405529"], ["上海交通大学", "6217"],
                   ["上海交通大学医学院", "61811"], ["清华大学", "36710"],
                   ["复旦大学", "7243"], ["南京大学", "4453"],
                   ["浙江大学", "43617"], ["中国科学技术大学", "6280"],
                   ["哈尔滨工业大学(威海)", "62646117"],
                   ["西安交通大学", "53593"]]
    table_head = ["标题", "来源", "时间", "问题", "回答"]
    # Number of questions per listing page (drives the "start-" offset).
    page_question_count = 15
    # Compiled once; used to squeeze whitespace out of full-page answers.
    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")

    def fetch_detail_text(href, div_class):
        """Fetch a detail page and return the raw text of the given div."""
        turn_page_source = request_url(main_url + href)
        turn_page_source.encoding = turn_page_source.apparent_encoding
        turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
        return str(turn_page_soup.find("div", class_=div_class).text)

    for school in school_urls:
        function_logger.info("开始抓取" + school[0] + "的招生问题数据...")
        csv_path = file_path + "/" + school[0] + "常用问题集.csv"
        list_url_prefix = (
            main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-")
        # Create (or reset) the school's CSV and write the header row.
        # newline="" is what the csv module requires of files it writes to.
        with open(csv_path, "w", newline="", encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(table_head)
        main_page_source = request_url(list_url_prefix + "0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of listing pages, read from the pager.
        page_count = int(main_page_soup.find(
            "li", class_="lip dot").next_sibling.a.string)
        # Number of pinned questions shown at the top of the listing.
        top_question_count = len(
            main_page_soup.find("table", class_="ch-table zx-table").find_all(
                "span", class_="question_top_txt"))
        # NOTE(review): page index 10 is skipped and indices 0-9 are always
        # requested even when page_count < 10.  Kept exactly as found —
        # confirm whether this is intentional.
        for i_page in list(range(10)) + list(range(11, page_count)):
            page_url = list_url_prefix + str(
                i_page * page_question_count) + ".dhtml"
            function_logger.info("页面抓取进度(%d,%d)" %
                                 (i_page + 1, page_count))
            function_logger.info("页面url%s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            table = page_soup.find("table", class_="ch-table zx-table")
            # Build a filtered copy instead of removing items from the list
            # being iterated, which skips the element after each removal.
            tr_list = [node for node in table.contents if node != "\n"]
            records = []
            # Pinned Q&A pairs (two rows each) are recorded on page 0 only.
            start_index = 0 if i_page == 0 else top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                header_row = tr_list[i_qa_pair]
                question_title = str(header_row.find(
                    "a", class_="question_t_txt").string).strip()
                function_logger.debug("标题:%s" % question_title)
                question_from = str(header_row.find(
                    "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("来源:%s" % question_from)
                question_time = str(header_row.find(
                    "td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("时间:%s" % question_time)

                detail_row = tr_list[i_qa_pair + 1]
                # Long questions/answers are truncated on the listing page
                # and must be read from the linked detail page.
                question_text_class = detail_row.find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    question_text = fetch_detail_text(
                        question_text_class.find("a", text='[详细]')["href"],
                        "question").strip()
                function_logger.debug("问题:%s" % question_text)

                answer_text_class = detail_row.find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    answer_text = re.sub(
                        pattern, "",
                        fetch_detail_text(
                            answer_text_class.find("a", text='[详细]')["href"],
                            "question_a")).replace("[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            with open(csv_path, "a", newline="", encoding='utf-8') as csvfile:
                csv.writer(csvfile).writerows(records)
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕！" % school[0])

示例#6

0

显示文件

文件： GetPlanInfo.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_plan_info_xjtu():
    """Query the XJTU enrollment-plan form for every year/district pair.

    The selectable years (``select#nf``) and districts (``select#sf``,
    skipping its first option) are read from the query page.  For each
    combination the search form is POSTed, the rows of the table inside
    ``div#fybt`` are parsed, and the major (column 0), category (column 2)
    and head count (column 4) of every row except the first and last are
    written with ``write_table`` under the name ``<year>-<district>``, where
    the district has its last character removed.

    Category values "理" and "文" are expanded to "理工" and "文史"; any
    other value is kept unchanged.
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # An earlier approach crawled the paginated article listing page by page
    # and needed heavy post-processing; the official query form is used
    # instead.
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    years = [
        option.string
        for option in main_page_soup.find("select", id="nf").find_all("option")
    ]
    districts = [
        option.string
        for option in main_page_soup.find("select",
                                          id="sf").find_all("option")[1:]
    ]
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    category_names = {"理": "理工", "文": "文史"}
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the 54x22 query button.
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            all_lines = [
                [str(td.string).strip() for td in tr if td.string != "\n"]
                for tr in return_soup.find("div", id="fybt").find_all("tr")
            ]
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            # Skip the first and the last row of the result table.
            table_content = [
                [line[0], category_names.get(line[2], line[2]), line[4]]
                for line in all_lines[1:-1]
            ]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + district + "的招生计划已存入文件")

示例#7

0

显示文件

文件： GetPlanInfo.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_plan_info_fudan():
    """Query the Fudan enrollment-plan form for every year/district pair.

    The selectable years (``select#nf``) and districts (``select#ss``) are
    read from the query page.  For each combination the search form is
    POSTed and all rows of every ``div.inquirytable_result`` table are
    collected.  Combinations yielding fewer than three rows are treated as
    empty and skipped.

    For years before 2013 all rows belong to the main campus.  From 2013 on
    the rows are split at the first later row whose first cell is
    "专业名称" (a second table header): rows before it go to the main
    campus, rows from it on to the medical college.  The first and last row
    of each part and the "文史汇总"/"理工汇总" rows are dropped; the head
    count is taken from column 5 for district "上海" and from column 3
    otherwise.

    The main-campus table is always written with ``write_table``; the
    medical-college table is written only when it has rows.
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path_benbu = "Information/九校联盟/复旦大学/招生计划"
    file_path_yixue = "Information/九校联盟/复旦大学上海医学部/招生计划"
    summary_labels = ("文史汇总", "理工汇总")

    def extract_rows(lines, count_column):
        """Drop first/last and summary rows; keep name, category, count."""
        return [
            [line[0], line[1], line[count_column]]
            for line in lines[1:-1]
            if line[0] not in summary_labels
        ]

    # Read the selectable years and districts from the query page.
    main_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.html"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    years = [
        option.string
        for option in main_page_soup.find("select", id="nf").find_all("option")
    ]
    districts = [
        option.string
        for option in main_page_soup.find("select", id="ss").find_all("option")
    ]
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.action"
    for year in years:
        for district in districts:
            params = {"lb": "plan", "nf": year, "ss": district}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            all_lines = []
            for div in return_soup.find_all("div",
                                            class_="inquirytable_result"):
                for tr in div.find_all("tr"):
                    all_lines.append([
                        str(td.string).strip() for td in tr
                        if td.string != "\n"
                    ])
            table_name = year + "-" + district
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # Fewer than three rows means the query returned no data.
            if len(all_lines) < 3:
                continue
            # The Shanghai tables carry the head count in a different column.
            count_column = 5 if district == "上海" else 3
            # From 2013 on the main campus and the medical college are
            # listed separately; find where the second table starts.
            split_at = 0
            if int(year) >= 2013:
                for i_line in range(1, len(all_lines)):
                    if all_lines[i_line][0] == "专业名称":
                        split_at = i_line
                        break
            if split_at == 0:
                all_lines_benbu = all_lines
                all_lines_yixue = []
            else:
                all_lines_benbu = all_lines[:split_at]
                all_lines_yixue = all_lines[split_at:]
            table_content_benbu = extract_rows(all_lines_benbu, count_column)
            table_content_yixue = extract_rows(all_lines_yixue, count_column)
            mylogger.debug("本部招生计划：")
            for line in table_content_benbu:
                mylogger.debug(str(line))
            mylogger.debug("医学院招生计划：")
            for line in table_content_yixue:
                mylogger.debug(str(line))
            write_table(file_path_benbu, table_name, table_head,
                        table_content_benbu)
            mylogger.info("本部" + year + district + "的招生计划已存入文件")
            if len(table_content_yixue) != 0:
                write_table(file_path_yixue, table_name, table_head,
                            table_content_yixue)
                mylogger.info("医学院" + year + district + "的招生计划已存入文件")

示例#8

0

显示文件

文件： GetFrequentQuestion.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

def get_question_yggk():
    """Crawl the consultation forums of all 985/211 universities in batches.

    University records are loaded from ``Information/大学/university_info``;
    those whose ``"院校特性"`` mentions "985" or "211" and whose
    ``"forum_id"`` is non-empty are processed.  For each one the first forum
    page is fetched to read the total page count and the number of pinned
    questions.  If that fails the university is logged and skipped.

    Otherwise ``Information/大学/Test/<name>常用问题集.csv`` is created with
    a header row, and the pages are processed in batches of at most ten:
    one ``DownloadPageInfo`` thread per page plus one ``SavePageInfo``
    thread that receives the CSV writer and the shared queue.  After the
    download threads of a batch have finished, ``-1`` is put on the queue
    and the saver thread is joined.
    NOTE(review): ``-1`` is presumably the stop marker understood by
    ``SavePageInfo`` — confirm against that class.
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/大学/Test"
    # Number of download threads started per batch.
    batch_size = 10
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    university_formid = [
        [info["院校名称"], info["forum_id"]]
        for info in university_infos
        if ("985" in info["院校特性"] or "211" in info["院校特性"])
        and info["forum_id"] != ""
    ]
    function_logger.info("共有%d所985、211大学" % len(university_formid))
    for university in university_formid:
        begin = time.time()
        function_logger.info("开始抓取" + university[0] + "的招生问题数据...")
        main_page_url = "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-" + university[
            1] + ",start-0.dhtml"
        try:
            main_page_source = request_url(main_page_url)
            main_page_source.encoding = main_page_source.apparent_encoding
            main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
            # The pager either contains an ellipsis item (the last page
            # number follows it) or lists every page (the last page number
            # is the second-to-last item).
            ellipsis_item = main_page_soup.find("li", class_="lip dot")
            if ellipsis_item:
                page_count = int(ellipsis_item.next_sibling.a.string)
            else:
                page_count = int(
                    main_page_soup.find(
                        "ul",
                        class_="ch-page clearfix").find_all("li")[-2].a.string)
            # Number of pinned questions shown at the top of the listing.
            top_question_count = len(
                main_page_soup.find("table",
                                    class_="ch-table zx-table").find_all(
                                        "span", class_="question_top_txt"))
            function_logger.debug("页面总数：%d 置顶问题个数：%d" %
                                  (page_count, top_question_count))
        except Exception as e:
            # The consultation page has no data for this university.
            function_logger.error("%s咨询界面没有数据，页面链接为：%s" %
                                  (university[0], main_page_url))
            function_logger.error("错误信息：%s" % e)
            continue
        # Create the school's CSV and write the header row.  The with-block
        # guarantees the file is closed even if a batch raises.
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + university[0] + "常用问题集.csv",
                  "w",
                  newline="",
                  encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
            record_queue = Queue()
            # Walk page ids 0..page_count-1 in batches.  The upper bound is
            # clamped on every batch (including the first), so no threads
            # are started for pages beyond the last one.
            for start_index in range(0, page_count, batch_size):
                end_index = min(start_index + batch_size, page_count)
                download_threads = [
                    DownloadPageInfo(university[1], page_id, page_count,
                                     top_question_count, record_queue)
                    for page_id in range(start_index, end_index)
                ]
                save_thread = SavePageInfo(record_queue, writer)
                for downloader in download_threads:
                    downloader.start()
                save_thread.start()
                for downloader in download_threads:
                    downloader.join()
                # All pages of this batch are queued; signal the saver.
                record_queue.put(-1)
                save_thread.join()
        function_logger.info("抓取%s的信息用时：%ds" %
                             (university[0], time.time() - begin))

示例#9

0

显示文件

文件： GetFrequentQuestion.py 项目： zhangyuankai2018/Knowledge-Map-and-Question-Answer

    def get_page_info(self):
        """Fetch one forum listing page and return its Q&A records.

        The page URL is built from ``self.university_id`` and
        ``self.page_id`` (15 questions per page).  Rows of the
        ``ch-table zx-table`` table are consumed two at a time: a header row
        (title, asker, time) followed by a row holding the question and
        answer text.  Texts truncated with a "[详细]" link are completed via
        ``self.get_question_text`` / ``self.get_answer_text``.  Pinned
        questions (``self.top_question_count`` of them) are only recorded
        when ``self.page_id`` is 0.

        In every field ASCII commas are replaced by full-width commas; the
        question and answer texts additionally have whitespace, the reply
        marker and a few private-use glyphs removed.

        Returns:
            A list of ``[title, asker, time, question, answer]`` lists, or
            an empty list if anything fails (the error is logged on
            ``self.thread_logger``).
        """
        main_url = "https://gaokao.chsi.com.cn"
        page_question_count = 15
        # Substrings removed from question/answer text.  Each token is a
        # separate element so "\u3000" and "[" are stripped individually.
        noise_tokens = (
            "回复", "\n", "\r", "\t", "\xa0", "\ue63c", "\ue5e5",
            "\u3000", "[", "]", " ",
        )

        def clean_text(text):
            """Strip noise tokens and convert ASCII commas to full-width."""
            for token in noise_tokens:
                text = text.replace(token, "")
            # str.replace returns a new string, so the result must be kept.
            return text.replace(",", "，")

        page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" + self.university_id + ",start-" + str(
            self.page_id * page_question_count) + ".dhtml"
        self.thread_logger.info("页面抓取进度(%d,%d)" %
                                (self.page_id + 1, self.page_count))
        self.thread_logger.info("页面url %s" % page_url)
        try:
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            # All child nodes of the consultation table, minus the bare
            # newline strings between rows.  A filtered copy is built rather
            # than removing from the list while iterating over it.
            table = page_soup.find("table", class_="ch-table zx-table")
            tr_list = [node for node in table.contents if node != "\n"]
            # Pinned Q&A pairs (two rows each) are recorded on page 0 only.
            if self.page_id == 0:
                start_index = 0
            else:
                start_index = self.top_question_count * 2
            page_infos = []
            for i_qa_pair in range(start_index, len(tr_list), 2):
                header_row = tr_list[i_qa_pair]
                question_title = str(header_row.find(
                    "a",
                    class_="question_t_txt").string).strip().replace(",", "，")
                question_from = str(header_row.find(
                    "i", title="提问人").next_sibling.string).strip().replace(
                        ",", "，")
                question_time = str(header_row.find(
                    "td",
                    class_="question_t ch-table-center").text).strip().replace(
                        ",", "，")

                detail_row = tr_list[i_qa_pair + 1]
                # Long questions/answers are truncated on the listing page
                # and must be read from the linked detail page.
                question_text_class = detail_row.find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find(
                        "a", text='[详细]')["href"]
                    question_text = self.get_question_text(turn_page_url)
                question_text = clean_text(question_text)

                answer_text_class = detail_row.find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    answer_text = self.get_answer_text(turn_page_url)
                answer_text = clean_text(answer_text)

                page_infos.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            return page_infos

        except Exception as e:
            self.thread_logger.error("错误信息%s" % e)
            return []