def get_question_yggk():
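    """Crawl the admission consultation Q&A pages on gaokao.chsi.com.cn for
    985/211 universities and write one CSV of questions and answers per
    university."""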
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Base URL of the university consultation pages
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    # (name, forum_id) pairs apparently already fetched; not used below
    already_get = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学", str(53593)]]
    university_forumid = []
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for info in university_infos:
        if "985" in info["院校特性"] or "211" in info["院校特性"]:
            if info["forum_id"] != "":
                university_forumid.append([info["院校名称"], info["forum_id"]])
    function_logger.info("found %d 985/211 universities in total" % len(university_forumid))
    for university in university_forumid:
        begin = time.time()
        function_logger.info("start fetching admission Q&A data for " + university[0] + " ...")
        main_page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" + \
            university[1] + ",start-0.dhtml"
        try:
            main_page_source = request_url(main_page_url)
            main_page_source.encoding = main_page_source.apparent_encoding
            # Use .text so the apparent_encoding set above actually takes effect
            main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
            # Total page count; the pager is parsed differently depending on
            # whether it contains an ellipsis item
            if main_page_soup.find("li", class_="lip dot"):
                page_count = main_page_soup.find(
                    "li", class_="lip dot").next_sibling.a.string
            else:
                page_count = main_page_soup.find(
                    "ul",
                    class_="ch-page clearfix").find_all("li")[-2].a.string
            # Number of pinned questions
            top_question_count = len(
                main_page_soup.find("table",
                                    class_="ch-table zx-table").find_all(
                                        "span", class_="question_top_txt"))
            function_logger.debug("total pages: %d, pinned questions: %d" %
                                  (int(page_count), int(top_question_count)))
        except Exception as e:
            # The consultation page has no data (true for three universities)
            function_logger.error("%s consultation page has no data, url: %s" %
                                  (university[0], main_page_url))
            function_logger.error("error: %s" % e)
            continue
        # Create the CSV for this university's question set and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        csvfile = open(file_path + "/" + university[0] + "常用问题集.csv",
                       "w",
                       newline="",
                       encoding='utf-8')
        writer = csv.writer(csvfile)
        writer.writerow(table_head)
        record_queue = Queue()
        # Process pages in batches: 10 download threads plus one saver thread per batch
        start_index = 0
        end_index = 10
        while start_index < int(page_count):
            # Clamp the batch end so no thread requests a page past the last one
            if end_index > int(page_count):
                end_index = int(page_count)
            dThread = [
                DownloadPageInfo(university[1], page_id, int(page_count),
                                 top_question_count, record_queue)
                for page_id in range(start_index, end_index)
            ]
            sThread = SavePageInfo(record_queue, writer)
            for d in dThread:
                d.start()
            sThread.start()
            for d in dThread:
                d.join()
            # Sentinel tells the saver thread that this batch is complete
            record_queue.put(-1)
            sThread.join()
            start_index += 10
            end_index += 10

        csvfile.close()
        function_logger.info("fetching %s took %ds" %
                             (university[0], time.time() - begin))


class DownloadPageInfo(Thread):
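    """Worker thread that downloads one listing page of the consultation forum,
    parses every question/answer pair on it, and puts the parsed records onto
    record_queue for the saver thread to write."""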
    def __init__(self, university_id, page_id, page_count, top_question_count,
                 record_queue):
        Thread.__init__(self)
        self.university_id = university_id
        self.page_id = page_id
        self.page_count = page_count
        self.top_question_count = top_question_count
        self.record_queue = record_queue
        # Use the thread's own name for the logger; threading.current_thread()
        # inside __init__ would still refer to the creating (main) thread
        self.thread_logger = MyLog(logger="thread-" + self.name).getlog()

    def get_page_info(self):
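        """Fetch this thread's listing page and return a list of
        [title, source, time, question, answer] records; returns [] on error."""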
        main_url = "https://gaokao.chsi.com.cn"
        # Each listing page shows 15 question/answer entries
        page_question_count = 15
        page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" + self.university_id + ",start-" + str(
            self.page_id * page_question_count) + ".dhtml"
        self.thread_logger.info("page progress (%d/%d)" %
                                (self.page_id + 1, self.page_count))
        self.thread_logger.info("page url %s" % page_url)
        try:
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            # All child rows of the consultation table (each Q&A pair spans two rows)
            tr_list = page_soup.find("table",
                                     class_="ch-table zx-table").contents
            # Drop blank text nodes; removing items while iterating the same
            # list skips consecutive "\n" entries, so build a new list instead
            tr_list = [item for item in tr_list if item != "\n"]
            # Pinned Q&A pairs appear on every page; record them only on page 0
            if self.page_id == 0:
                start_index = 0
            else:
                start_index = self.top_question_count * 2
            page_infos = []
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = str(tr_list[i_qa_pair].find(
                    "a",
                    class_="question_t_txt").string).strip().replace(",", ",")
                # self.thread_logger.debug("标题:%s" % question_title)
                question_from = str(tr_list[i_qa_pair].find(
                    "i", title="提问人").next_sibling.string).strip().replace(
                        ",", ",")
                # self.thread_logger.debug("来源:%s" % question_from)
                question_time = str(tr_list[i_qa_pair].find(
                    "td",
                    class_="question_t ch-table-center").text).strip().replace(
                        ",", ",")
                # self.thread_logger.debug("时间:%s" % question_time)
                # Question/answer text may be truncated on the listing page;
                # follow the "[详细]" link to fetch the full text
                question_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find(
                        "a", text='[详细]')["href"]
                    question_text = self.get_question_text(turn_page_url)
                # Strip reply markers, whitespace and private-use characters
                replace_str = [
                    "回复", "\n", "\r", "\t", "\xa0", "\ue63c", "\ue5e5",
                    "\u3000", "[", "]", " "
                ]
                for r_str in replace_str:
                    question_text = question_text.replace(r_str, "")
                # Normalize ASCII commas to full-width, as done for the other fields
                question_text = question_text.replace(",", ",")
                # self.thread_logger.debug("问题:%s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    answer_text = self.get_answer_text(turn_page_url)
                replace_str = [
                    "回复", "\n", "\r", "\t", "\xa0", "\ue63c", "\ue5e5",
                    "\u3000", "[", "]", " "
                ]
                for r_str in replace_str:
                    answer_text = answer_text.replace(r_str, "")
                answer_text = answer_text.replace(",", ",")
                # self.thread_logger.debug("回答:%s" % answer_text)
                page_infos.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            return page_infos

        except Exception as e:
            self.thread_logger.error("error: %s" % e)
            return []

    def get_question_text(self, turn_page_url):
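        """Follow a "[详细]" link and return the full question text, or "" on error."""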
        try:
            turn_page_source = request_url(turn_page_url)
            turn_page_source.encoding = turn_page_source.apparent_encoding
            turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
            question_text = str(
                turn_page_soup.find("div", class_="question").text).strip()
            return question_text
        except Exception as e:
            self.thread_logger.error("failed to fetch question %s, reason: %s" % (turn_page_url, e))
            return ""

    def get_answer_text(self, turn_page_url):
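        """Follow a "[详细]" link and return the full answer text, or "" on error."""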
        try:
            turn_page_source = request_url(turn_page_url)
            turn_page_source.encoding = turn_page_source.apparent_encoding
            turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
            answer_text = str(
                turn_page_soup.find("div", class_="question_a").text).strip()
            return answer_text
        except Exception as e:
            self.thread_logger.error("failed to fetch answer %s, reason: %s" % (turn_page_url, e))
            return ""

    def run(self):
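        """Download this page's records and enqueue them; empty results are not enqueued."""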
        page_record = self.get_page_info()
        if page_record:
            self.record_queue.put(page_record)