예제 #1
0
    def get_wkb_search_res(self, k: str, default_sort_value: int=None) -> dict:
        """
        Search the wangkebang.cn ("网课帮") question bank for a keyword.

        The keyword is sent as a form POST. In the returned page the first
        ``div.layui-card-body span`` is treated as the question and the second
        one as the answer; any further spans are ignored, so at most one
        question/answer pair is produced.

        :param k: search keyword, sent as the ``w`` form field
        :param default_sort_value: value echoed back under ``page_num`` so the
            caller can order results coming from single-result sources
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``.
            ``res`` contains one dict (built from ``AskQuestionsResultItem``)
            when both the question and the answer were extracted, and is
            empty otherwise
        :raises AssertionError: if the response body is empty or no result
            span is found in it
        """
        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,)
        headers.update({
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://wangkebang.cn',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://wangkebang.cn/m/',
        })

        data = {
            'w': k,
        }
        body = Requests.get_url_body(
            method='post',
            url='http://wangkebang.cn/m/',
            headers=headers,
            data=data,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,
            timeout=self.req_timeout,)
        assert body != ''

        # The page carries a single answer only.
        question_item_sel = {
            'method': 'css',
            'selector': 'div.layui-card-body span',
        }
        question_item = parse_field(
            parser=question_item_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        assert question_item != []

        # Question and answer text are both wrapped in the same markup.
        strong_sel = {
            'method': 'css',
            'selector': 'span strong',
        }
        # (result key, label pattern to strip) for span 0 and span 1.
        # Raw strings keep the backslash of the original patterns without
        # relying on an invalid escape sequence.
        wanted_fields = (
            ('question_desc', r'题目\:'),
            ('answer', r'答案\:'),
        )

        # Holds the question and the answer of the single result.
        one_res = {}
        for (field_name, label_pattern), item in zip(wanted_fields, question_item):
            try:
                raw_text = parse_field(
                    parser=strong_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert raw_text != ''
                # Strip the wrapping tags and the leading label.
                one_res[field_name] = fix_text(wash_sensitive_info(
                    data=raw_text,
                    replace_str_list=[],
                    add_sensitive_str_list=[
                        '<span .*?>',
                        '</span>',
                        '<strong>',
                        '</strong>',
                        label_pattern,
                    ],
                    is_default_filter=False,
                    is_lower=False,
                ))
            except Exception:
                # Best effort: a field that cannot be extracted is left out
                # and the result is reported as empty below.
                continue

        res = []
        # Only emit a result when both halves were extracted; indexing
        # one_res unconditionally would raise KeyError on a partial parse.
        if 'question_desc' in one_res and 'answer' in one_res:
            ask_questions_result_item = AskQuestionsResultItem()
            ask_questions_result_item['question_desc'] = one_res['question_desc']
            ask_questions_result_item['answer'] = one_res['answer']
            res.append(dict(ask_questions_result_item))

        self.lg.info('[{}] wkb, k: {}'.format(
            '+' if res != [] else '-',
            k,
        ))

        return {
            'k': k,
            'page_num': default_sort_value,           # used to sort single-result sources
            'res': res,
        }
예제 #2
0
    def get_finer_search_res(self, k: str, page_num: int) -> dict:
        """
        Query the finerit.com ("凡尔搜题") question bank for one result page.

        :param k: search keyword
        :param page_num: page index, counted from 0
        :return: dict with the keyword under ``k``, the requested page under
            ``page_num`` and the extracted question/answer dicts under ``res``
        :raises AssertionError: if the response body is empty or contains no
            ``div.resultItem`` element
        """
        def _scrub(raw, patterns):
            # Remove the given patterns, then hand the text to fix_text.
            return fix_text(wash_sensitive_info(
                data=raw,
                replace_str_list=[],
                add_sensitive_str_list=patterns,
                is_default_filter=False,
                is_lower=False,
            ))

        req_headers = get_random_headers(cache_control='')
        req_headers.update({'Referer': 'https://www.finerit.com/'})
        query = (
            ('q', k),
            ('p', str(page_num)),
        )
        # TODO: the site is possibly used by others as well and is
        # occasionally unresponsive.
        page_html = Requests.get_url_body(
            url='https://www.finerit.com/tiku/search/',
            headers=req_headers,
            params=query,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,
            # Testing showed 10s to be fast with an acceptable success rate.
            timeout=self.req_timeout,
        )
        assert page_html != ''

        item_sel = dict(method='css', selector='div.resultItem')
        head_sel = dict(method='css', selector='div.itemHead a')
        body_sel = dict(method='css', selector='div.itemBody')

        result_nodes = parse_field(
            parser=item_sel,
            target_obj=page_html,
            is_first=False,
            logger=self.lg,
        )
        assert result_nodes != []

        res = []
        # Nodes are processed in page order.
        for node in result_nodes:
            try:
                raw_question = parse_field(
                    parser=head_sel, target_obj=node, logger=self.lg)
                assert raw_question != ''
                raw_answer = parse_field(
                    parser=body_sel, target_obj=node, logger=self.lg)
                assert raw_answer != ''
                cleaned_question = _scrub(raw_question, [
                    '<div class=\"itemHead\">',
                    '</div>',
                    '<a .*?>',
                    '</a>',
                    '<span .*?>',
                    '</span>',
                ])
                cleaned_answer = _scrub(raw_answer, [
                    '<div class=\"itemBody\">',
                    '</div>',
                    '<p .*?>',
                    '</p>',
                    '答案:',
                ])
            except Exception:
                # Skip any node that cannot be parsed or cleaned.
                continue
            else:
                record = AskQuestionsResultItem()
                record['question_desc'] = cleaned_question
                record['answer'] = cleaned_answer
                res.append(dict(record))

        marker = '-' if res == [] else '+'
        self.lg.info('[{}] k: {}, page_num: {}'.format(marker, k, page_num))

        return dict(k=k, page_num=page_num, res=res)
예제 #3
0
    def get_lyoo_search_res(self, k: str, default_sort_value: int=None) -> dict:
        """
        Look a keyword up on souti.lyoo.xyz ("大学僧搜题").

        The keyword is sent as a form POST and a single question/answer pair
        is pulled out of the response with two regular expressions.

        :param k: search keyword, sent as the ``w`` form field
        :param default_sort_value: value echoed back under ``page_num`` so the
            caller can order results coming from single-result sources
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [item]}``
        :raises AssertionError: if the body, the question or the answer is
            empty
        """
        req_headers = get_random_headers(
            connection_status_keep_alive=False,
        )
        req_headers.update({
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://souti.lyoo.xyz',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://souti.lyoo.xyz/',
        })
        form = dict(w=k)
        page_html = Requests.get_url_body(
            method='post',
            url='http://souti.lyoo.xyz/',
            headers=req_headers,
            data=form,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,
            timeout=self.req_timeout)
        assert page_html != ''

        # This site is also based on the finer search; since the finer
        # interface is currently unreachable, the fallback question bank
        # section of the page is read instead.
        question_re = dict(method='re', selector='<br /> 问题:(.*?)答案:')
        answer_re = dict(method='re', selector='答案:(.*?)</a></span>')

        question_text = parse_field(
            parser=question_re,
            target_obj=page_html,
            logger=self.lg,
        )
        assert question_text != ''
        answer_text = parse_field(
            parser=answer_re,
            target_obj=page_html,
            logger=self.lg,
        )
        assert answer_text != ''

        record = AskQuestionsResultItem()
        record['question_desc'] = question_text
        record['answer'] = answer_text
        res = [dict(record)]

        marker = '-' if res == [] else '+'
        self.lg.info('[{}] lyoo, k: {}'.format(marker, k))

        # page_num is only used to sort single-result sources.
        return dict(k=k, page_num=default_sort_value, res=res)
예제 #4
0
    def get_xms_search_res(self, k: str, default_sort_value: int=None) -> dict:
        """
        Query the so.xiaomasou.com ("小马搜题") JSON API for a keyword.

        The response is decoded and the entries under ``data -> qaList`` are
        turned into question/answer dicts; entries whose ``q`` or ``a`` value
        is an empty string are skipped.

        :param k: search keyword, sent as the ``question`` query parameter
        :param default_sort_value: value echoed back under ``page_num`` so the
            caller can order results coming from single-result sources
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
        :raises AssertionError: if the body is empty or ``qaList`` is empty
        """
        req_headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='')
        req_headers.update({
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://so.xiaomasou.com/static/index.html',
        })
        query = (('question', k),)
        raw_json = Requests.get_url_body(
            url='http://so.xiaomasou.com/api/question',
            headers=req_headers,
            params=query,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
            timeout=self.req_timeout)
        assert raw_json != ''

        payload = json_2_dict(
            json_str=raw_json,
            default_res={},
            logger=self.lg,)
        qa_list = payload.get('data', {}).get('qaList', [])
        assert qa_list != []

        res = []
        for entry in qa_list:
            try:
                question_text = entry.get('q', '')
                assert question_text != ''
                answer_text = entry.get('a', '')
                assert answer_text != ''
            except AssertionError:
                # Incomplete entry: leave it out.
                continue
            else:
                record = AskQuestionsResultItem()
                record['question_desc'] = question_text
                record['answer'] = answer_text
                res.append(dict(record))

        marker = '-' if res == [] else '+'
        self.lg.info('[{}] xms, k: {}'.format(marker, k))

        # page_num is only used to sort single-result sources.
        return dict(k=k, page_num=default_sort_value, res=res)
예제 #5
0
    def get_stg_search_res(self, k: str, page_num: int, default_sort_value: int=None) -> dict:
        """
        Search www.etkz.cn ("搜题狗") through its forum search endpoint.

        :param k: search keyword, sent as the ``kw`` query parameter
        :param page_num: result page; a ``page`` parameter is only added to
            the request when this is greater than 1
        :param default_sort_value: value returned under ``page_num`` in the
            result dict
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
        :raises AssertionError: if the body is empty or no list entry matches
            ``div#threadlist ul li``
        """
        def _scrub(raw, patterns):
            # Remove the given patterns, then hand the text to fix_text.
            return fix_text(wash_sensitive_info(
                data=raw,
                replace_str_list=[],
                add_sensitive_str_list=patterns,
                is_default_filter=False,
                is_lower=False,
            ))

        req_headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,)
        req_headers.update({'Proxy-Connection': 'keep-alive'})

        query = [
            ('mod', 'forum'),
            # searchid follows no known pattern and is unsolved for now: a
            # search only works when searchid changes together with k.
            ('searchid', '3'),
            ('orderby', 'lastpost'),
            ('ascdesc', 'desc'),
            ('searchsubmit', 'yes'),
            ('kw', k),
            ('mobile', '2'),
        ]
        if page_num > 1:
            query.append(('page', page_num))

        page_html = Requests.get_url_body(
            url='http://www.etkz.cn/search.php',
            headers=req_headers,
            params=query,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
            timeout=self.req_timeout,
        )
        assert page_html != ''

        item_sel = dict(method='css', selector='div#threadlist ul li')
        question_re = dict(method='re', selector='问题:(.*?)答案:')
        answer_re = dict(method='re', selector='答案:(.*?)更多相关问题')

        thread_nodes = parse_field(
            parser=item_sel,
            target_obj=page_html,
            is_first=False,
            logger=self.lg,
        )
        assert thread_nodes != []

        res = []
        # Nodes are processed in page order.
        for node in thread_nodes:
            try:
                raw_question = parse_field(
                    parser=question_re, target_obj=node, logger=self.lg)
                assert raw_question != ''
                raw_answer = parse_field(
                    parser=answer_re, target_obj=node, logger=self.lg)
                assert raw_answer != ''
                cleaned_question = _scrub(raw_question, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ])
                cleaned_answer = _scrub(raw_answer, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ])
            except Exception:
                # Skip any node that cannot be parsed or cleaned.
                continue
            else:
                record = AskQuestionsResultItem()
                record['question_desc'] = cleaned_question
                record['answer'] = cleaned_answer
                res.append(dict(record))

        marker = '-' if res == [] else '+'
        self.lg.info('[{}] k: {}, page_num: {}'.format(marker, k, page_num))

        return dict(k=k, page_num=default_sort_value, res=res)
예제 #6
0
    def get_stg_search_res2(self, k: str, default_sort_value: int=None) -> dict:
        """
        Search www.etkz.cn ("搜题狗") by driving a browser (driver variant).

        The forum search page is opened, the keyword is typed into the search
        box, the submit button is clicked and the resulting page source is
        parsed. Only the first page of results is read.

        :param k: search keyword typed into the search box
        :param default_sort_value: value returned under ``page_num`` in the
            result dict
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
        :raises AssertionError: if either page body is empty or no list entry
            matches ``div#threadlist ul li``
        """
        def _scrub(raw, patterns):
            # Remove the given patterns, then hand the text to fix_text.
            return fix_text(wash_sensitive_info(
                data=raw,
                replace_str_list=[],
                add_sensitive_str_list=patterns,
                is_default_filter=False,
                is_lower=False,
            ))

        # NOTE: the keyword passed by the caller is used as-is; a hard-coded
        # debugging keyword used to overwrite it here.
        driver = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH,
            load_images=False,
            logger=self.lg,
            user_agent_type=PHONE,
            ip_pool_type=self.ip_pool_type,
        )
        # Selectors of the search input box and of the submit button.
        input_css_sel = 'input#scform_srchtxt'
        submit_btn_sel = 'button#scform_submit'
        try:
            body = driver.get_url_body(
                url='http://www.etkz.cn/search.php?mod=forum',
                css_selector=submit_btn_sel,
                timeout=20,)
            assert body != ''
            driver.find_element(value=input_css_sel).send_keys(k)
            driver.find_element(value=submit_btn_sel).click()
            # Fixed pause after the click before reading the page source
            # (presumably to let the result page load).
            sleep(5.)
            body = Requests._wash_html(driver.page_source)
            assert body != ''
            # self.lg.info(body)
        finally:
            # Drop the driver reference on every path, including failures,
            # so it is not kept alive when an assertion or driver call raises.
            del driver

        question_item_sel = {
            'method': 'css',
            'selector': 'div#threadlist ul li',
        }
        question_desc_div_sel = {
            'method': 're',
            'selector': '问题:(.*?)答案:',
        }
        answer_div_sel = {
            'method': 're',
            'selector': '答案:(.*?)更多相关问题',
        }
        question_item = parse_field(
            parser=question_item_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        assert question_item != []

        res = []
        # Nodes are processed in page order.
        for item in question_item:
            try:
                question_desc_div = parse_field(
                    parser=question_desc_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert question_desc_div != ''
                answer_div = parse_field(
                    parser=answer_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert answer_div != ''
                # Strip markup left inside the matched text.
                question_desc = _scrub(question_desc_div, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ])
                answer = _scrub(answer_div, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ])
            except Exception:
                # Best effort: skip any entry that cannot be parsed or cleaned.
                continue

            ask_questions_result_item = AskQuestionsResultItem()
            ask_questions_result_item['question_desc'] = question_desc
            ask_questions_result_item['answer'] = answer
            res.append(dict(ask_questions_result_item))

        self.lg.info('[{}] stg2, k: {}'.format(
            '+' if res != [] else '-',
            k,
        ))

        return {
            'k': k,
            'page_num': default_sort_value,
            'res': res,
        }
예제 #7
0
    def get_xmtk_search_res(self, k: str, default_sort_value: int=None) -> dict:
        """
        Search www.lmcv.cn ("熊猫题库") for a keyword.

        Only the first page of results can be fetched.

        :param k: search keyword, sent as the ``s`` query parameter
        :param default_sort_value: value returned under ``page_num`` in the
            result dict
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
        :raises AssertionError: if the body is empty or no ``span.art-main``
            element is found
        """
        def _scrub(raw, patterns):
            # Remove the given patterns, then hand the text to fix_text.
            return fix_text(wash_sensitive_info(
                data=raw,
                replace_str_list=[],
                add_sensitive_str_list=patterns,
                is_default_filter=False,
                is_lower=False,
            ))

        req_headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        req_headers.update({'Referer': 'http://www.lmcv.cn/'})
        query = (('s', k),)
        # No proxy_type is passed: HTTPS is not supported here, which is why
        # high concurrency occasionally yields no data.
        page_html = Requests.get_url_body(
            url='http://www.lmcv.cn/',
            headers=req_headers,
            params=query,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            timeout=self.req_timeout,)
        assert page_html != ''

        item_sel = dict(method='css', selector='span.art-main')
        question_re = dict(method='re', selector='问题:(.*?)答案:')
        answer_re = dict(method='re', selector='答案:(.*?)更多相关问题')

        article_nodes = parse_field(
            parser=item_sel,
            target_obj=page_html,
            is_first=False,
            logger=self.lg,
        )
        assert article_nodes != []

        res = []
        # Nodes are processed in page order.
        for node in article_nodes:
            try:
                raw_question = parse_field(
                    parser=question_re, target_obj=node, logger=self.lg)
                assert raw_question != ''
                raw_answer = parse_field(
                    parser=answer_re, target_obj=node, logger=self.lg)
                assert raw_answer != ''
                cleaned_question = _scrub(raw_question, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ])
                cleaned_answer = _scrub(raw_answer, [
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ])
            except Exception:
                # Skip any node that cannot be parsed or cleaned.
                continue
            else:
                record = AskQuestionsResultItem()
                record['question_desc'] = cleaned_question
                record['answer'] = cleaned_answer
                res.append(dict(record))

        marker = '-' if res == [] else '+'
        self.lg.info('[{}] xmtk, k: {}'.format(marker, k))

        return dict(k=k, page_num=default_sort_value, res=res)