def get_wkb_search_res(self, k: str, default_sort_value: int=None) -> dict:
    """
    Search the 网课帮 (wangkebang.cn) question bank for ``k``.

    The page is queried with a form POST and yields at most one
    question/answer pair: the first matched span is treated as the question
    and the second one as the answer.

    :param k: search keyword
    :param default_sort_value: value echoed back under ``page_num``; used to
        order single-result sources
    :return: ``{'k': ..., 'page_num': ..., 'res': [...]}`` where ``res`` holds
        zero or one ``{'question_desc', 'answer'}`` dict
    :raises AssertionError: if the response body is empty or no result span
        is found
    """
    headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=False,)
    headers.update({
        'Proxy-Connection': 'keep-alive',
        'Origin': 'http://wangkebang.cn',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://wangkebang.cn/m/',
    })
    data = {
        'w': k,
    }
    body = Requests.get_url_body(
        method='post',
        url='http://wangkebang.cn/m/',
        headers=headers,
        data=data,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=self.req_num_retries,
        timeout=self.req_timeout,)
    assert body != ''

    # Only a single answer is returned by this source.
    question_item_sel = {
        'method': 'css',
        'selector': 'div.layui-card-body span',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,)
    assert question_item != []

    strong_sel = {
        'method': 'css',
        'selector': 'span strong',
    }
    # (result key, label pattern to strip) for the 1st and 2nd span.
    # Raw strings keep the exact pattern text without an invalid '\:' escape.
    field_specs = (
        ('question_desc', r'题目\:'),
        ('answer', r'答案\:'),
    )

    # Holds the question and the answer of the single result.
    one_res = {}
    # zip() stops after the second span, so any extra spans are ignored.
    for (field_name, label_pattern), item in zip(field_specs, question_item):
        try:
            field_div = parse_field(
                parser=strong_sel,
                target_obj=item,
                logger=self.lg,)
            assert field_div != ''
            # Strip the wrapping markup and the leading label.
            one_res[field_name] = fix_text(wash_sensitive_info(
                data=field_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<span .*?>',
                    '</span>',
                    '<strong>',
                    '</strong>',
                    label_pattern,
                ],
                is_default_filter=False,
                is_lower=False,))
        except Exception:
            # A field that cannot be parsed is simply left out of one_res.
            continue

    res = []
    # Emit a result only when both parts were extracted; previously a missing
    # part surfaced as a KeyError here.
    if 'question_desc' in one_res and 'answer' in one_res:
        ask_questions_result_item = AskQuestionsResultItem()
        ask_questions_result_item['question_desc'] = one_res['question_desc']
        ask_questions_result_item['answer'] = one_res['answer']
        res.append(dict(ask_questions_result_item))

    self.lg.info('[{}] wkb, k: {}'.format(
        '+' if res != [] else '-',
        k,
    ))

    return {
        'k': k,
        'page_num': default_sort_value,     # used to order single-result sources
        'res': res,
    }
def get_finer_search_res(self, k: str, page_num: int) -> dict:
    """
    Query the 凡尔搜题 (finerit.com) question bank and collect one result page.

    :param k: search keyword
    :param page_num: page index, starting from 0
    :return: dict with ``k``, ``page_num`` and ``res`` (a list of
        ``{'question_desc', 'answer'}`` dicts)
    :raises AssertionError: if the response body is empty or it contains no
        result item
    """
    # Selector for one result card, and for the two parts inside a card.
    card_sel = {
        'method': 'css',
        'selector': 'div.resultItem',
    }
    head_sel = {
        'method': 'css',
        'selector': 'div.itemHead a',
    }
    body_sel = {
        'method': 'css',
        'selector': 'div.itemBody',
    }

    request_headers = get_random_headers(cache_control='')
    request_headers.update({
        'Referer': 'https://www.finerit.com/',
    })
    query = (
        ('q', k),
        ('p', str(page_num)),
    )
    # todo: the site is possibly used by other people too and occasionally
    #  does not respond
    body = Requests.get_url_body(
        url='https://www.finerit.com/tiku/search/',
        headers=request_headers,
        params=query,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=self.req_num_retries,
        # testing showed 10s is fairly fast with an acceptable success rate
        timeout=self.req_timeout,
    )
    assert body != ''

    cards = parse_field(
        parser=card_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert cards != []

    res = []
    # Cards are processed in the order they were returned.
    for card in cards:
        try:
            raw_question = parse_field(
                parser=head_sel,
                target_obj=card,
                logger=self.lg,
            )
            assert raw_question != ''
            raw_answer = parse_field(
                parser=body_sel,
                target_obj=card,
                logger=self.lg,
            )
            assert raw_answer != ''

            # Strip the surrounding markup from both parts.
            question_desc = fix_text(wash_sensitive_info(
                data=raw_question,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<div class=\"itemHead\">',
                    '</div>',
                    '<a .*?>',
                    '</a>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=raw_answer,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<div class=\"itemBody\">',
                    '</div>',
                    '<p .*?>',
                    '</p>',
                    '答案:',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # Skip any card that cannot be parsed or cleaned.
            continue
        else:
            record = AskQuestionsResultItem()
            record['question_desc'] = question_desc
            record['answer'] = answer
            res.append(dict(record))

    status_mark = '+' if res != [] else '-'
    self.lg.info('[{}] k: {}, page_num: {}'.format(status_mark, k, page_num))

    return {
        'k': k,
        'page_num': page_num,
        'res': res,
    }
def get_lyoo_search_res(self, k: str, default_sort_value: int=None) -> dict:
    """
    Query the 大学僧搜题 (souti.lyoo.xyz) question bank for ``k``.

    :param k: search keyword
    :param default_sort_value: value echoed back under ``page_num``; used to
        order single-result sources
    :return: dict with ``k``, ``page_num`` and ``res`` (a one-element list
        holding a ``{'question_desc', 'answer'}`` dict)
    :raises AssertionError: if the response body is empty, or the question
        or the answer cannot be extracted from it
    """
    # This source is also based on 凡尔搜题; since that interface is currently
    # unreachable, the backup question bank is read instead.
    question_sel = {
        'method': 're',
        'selector': '<br /> 问题:(.*?)答案:',
    }
    answer_sel = {
        'method': 're',
        'selector': '答案:(.*?)</a></span>',
    }

    request_headers = get_random_headers(
        connection_status_keep_alive=False,
    )
    extra_headers = {
        'Proxy-Connection': 'keep-alive',
        'Origin': 'http://souti.lyoo.xyz',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://souti.lyoo.xyz/',
    }
    request_headers.update(extra_headers)

    form = {
        'w': k,
    }
    body = Requests.get_url_body(
        method='post',
        url='http://souti.lyoo.xyz/',
        headers=request_headers,
        data=form,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=self.req_num_retries,
        timeout=self.req_timeout)
    assert body != ''

    question_text = parse_field(
        parser=question_sel,
        target_obj=body,
        logger=self.lg,
    )
    assert question_text != ''
    answer_text = parse_field(
        parser=answer_sel,
        target_obj=body,
        logger=self.lg,
    )
    assert answer_text != ''

    record = AskQuestionsResultItem()
    record['question_desc'] = question_text
    record['answer'] = answer_text
    res = [dict(record)]

    status_mark = '+' if res != [] else '-'
    self.lg.info('[{}] lyoo, k: {}'.format(status_mark, k))

    return {
        'k': k,
        'page_num': default_sort_value,     # used to order single-result sources
        'res': res,
    }
def get_xms_search_res(self, k: str, default_sort_value: int=None) -> dict:
    """
    Query the 小马搜题 (so.xiaomasou.com) JSON API for ``k``.

    :param k: search keyword
    :param default_sort_value: value echoed back under ``page_num``; used to
        order single-result sources
    :return: dict with ``k``, ``page_num`` and ``res`` (a list of
        ``{'question_desc', 'answer'}`` dicts)
    :raises AssertionError: if the response body is empty or the
        ``data.qaList`` list is empty/missing
    """
    request_headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='')
    request_headers.update({
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://so.xiaomasou.com/static/index.html',
    })
    query = (
        ('question', k),
    )
    body = Requests.get_url_body(
        url='http://so.xiaomasou.com/api/question',
        headers=request_headers,
        params=query,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.req_num_retries,
        proxy_type=PROXY_TYPE_HTTPS,
        timeout=self.req_timeout)
    assert body != ''

    # Walk the decoded payload step by step: payload -> 'data' -> 'qaList'.
    payload = json_2_dict(
        json_str=body,
        default_res={},
        logger=self.lg,)
    data_section = payload.get('data', {})
    qa_list = data_section.get('qaList', [])
    assert qa_list != []

    res = []
    for qa in qa_list:
        try:
            question = qa.get('q', '')
            assert question != ''
            answer = qa.get('a', '')
            assert answer != ''
        except AssertionError:
            # Entries lacking a question or an answer are skipped.
            continue
        else:
            record = AskQuestionsResultItem()
            record['question_desc'] = question
            record['answer'] = answer
            res.append(dict(record))

    status_mark = '+' if res != [] else '-'
    self.lg.info('[{}] xms, k: {}'.format(status_mark, k))

    return {
        'k': k,
        'page_num': default_sort_value,     # used to order single-result sources
        'res': res,
    }
def get_stg_search_res(self, k: str, page_num: int, default_sort_value: int=None) -> dict:
    """
    Query the 搜题狗 (etkz.cn) forum search for ``k``.

    :param k: search keyword
    :param page_num: result page; a ``page`` query parameter is only sent
        when it is greater than 1
    :param default_sort_value: value echoed back under ``page_num``
    :return: dict with ``k``, ``page_num`` and ``res`` (a list of
        ``{'question_desc', 'answer'}`` dicts)
    :raises AssertionError: if the response body is empty or it contains no
        result item
    """
    # Selector for one thread entry, plus the patterns for its two parts.
    thread_sel = {
        'method': 'css',
        'selector': 'div#threadlist ul li',
    }
    question_sel = {
        'method': 're',
        'selector': '问题:(.*?)答案:',
    }
    answer_sel = {
        'method': 're',
        'selector': '答案:(.*?)更多相关问题',
    }

    request_headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=False,)
    request_headers.update({
        'Proxy-Connection': 'keep-alive',
    })

    query = [
        ('mod', 'forum'),
        # searchid follows no known rule and is unsolved for now: it has to
        # change together with k for the search to work
        ('searchid', '3'),
        ('orderby', 'lastpost'),
        ('ascdesc', 'desc'),
        ('searchsubmit', 'yes'),
        ('kw', k),
        ('mobile', '2'),
    ]
    if page_num > 1:
        query.append(('page', page_num))

    body = Requests.get_url_body(
        url='http://www.etkz.cn/search.php',
        headers=request_headers,
        params=query,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.req_num_retries,
        proxy_type=PROXY_TYPE_HTTPS,
        timeout=self.req_timeout,
    )
    assert body != ''

    threads = parse_field(
        parser=thread_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert threads != []

    res = []
    # Threads are processed in the order they were returned.
    for thread in threads:
        try:
            raw_question = parse_field(
                parser=question_sel,
                target_obj=thread,
                logger=self.lg,
            )
            assert raw_question != ''
            raw_answer = parse_field(
                parser=answer_sel,
                target_obj=thread,
                logger=self.lg,
            )
            assert raw_answer != ''

            # Strip the highlighting markup from both parts.
            question_desc = fix_text(wash_sensitive_info(
                data=raw_question,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=raw_answer,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # Skip any thread that cannot be parsed or cleaned.
            continue
        else:
            record = AskQuestionsResultItem()
            record['question_desc'] = question_desc
            record['answer'] = answer
            res.append(dict(record))

    status_mark = '+' if res != [] else '-'
    self.lg.info('[{}] k: {}, page_num: {}'.format(status_mark, k, page_num))

    return {
        'k': k,
        # NOTE(review): 'page_num' carries default_sort_value here, not the
        # page_num argument -- confirm this is intended.
        'page_num': default_sort_value,
        'res': res,
    }
def get_stg_search_res2(self, k: str, default_sort_value: int=None) -> dict:
    """
    Query the 搜题狗 (etkz.cn) forum search for ``k`` through a browser driver.

    The search form is filled in and submitted by the driver; only the first
    page of results is read.

    :param k: search keyword typed into the search box
    :param default_sort_value: value echoed back under ``page_num``
    :return: dict with ``k``, ``page_num`` and ``res`` (a list of
        ``{'question_desc', 'answer'}`` dicts)
    :raises AssertionError: if a fetched page is empty or the result page
        contains no thread entry
    """
    # CSS selectors of the search input box and of the submit button.
    input_css_sel = 'input#scform_srchtxt'
    submit_btn_sel = 'button#scform_submit'

    driver = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        load_images=False,
        logger=self.lg,
        user_agent_type=PHONE,
        ip_pool_type=self.ip_pool_type,
    )
    try:
        body = driver.get_url_body(
            url='http://www.etkz.cn/search.php?mod=forum',
            css_selector=submit_btn_sel,
            timeout=20,)
        assert body != ''

        # Search for the caller's keyword (it used to be overwritten by a
        # hard-coded debug value).
        driver.find_element(value=input_css_sel).send_keys(k)
        driver.find_element(value=submit_btn_sel).click()
        # Give the result page time to load before reading it.
        sleep(5.)
        body = Requests._wash_html(driver.page_source)
        assert body != ''
    finally:
        # Drop the driver reference on every path, including failures.
        # NOTE(review): presumably BaseDriver releases the browser when it is
        # finalized -- confirm.
        del driver

    question_item_sel = {
        'method': 'css',
        'selector': 'div#threadlist ul li',
    }
    question_desc_div_sel = {
        'method': 're',
        'selector': '问题:(.*?)答案:',
    }
    answer_div_sel = {
        'method': 're',
        'selector': '答案:(.*?)更多相关问题',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert question_item != []

    res = []
    # Entries are processed in the order they were returned.
    for item in question_item:
        try:
            question_desc_div = parse_field(
                parser=question_desc_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert question_desc_div != ''
            answer_div = parse_field(
                parser=answer_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert answer_div != ''

            # Strip the highlighting markup from both parts.
            question_desc = fix_text(wash_sensitive_info(
                data=question_desc_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=answer_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # Skip any entry that cannot be parsed or cleaned.
            continue

        ask_questions_result_item = AskQuestionsResultItem()
        ask_questions_result_item['question_desc'] = question_desc
        ask_questions_result_item['answer'] = answer
        res.append(dict(ask_questions_result_item))

    self.lg.info('[{}] stg2, k: {}'.format(
        '+' if res != [] else '-',
        k,
    ))

    return {
        'k': k,
        'page_num': default_sort_value,
        'res': res,
    }
def get_xmtk_search_res(self, k: str, default_sort_value: int=None) -> dict:
    """
    Query the 熊猫题库 (lmcv.cn) question bank for ``k``.

    Only the first page of results can be fetched.

    :param k: search keyword
    :param default_sort_value: value echoed back under ``page_num``
    :return: dict with ``k``, ``page_num`` and ``res`` (a list of
        ``{'question_desc', 'answer'}`` dicts)
    :raises AssertionError: if the response body is empty or it contains no
        result item
    """
    # Selector for one article entry, plus the patterns for its two parts.
    article_sel = {
        'method': 'css',
        'selector': 'span.art-main',
    }
    question_sel = {
        'method': 're',
        'selector': '问题:(.*?)答案:',
    }
    answer_sel = {
        'method': 're',
        'selector': '答案:(.*?)更多相关问题',
    }

    request_headers = get_random_headers(
        connection_status_keep_alive=False,
        cache_control='',
    )
    request_headers.update({
        'Referer': 'http://www.lmcv.cn/',
    })
    query = (
        ('s', k),
    )
    # proxy_type is deliberately not passed: https is not supported, so the
    # request occasionally returns no data under high concurrency.
    body = Requests.get_url_body(
        url='http://www.lmcv.cn/',
        headers=request_headers,
        params=query,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.req_num_retries,
        timeout=self.req_timeout,)
    assert body != ''

    articles = parse_field(
        parser=article_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert articles != []

    res = []
    # Articles are processed in the order they were returned.
    for article in articles:
        try:
            raw_question = parse_field(
                parser=question_sel,
                target_obj=article,
                logger=self.lg,
            )
            assert raw_question != ''
            raw_answer = parse_field(
                parser=answer_sel,
                target_obj=article,
                logger=self.lg,
            )
            assert raw_answer != ''

            # Strip the highlighting markup from both parts.
            question_desc = fix_text(wash_sensitive_info(
                data=raw_question,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=raw_answer,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # Skip any article that cannot be parsed or cleaned.
            continue
        else:
            record = AskQuestionsResultItem()
            record['question_desc'] = question_desc
            record['answer'] = answer
            res.append(dict(record))

    status_mark = '+' if res != [] else '-'
    self.lg.info('[{}] xmtk, k: {}'.format(status_mark, k))

    return {
        'k': k,
        'page_num': default_sort_value,
        'res': res,
    }