def login_bg(self, driver: BaseDriver):
    """
    Log in to the publish backend.
    :return:
    """
    self.lg.info('login ...')
    body = driver.get_url_body(
        url=self.publish_url,
        timeout=30,
    )
    try:
        assert body != ''
        driver.find_element(value='input#loginName').send_keys(
            self.yx_username)
        driver.find_element(value='input#loginPwd').send_keys(
            self.yx_password)
        driver.find_element(value='button#subbut').click()
    except (
            NoSuchElementException,
            SeleniumTimeoutException,
            AssertionError,
            WebDriverException,
            AttributeError,
    ):
        # Raise a login failure exception
        raise LoginFailException

    try:
        self.wait_for_recommend_good_label_appear(driver=driver)
    except FZTimeoutError:
        # Failed to enter the target page, raise an exception!
        raise EnterTargetPageFailException
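# Example usage (a sketch, not from the original code): assumes `publisher` is a
# hypothetical instance of the class defining login_bg, with publish_url,
# yx_username and yx_password already set, and that the driver binary exists at
# PHANTOMJS_DRIVER_PATH.
#   driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
#   try:
#       publisher.login_bg(driver=driver)
#   except (LoginFailException, EnterTargetPageFailException):
#       pass  # handle login / navigation failure here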
def _test(self):
    driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
    url = 'https://httpbin.org/get'
    body = driver.get_url_body(url=url)
    # lg.info(str(body))
    try:
        data = json_2_dict(
            re.compile('<pre.*?>(.*)</pre>').findall(body)[0],
            default_res={})
    except IndexError:
        return {}

    del driver

    return data
async def _get_html_by_driver(self, url, load_images=False):
    '''
    Fetch the page body via a driver (async wrapper).
    :return:
    '''
    driver = BaseDriver(
        executable_path=self.driver_path,
        ip_pool_type=self.ip_pool_type,
        load_images=load_images)
    body = driver.get_url_body(url=url)
    # self.lg.info(body)
    try:
        del driver
    except Exception:
        pass
    collect()

    return body
def test_driver_change_proxy():
    """
    Test switching the proxy dynamically with Firefox.
    :return:
    """
    d = BaseDriver(
        # works
        type=PHANTOMJS,
        executable_path=PHANTOMJS_DRIVER_PATH,
        # type=FIREFOX,
        # executable_path=FIREFOX_DRIVER_PATH,
        # does not work
        # type=CHROME,
        # executable_path=CHROME_DRIVER_PATH,
        headless=True,
        driver_use_proxy=True,
        ip_pool_type=tri_ip_pool,
    )
    origin_ip_sel = {'method': 're', 'selector': '\"origin\": \"(.*?)\",'}
    url = 'https://httpbin.org/get'
    # url = 'https://www.baidu.com'
    for index in range(0, 5):
        body = d.get_url_body(
            url=url,
            timeout=20,
            change_proxy=True,
            change_user_agent=True,
        )
        if 'httpbin' in url:
            origin_ip = parse_field(
                parser=origin_ip_sel,
                target_obj=body,
            )
            print('origin_ip: {}'.format(origin_ip))
        else:
            print(body)

    try:
        del d
    except Exception:
        pass
def test_driver(
        _type=CHROME,
        headless=True,
        driver_use_proxy=True,
        url: str = 'https://httpbin.org/get',
) -> str:
    if _type == CHROME:
        executable_path = CHROME_DRIVER_PATH
    elif _type == FIREFOX:
        executable_path = FIREFOX_DRIVER_PATH
    elif _type == PHANTOMJS:
        executable_path = PHANTOMJS_DRIVER_PATH
    else:
        raise ValueError('Unexpected _type value!')

    print('driver_type: {}, executable_path: {}, driver_use_proxy: {}'.format(
        _type, executable_path, driver_use_proxy))
    print('url: {}'.format(url))
    d = BaseDriver(
        type=_type,
        executable_path=executable_path,
        headless=headless,
        driver_use_proxy=driver_use_proxy,
        ip_pool_type=tri_ip_pool,
    )
    body = d.get_url_body(
        url=url,
        timeout=30,
    )
    print(body)
    try:
        del d
    except Exception:
        pass

    return body
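# Example usage (a sketch, not from the original code): assumes the matching
# driver binary exists at PHANTOMJS_DRIVER_PATH and that tri_ip_pool is
# reachable; pass driver_use_proxy=False to run without the proxy pool.
#   body = test_driver(
#       _type=PHANTOMJS,
#       headless=True,
#       driver_use_proxy=False,
#       url='https://httpbin.org/get',
#   )
#   assert body != ''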
def get_stg_search_res2(self, k: str, default_sort_value: int=None) -> dict:
    """
    搜题狗 2 (driver version)
    :param k:
    :return:
    """
    # Only fetch the first page of data
    # NOTE: the incoming k is overridden with a hard-coded test keyword
    k = '社会主义核心'
    driver = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        load_images=False,
        logger=self.lg,
        user_agent_type=PHONE,
        ip_pool_type=self.ip_pool_type,
    )
    # Search input box selector
    input_css_sel = 'input#scform_srchtxt'
    submit_btn_sel = 'button#scform_submit'
    body = driver.get_url_body(
        url='http://www.etkz.cn/search.php?mod=forum',
        css_selector=submit_btn_sel,
        timeout=20,)
    assert body != ''
    # self.lg.info(body)
    driver.find_element(value=input_css_sel).send_keys(k)
    driver.find_element(value=submit_btn_sel).click()
    sleep(5.)
    body = Requests._wash_html(driver.page_source)
    assert body != ''
    self.lg.info(body)
    try:
        del driver
    except Exception:
        pass

    question_item_sel = {
        'method': 'css',
        'selector': 'div#threadlist ul li',
    }
    question_desc_div_sel = {
        'method': 're',
        'selector': '问题:(.*?)答案:',
    }
    answer_div_sel = {
        'method': 're',
        'selector': '答案:(.*?)更多相关问题',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert question_item != []
    # pprint(question_item)

    res = []
    for item in question_item:
        # ordered
        try:
            question_desc_div = parse_field(
                parser=question_desc_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert question_desc_div != ''
            answer_div = parse_field(
                parser=answer_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert answer_div != ''
            # Clean the extracted html
            question_desc = fix_text(wash_sensitive_info(
                data=question_desc_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=answer_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            continue

        ask_questions_result_item = AskQuestionsResultItem()
        ask_questions_result_item['question_desc'] = question_desc
        ask_questions_result_item['answer'] = answer
        res.append(dict(ask_questions_result_item))

    self.lg.info('[{}] stg2, k: {}'.format(
        '+' if res != [] else '-',
        k,
    ))

    return {
        'k': k,
        'page_num': default_sort_value,
        'res': res,
    }
def get_home_page_info_by_page_num(self, page_num: int) -> list:
    """
    Get a single page's info by page_num.
    :param page_num:
    :return:
    """
    def parse_page_info(body) -> list:
        """
        Parse the page body.
        :param body:
        :return:
        """
        # div item
        li_sel = {
            'method': 'css',
            'selector': 'div.center-wrap a.random_list',
        }
        title_sel = {
            'method': 'css',
            'selector': 'div.random_title ::text',
        }
        create_time_sel = {
            'method': 'css',
            'selector': 'div.date ::text',
        }
        article_img_url_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("data-original")',
        }
        article_img_name_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("alt")',
        }
        li_list = parse_field(
            parser=li_sel,
            target_obj=body,
            is_first=False,
        )
        res = []
        for item in li_list:
            # pprint(item)
            try:
                title = parse_field(
                    parser=title_sel,
                    target_obj=item,
                )
                assert title != ''
                create_time = parse_field(
                    parser=create_time_sel,
                    target_obj=item,
                )
                assert create_time != ''
                article_img_url_list = parse_field(
                    parser=article_img_url_sel,
                    target_obj=item,
                    is_first=False,
                )
                assert article_img_url_list != []
                article_img_name_list = parse_field(
                    parser=article_img_name_sel,
                    target_obj=item,
                    is_first=False,
                )
                assert article_img_name_list != []
                article_img_list = list(
                    zip(article_img_name_list, article_img_url_list))
                article_img_list = [{
                    'img_name': i[0],
                    'img_url': i[1],
                } for i in article_img_list]
            except (AssertionError, IndexError) as e:
                # print(e)
                continue

            res.append({
                'title': title,
                'create_time': create_time,
                'article_img_list': article_img_list,
            })

        return res

    headers = self.get_random_phone_headers()
    headers.update({
        'authority': 'www.doutula.com',
        'referer': 'https://www.doutula.com/',
    })
    params = (
        ('page', str(page_num)),
    )
    url = 'https://www.doutula.com/article/list/'
    # TODO: fetching with requests yields garbled (mis-decoded) text
    # body = Requests.get_url_body(
    #     url=url,
    #     headers=headers,
    #     params=params,
    #     ip_pool_type=self.ip_pool_type,
    #     num_retries=self.request_num_retries,
    #     encoding='utf-8',)
    # print(body)

    # Use a driver instead
    d = BaseDriver(ip_pool_type=tri_ip_pool, user_agent_type=PHONE)
    body = d.get_url_body(
        url=_get_url_contain_params(url=url, params=params))
    # print(body)
    try:
        del d
    except Exception:
        pass

    res = parse_page_info(body=body)
    print('[{}] page_num: {}'.format(
        '+' if res != [] else '-',
        page_num,
    ))
    collect()

    return res
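# Example usage (a sketch, not from the original code): assumes `spider` is a
# hypothetical instance of the class defining get_home_page_info_by_page_num,
# with its header helpers and ip pool already initialized.
#   page_info = spider.get_home_page_info_by_page_num(page_num=1)
#   for article in page_info:
#       print(article['title'], article['create_time'])
#       print(article['article_img_list'])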