Пример #1
0
    def login_bg(self, driver: BaseDriver):
        """
        Log in through the publish page and wait for the target page.

        Loads ``self.publish_url``, types ``self.yx_username`` and
        ``self.yx_password`` into the login form, clicks the submit button
        and then calls ``self.wait_for_recommend_good_label_appear``.

        :param driver: driver used to load the page and locate the form
            elements
        :return: None
        :raises LoginFailException: the page body is empty, or locating,
            filling or submitting the login form failed
        :raises EnterTargetPageFailException: waiting for the target page
            raised FZTimeoutError
        """
        self.lg.info('login ...')
        body = driver.get_url_body(
            url=self.publish_url,
            timeout=30,
        )
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, which would silently drop this guard.
        if body == '':
            raise LoginFailException

        try:
            driver.find_element(value='input#loginName').send_keys(
                self.yx_username)
            driver.find_element(value='input#loginPwd').send_keys(
                self.yx_password)
            driver.find_element(value='button#subbut').click()
        except (
                NoSuchElementException,
                SeleniumTimeoutException,
                AssertionError,
                WebDriverException,
                AttributeError,
        ) as e:
            # Surface every form-interaction failure as a login failure,
            # keeping the original error as the explicit cause.
            raise LoginFailException from e

        try:
            self.wait_for_recommend_good_label_appear(driver=driver)
        except FZTimeoutError as e:
            # Entering the target page failed.
            raise EnterTargetPageFailException from e
Пример #2
0
def _test(self):
    """
    Fetch https://httpbin.org/get with a PhantomJS-backed BaseDriver and
    return the parsed content of the first ``<pre>`` element of the page.

    :return: the value produced by ``json_2_dict`` for the first ``<pre>``
        content, or ``{}`` when an IndexError occurs while extracting it
        (e.g. the page contains no ``<pre>`` element)
    """
    browser = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
    page = browser.get_url_body(url='https://httpbin.org/get')
    pre_pattern = re.compile('<pre.*?>(.*)</pre>')
    try:
        first_pre = pre_pattern.findall(page)[0]
        parsed = json_2_dict(first_pre, default_res={})
    except IndexError:
        # Nothing to parse: report an empty result.
        return {}
    del browser

    return parsed
Пример #3
0
    async def _get_html_by_driver(self, url, load_images=False):
        """
        Fetch a page body with a freshly created BaseDriver.

        The driver is built from ``self.driver_path`` and
        ``self.ip_pool_type``; it is dropped and ``collect()`` is called
        whether or not the fetch succeeds.

        :param url: address of the page to load
        :param load_images: forwarded to BaseDriver as ``load_images``
        :return: whatever ``driver.get_url_body`` returns for ``url``
        """
        driver = BaseDriver(
            executable_path=self.driver_path,
            ip_pool_type=self.ip_pool_type,
            load_images=load_images)
        try:
            return driver.get_url_body(url=url)
        finally:
            # Drop our reference even on failure; otherwise the traceback
            # would keep the driver alive through this frame.
            # NOTE(review): presumably BaseDriver shuts the browser down in
            # its finalizer - confirm.
            del driver
            collect()
Пример #4
0
def test_driver_change_proxy():
    """
    Manually exercise per-request proxy switching of a BaseDriver.

    Issues five requests with ``change_proxy=True`` and
    ``change_user_agent=True``. When the target URL contains 'httpbin',
    the ``origin`` field is extracted from the response with a regex
    selector and printed; otherwise the whole body is printed.

    :return: None
    """
    d = BaseDriver(
        # Original author's note: this configuration works.
        type=PHANTOMJS,
        executable_path=PHANTOMJS_DRIVER_PATH,
        # type=FIREFOX,
        # executable_path=FIREFOX_DRIVER_PATH,

        # Original author's note: this configuration does not work.
        # type=CHROME,
        # executable_path=CHROME_DRIVER_PATH,
        headless=True,
        driver_use_proxy=True,
        ip_pool_type=tri_ip_pool,
    )
    origin_ip_sel = {'method': 're', 'selector': '\"origin\": \"(.*?)\",'}
    url = 'https://httpbin.org/get'
    # url = 'https://www.baidu.com'

    try:
        for _ in range(5):
            body = d.get_url_body(
                url=url,
                timeout=20,
                change_proxy=True,
                change_user_agent=True,
            )
            if 'httpbin' in url:
                origin_ip = parse_field(
                    parser=origin_ip_sel,
                    target_obj=body,
                )
                print('origin_ip: {}'.format(origin_ip))
            else:
                print(body)
    finally:
        # Drop the driver reference even when a request fails.
        # NOTE(review): presumably BaseDriver releases the browser in its
        # finalizer - confirm.
        del d
Пример #5
0
def test_driver(
    _type=CHROME,
    headless=True,
    driver_use_proxy=True,
    url: str = 'https://httpbin.org/get',
) -> str:
    """
    Load ``url`` with a BaseDriver of the requested type and return the body.

    The configuration and the fetched body are printed along the way.

    :param _type: driver type; one of CHROME, FIREFOX or PHANTOMJS
    :param headless: forwarded to BaseDriver as ``headless``
    :param driver_use_proxy: forwarded to BaseDriver as ``driver_use_proxy``
    :param url: address of the page to load
    :return: whatever ``get_url_body`` returns for ``url``
    :raises ValueError: ``_type`` is not one of the three supported types
    """
    if _type == CHROME:
        executable_path = CHROME_DRIVER_PATH
    elif _type == FIREFOX:
        executable_path = FIREFOX_DRIVER_PATH
    elif _type == PHANTOMJS:
        executable_path = PHANTOMJS_DRIVER_PATH
    else:
        raise ValueError('_type value 异常!')

    print('driver_type: {}, executable_path: {}, driver_use_proxy: {}'.format(
        _type, executable_path, driver_use_proxy))
    print('url: {}'.format(url))
    d = BaseDriver(
        type=_type,
        executable_path=executable_path,
        headless=headless,
        driver_use_proxy=driver_use_proxy,
        ip_pool_type=tri_ip_pool,
    )
    try:
        body = d.get_url_body(
            url=url,
            timeout=30,
        )
    finally:
        # Drop the driver reference even when the request fails.
        # NOTE(review): presumably BaseDriver releases the browser in its
        # finalizer - confirm.
        del d
    print(body)

    return body
Пример #6
0
    def get_stg_search_res2(self, k: str, default_sort_value: int=None) -> dict:
        """
        Search question source "stg2" for a keyword (driver version).

        Opens the forum search page with a PhantomJS-backed BaseDriver,
        types ``k`` into the search box, submits the form, waits a fixed
        five seconds and parses the resulting page. Only the first result
        page is read.

        :param k: keyword to search for
        :param default_sort_value: echoed back as ``page_num`` in the result
        :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
            where every entry of ``res`` is a dict with the keys
            ``question_desc`` and ``answer``
        :raises AssertionError: the search page or the result page is empty,
            or no result item was found on the page
        """
        def strip_markup(raw, tag_patterns) -> str:
            # Remove the given markup patterns, then pass the text through
            # fix_text. A fresh list is handed over on every call, exactly
            # as the inline literals did before.
            return fix_text(wash_sensitive_info(
                data=raw,
                replace_str_list=[],
                add_sensitive_str_list=list(tag_patterns),
                is_default_filter=False,
                is_lower=False,
            ))

        answer_tag_patterns = (
            '<strong>',
            '</strong>',
            '<font .*?>',
            '</font>',
        )
        question_tag_patterns = answer_tag_patterns + (
            '<span .*?>',
            '</span>',
        )

        # Selectors of the search input box and of the submit button.
        input_css_sel = 'input#scform_srchtxt'
        submit_btn_sel = 'button#scform_submit'
        driver = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH,
            load_images=False,
            logger=self.lg,
            user_agent_type=PHONE,
            ip_pool_type=self.ip_pool_type,
        )
        try:
            body = driver.get_url_body(
                url='http://www.etkz.cn/search.php?mod=forum',
                css_selector=submit_btn_sel,
                timeout=20,)
            # Explicit raise (same exception type as the former assert) so
            # the guard is not stripped under `python -O`.
            if body == '':
                raise AssertionError('search page body is empty')
            driver.find_element(value=input_css_sel).send_keys(k)
            driver.find_element(value=submit_btn_sel).click()
            # Fixed pause before reading the page source after the submit.
            sleep(5.)
            body = Requests._wash_html(driver.page_source)
            if body == '':
                raise AssertionError('search result page body is empty')
            self.lg.info(body)
        finally:
            # Drop the driver reference on every path, including failures.
            # NOTE(review): presumably BaseDriver releases the browser in
            # its finalizer - confirm.
            del driver

        question_item_sel = {
            'method': 'css',
            'selector': 'div#threadlist ul li',
        }
        question_desc_div_sel = {
            'method': 're',
            'selector': '问题:(.*?)答案:',
        }
        answer_div_sel = {
            'method': 're',
            'selector': '答案:(.*?)更多相关问题',
        }
        question_item = parse_field(
            parser=question_item_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        if question_item == []:
            raise AssertionError('no result item found on the result page')

        res = []
        # Items are processed in page order.
        for item in question_item:
            try:
                question_desc_div = parse_field(
                    parser=question_desc_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                if question_desc_div == '':
                    continue
                answer_div = parse_field(
                    parser=answer_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                if answer_div == '':
                    continue
                question_desc = strip_markup(
                    question_desc_div, question_tag_patterns)
                answer = strip_markup(answer_div, answer_tag_patterns)
            except Exception:
                # Best effort: an item that cannot be parsed or cleaned is
                # skipped instead of failing the whole search.
                continue

            ask_questions_result_item = AskQuestionsResultItem()
            ask_questions_result_item['question_desc'] = question_desc
            ask_questions_result_item['answer'] = answer
            res.append(dict(ask_questions_result_item))

        self.lg.info('[{}] stg2, k: {}'.format(
            '+' if res != [] else '-',
            k,
        ))

        return {
            'k': k,
            'page_num': default_sort_value,
            'res': res,
        }
Пример #7
0
    def get_home_page_info_by_page_num(self, page_num: int) -> list:
        """
        Fetch one page of the doutula.com article list and parse it.

        The page is loaded with a BaseDriver; a ``[+]``/``[-]`` status line
        with the page number is printed and ``collect()`` is called before
        returning.

        :param page_num: page number, sent as the ``page`` query parameter
        :return: list of dicts with the keys ``title``, ``create_time`` and
            ``article_img_list`` (a list of ``{'img_name', 'img_url'}``
            dicts); entries with a missing field are skipped
        """
        def parse_page_info(body) -> list:
            """
            Parse the article entries out of a list page.

            :param body: page source to parse
            :return: list of parsed article dicts
            """
            # One anchor per article entry.
            li_sel = {
                'method': 'css',
                'selector': 'div.center-wrap a.random_list',
            }
            title_sel = {
                'method': 'css',
                'selector': 'div.random_title ::text',
            }
            create_time_sel = {
                'method': 'css',
                'selector': 'div.date ::text',
            }
            article_img_url_sel = {
                'method': 'css',
                'selector': 'div.random_article img ::attr("data-original")',
            }
            article_img_name_sel = {
                'method': 'css',
                'selector': 'div.random_article img ::attr("alt")',
            }
            li_list = parse_field(
                parser=li_sel,
                target_obj=body,
                is_first=False,
            )
            res = []
            for item in li_list:
                # Explicit emptiness checks instead of asserts, so the
                # filtering still happens under `python -O`.
                try:
                    title = parse_field(
                        parser=title_sel,
                        target_obj=item,
                    )
                    if title == '':
                        continue
                    create_time = parse_field(
                        parser=create_time_sel,
                        target_obj=item,
                    )
                    if create_time == '':
                        continue
                    article_img_url_list = parse_field(
                        parser=article_img_url_sel,
                        target_obj=item,
                        is_first=False,
                    )
                    if article_img_url_list == []:
                        continue
                    article_img_name_list = parse_field(
                        parser=article_img_name_sel,
                        target_obj=item,
                        is_first=False,
                    )
                    if article_img_name_list == []:
                        continue
                except (AssertionError, IndexError):
                    # Same exception types the original code skipped on.
                    continue

                res.append({
                    'title': title,
                    'create_time': create_time,
                    'article_img_list': [{
                        'img_name': img_name,
                        'img_url': img_url,
                    } for img_name, img_url in zip(
                        article_img_name_list, article_img_url_list)],
                })

            return res

        params = (('page', str(page_num)), )
        url = 'https://www.doutula.com/article/list/'
        # NOTE: per the original TODO, fetching this page with requests
        # returned garbled text, so a driver is used instead.
        # NOTE(review): the driver uses tri_ip_pool rather than
        # self.ip_pool_type - confirm this is intended.
        d = BaseDriver(ip_pool_type=tri_ip_pool, user_agent_type=PHONE)
        try:
            body = d.get_url_body(
                url=_get_url_contain_params(url=url, params=params))
        finally:
            # Drop the driver reference even when the request fails.
            # NOTE(review): presumably BaseDriver releases the browser in
            # its finalizer - confirm.
            del d
        res = parse_page_info(body=body)
        print('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))
        collect()

        return res