Exemplo n.º 1
0
def _init_chrome(is_headless=True, is_pic=True, is_proxy=True):
    '''
    Build a configured Chrome WebDriver and return it.

    If you use chrome, set page_timeout=30.

    :param is_headless: when true, start Chrome with ``--headless``
    :param is_pic: when true, set the pref
        'profile.managed_default_content_settings.images' to 2
        (the original comment labels this "no-image mode")
    :param is_proxy: when true, ask ``MyIpPools`` for a proxy and pass it to
        Chrome through ``--proxy-server``
    :return: the created ``webdriver.Chrome`` instance
    '''
    from selenium import webdriver

    CHROME_DRIVER_PATH = '/Users/afa/myFiles/tools/chromedriver'
    print('--->>>初始化chrome驱动中<<<---')
    chrome_options = webdriver.ChromeOptions()
    if is_headless:
        # Original author's note: with headless set, pages could not be accessed.
        chrome_options.add_argument('--headless')
    # Google's docs mention this flag is needed to work around a bug.
    chrome_options.add_argument('--disable-gpu')
    # Required when running as root user; otherwise you get no-sandbox errors.
    chrome_options.add_argument('--no-sandbox')

    # chrome_options.add_argument('window-size=1200x600')   # set window size

    # No-image mode.
    if is_pic:
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

    # Proxy.
    if is_proxy:
        # Query the pool exactly once: the value that is type-checked must be
        # the same value that is used. Two separate calls may return different
        # results, and a non-string second result would crash on .replace().
        proxy_ip = MyIpPools()._get_random_proxy_ip()
        if isinstance(proxy_ip, str):
            proxy_ip = proxy_ip.replace('http://', '')
            if proxy_ip != '':
                chrome_options.add_argument(
                    '--proxy-server={0}'.format(proxy_ip))

    # Workaround for https pages that fail to open: accept ssl certificates.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    # Override the user-agent.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    chrome_options.add_argument('--user-agent={0}'.format(user_agent))

    # Original comment: "ignore certificate errors".
    # NOTE(review): 'excludeSwitches' asks chromedriver to leave this switch
    # out of its default command line rather than to add it - confirm intent.
    chrome_options.add_experimental_option('excludeSwitches',
                                           ['ignore-certificate-errors'])

    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                              chrome_options=chrome_options,
                              desired_capabilities=capabilities)
    print('------->>>初始化完毕<<<-------')

    return driver
Exemplo n.º 2
0
    def _init_chrome(self):
        '''
        Build a configured Chrome WebDriver and store it on ``self.driver``.

        If you use chrome, set page_timeout=30.

        :return: None
        '''
        print('--->>>初始化chrome驱动中<<<---')
        chrome_options = webdriver.ChromeOptions()
        # Original author's note: with headless set, pages could not be accessed.
        # chrome_options.add_argument('--headless')
        # Google's docs mention this flag is needed to work around a bug.
        chrome_options.add_argument('--disable-gpu')
        # Required when running as root user; otherwise you get no-sandbox errors.
        chrome_options.add_argument('--no-sandbox')

        # chrome_options.add_argument('window-size=1200x600')   # set window size

        # No-image mode.
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

        # Proxy: query the pool exactly once so that the value that is
        # type-checked is the same value that is used. Two separate calls may
        # return different results, and a non-string second result would crash
        # on .replace().
        proxy_ip = MyIpPools()._get_random_proxy_ip()
        if isinstance(proxy_ip, str):
            proxy_ip = proxy_ip.replace('http://', '')
            if proxy_ip != '':
                chrome_options.add_argument(
                    '--proxy-server={0}'.format(proxy_ip))

        # Workaround for https pages that fail to open: accept ssl certificates.
        capabilities = webdriver.DesiredCapabilities.CHROME.copy()
        capabilities['acceptSslCerts'] = True
        capabilities['acceptInsecureCerts'] = True

        # Override the user-agent with a randomly picked entry of HEADERS.
        user_agent = HEADERS[randint(0, len(HEADERS) - 1)]
        chrome_options.add_argument('--user-agent={0}'.format(user_agent))

        # Original comment: "ignore certificate errors".
        # NOTE(review): 'excludeSwitches' asks chromedriver to leave this
        # switch out of its default command line rather than to add it -
        # confirm intent.
        chrome_options.add_experimental_option('excludeSwitches',
                                               ['ignore-certificate-errors'])

        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                                       chrome_options=chrome_options,
                                       desired_capabilities=capabilities)
        print('------->>>初始化完毕<<<-------')
Exemplo n.º 3
0
    def from_ip_pool_set_proxy_ip_to_phantomjs(self):
        '''
        Switch the proxy of ``self.driver`` (PhantomJS) to one taken from
        ``MyIpPools``.

        :return: True when the proxy script was executed without raising;
            False when the pool gave no proxy or anything went wrong
        '''
        proxy_ip = MyIpPools()._get_random_proxy_ip()
        if not proxy_ip:  # the pool returns False on failure
            return False

        # print('------>>>| 正在使用的代理ip: {} 进行爬取... |<<<------'.format(proxy_ip))
        # Drop the 'http://' scheme, then split into ['host', 'port'].
        host_and_port = proxy_ip.replace('http://', '').split(':')

        try:
            # Indexing stays inside the try so that a proxy without a port
            # is reported as a failure instead of raising IndexError.
            host, port = host_and_port[0], host_and_port[1]
            tmp_js = {
                # The host has to be a quoted JavaScript string literal; a
                # bare dotted address such as 1.2.3.4 is not valid JavaScript.
                'script': 'phantom.setProxy("{}", {});'.format(host, port),
                'args': []
            }
            # Register the PhantomJS-specific endpoint on the command executor
            # before invoking it.
            self.driver.command_executor._commands['executePhantomScript'] = (
                'POST', '/session/$sessionId/phantom/execute')
            self.driver.execute('executePhantomScript', tmp_js)

        except Exception:
            # Best effort: any failure only means the proxy was not switched.
            print('动态切换ip失败')
            return False

        return True
Exemplo n.º 4
0
async def get_taobao_sign_and_body(base_url,
                                   headers: dict,
                                   params: dict,
                                   data: json,
                                   timeout=13,
                                   _m_h5_tk='undefine',
                                   session=None,
                                   logger=None):
    '''
    Fetch data from a taobao api that requires a ``sign`` query parameter.

    NOTE: ``headers`` and ``params`` are modified in place ('Host' is set on
    ``headers``; 't', 'sign' and 'data' are added to ``params``).
    NOTE(review): the http request is a blocking ``session.get`` call inside
    a coroutine.

    :param base_url: api url; must contain '://<host>/' so the host can be
        extracted for the 'Host' header
    :param headers: request headers
    :param params: query string parameters
    :param data: value passed to ``calculate_right_sign`` and sent as the
        'data' query parameter
    :param timeout: request timeout passed to ``session.get``
    :param _m_h5_tk: token passed to ``calculate_right_sign``
    :param session: session to reuse; a new ``requests.session()`` is created
        when None
    :param logger: logger used to report request failures; a stdlib logger
        for this module is used when None
    :return: (_m_h5_tk, session, body); ``_m_h5_tk`` and ``body`` are '' when
        the request fails
    '''
    if logger is None:
        # The error path calls logger.exception(); without a fallback, a
        # failed request would raise AttributeError on None instead of
        # returning the documented empty result.
        import logging
        logger = logging.getLogger(__name__)

    sign, t = await calculate_right_sign(data=data, _m_h5_tk=_m_h5_tk)
    headers['Host'] = re.compile(r'://(.*?)/').findall(base_url)[0]
    # Add the signed query string parameters.
    params.update({
        't': t,
        'sign': sign,
        'data': data,
    })

    # Proxy ip taken from the pool (the pool returns False on failure).
    proxy = MyIpPools()._get_random_proxy_ip()
    tmp_proxies = {
        'http': proxy,
    }

    if session is None:
        session = requests.session()

    try:
        response = session.get(url=base_url,
                               headers=headers,
                               params=params,
                               proxies=tmp_proxies,
                               timeout=timeout)
        # Only the part of the cookie before the first '_' is kept.
        _m_h5_tk = response.cookies.get('_m_h5_tk', '').split('_')[0]
        body = response.content.decode('utf-8')

    except Exception as e:
        logger.exception(e)
        _m_h5_tk = ''
        body = ''

    return (_m_h5_tk, session, body)
Exemplo n.º 5
0
    async def get_proxy(self):
        '''
        Asynchronously fetch a proxy from the ip pool.

        :return: whatever ``MyIpPools()._get_random_proxy_ip()`` returns;
            per the original notes, a string of the form 'http://ip:port',
            or False on failure
        '''
        return MyIpPools()._get_random_proxy_ip()
Exemplo n.º 6
0
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80')
# Original author's note: with headless set, pages could not be accessed.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# No-image mode.
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option("prefs", prefs)

# Workaround for https pages that fail to open: accept ssl certificates.
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True

# Method 1: set the proxy through a Chrome command line switch.
ip_object = MyIpPools()
# Query the pool exactly once so that the value that is type-checked is the
# same value that is used. Two separate calls may return different results,
# and a non-string second result would crash on .replace().
proxy_ip = ip_object._get_random_proxy_ip()
proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
if proxy_ip != '':
    chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

# 方法2:
# ip_object = MyIpPools()
# proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else ''
# # Change the proxy properties of that copy.
# capabilities['proxy'] = {
#     "httpProxy": proxy_ip,
#     "ftpProxy": proxy_ip,
#     "sslProxy": proxy_ip,
#     "noProxy": None,
#     "proxyType": "MANUAL",
#     "class": "org.openqa.selenium.Proxy",
#     "autodetect": False,