def _init_chrome(is_headless=True, is_pic=True, is_proxy=True):
    '''
    Build a configured Chrome webdriver and return it.

    If you use chrome, set page_timeout=30.

    :param is_headless: when true, start chrome with ``--headless``
    :param is_pic: when true, apply the no-image preference
        (``profile.managed_default_content_settings.images`` = 2)
    :param is_proxy: when true, pick a random proxy from ``MyIpPools`` and
        pass it to chrome via ``--proxy-server``
    :return: the created ``webdriver.Chrome`` instance
    '''
    from selenium import webdriver

    CHROME_DRIVER_PATH = '/Users/afa/myFiles/tools/chromedriver'

    print('--->>>初始化chrome驱动中<<<---')
    chrome_options = webdriver.ChromeOptions()
    if is_headless:
        # Author's note: with headless set, pages may fail to load.
        chrome_options.add_argument('--headless')

    # Google's docs mention this flag is needed to work around a bug.
    chrome_options.add_argument('--disable-gpu')
    # Required when running as root user; otherwise you would get no-sandbox errors.
    chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('window-size=1200x600')   # set the window size

    # No-image mode.
    if is_pic:
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

    # Proxy.
    if is_proxy:
        # Fetch the proxy exactly once: the pool returns a random entry (or a
        # non-str value on failure), so the value that is type-checked must be
        # the same value that is used.
        raw_proxy = MyIpPools()._get_random_proxy_ip()
        proxy_ip = raw_proxy.replace('http://', '') if isinstance(raw_proxy, str) else ''
        if proxy_ip != '':
            chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

    # Workaround for https pages failing to open: accept ssl / insecure certs.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # Override the user-agent.
    chrome_options.add_argument('--user-agent={0}'.format(user_agent))

    # Certificate-error handling (author's note: "ignore certificate errors").
    chrome_options.add_experimental_option('excludeSwitches', ['ignore-certificate-errors'])

    driver = webdriver.Chrome(
        executable_path=CHROME_DRIVER_PATH,
        chrome_options=chrome_options,
        desired_capabilities=capabilities,
    )
    print('------->>>初始化完毕<<<-------')

    return driver
def _init_chrome(self):
    '''
    Build a configured Chrome webdriver and store it on ``self.driver``.

    If you use chrome, set page_timeout=30.

    :return: None
    '''
    print('--->>>初始化chrome驱动中<<<---')
    chrome_options = webdriver.ChromeOptions()
    # Author's note: with headless set, pages may fail to load.
    # chrome_options.add_argument('--headless')

    # Google's docs mention this flag is needed to work around a bug.
    chrome_options.add_argument('--disable-gpu')
    # Required when running as root user; otherwise you would get no-sandbox errors.
    chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('window-size=1200x600')   # set the window size

    # No-image mode.
    prefs = {
        'profile.managed_default_content_settings.images': 2,
    }
    chrome_options.add_experimental_option("prefs", prefs)

    # Proxy: fetch exactly once, because the pool returns a random entry (or a
    # non-str value on failure) and the value that is type-checked must be the
    # same value that is used.
    raw_proxy = MyIpPools()._get_random_proxy_ip()
    proxy_ip = raw_proxy.replace('http://', '') if isinstance(raw_proxy, str) else ''
    if proxy_ip != '':
        chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

    # Workaround for https pages failing to open: accept ssl / insecure certs.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    # Override the user-agent with a random entry from HEADERS.
    chrome_options.add_argument(
        '--user-agent={0}'.format(HEADERS[randint(0, len(HEADERS) - 1)]))

    # Certificate-error handling (author's note: "ignore certificate errors").
    chrome_options.add_experimental_option('excludeSwitches', ['ignore-certificate-errors'])

    self.driver = webdriver.Chrome(
        executable_path=CHROME_DRIVER_PATH,
        chrome_options=chrome_options,
        desired_capabilities=capabilities,
    )
    print('------->>>初始化完毕<<<-------')
def from_ip_pool_set_proxy_ip_to_phantomjs(self):
    '''
    Pick a random proxy from ``MyIpPools`` and switch the running PhantomJS
    driver (``self.driver``) to it.

    :return: True when the switch script was executed, False when no proxy
        was available or the switch failed
    '''
    proxy_ip = MyIpPools()._get_random_proxy_ip()
    if not proxy_ip:    # the pool returns a falsy value on failure
        return False

    # print('------>>>| 正在使用的代理ip: {} 进行爬取... |<<<------'.format(proxy_ip))
    proxy_ip = re.compile(r'http://').sub('', proxy_ip)     # strip 'http://'
    proxy_ip = proxy_ip.split(':')                          # -> ['host', 'port']

    try:
        tmp_js = {
            # The host must be a quoted JS string literal; left bare
            # (e.g. 1.2.3.4) the generated script is not valid JavaScript.
            'script': 'phantom.setProxy("{}", {});'.format(proxy_ip[0], proxy_ip[1]),
            'args': [],
        }
        # Register the ghostdriver script-execution endpoint, then run the script.
        self.driver.command_executor._commands['executePhantomScript'] = (
            'POST', '/session/$sessionId/phantom/execute')
        self.driver.execute('executePhantomScript', tmp_js)
    except Exception:
        # Deliberately best-effort: report the failure and signal it to the caller.
        print('动态切换ip失败')
        return False

    return True
async def get_taobao_sign_and_body(base_url, headers: dict, params: dict, data: json, timeout=13, _m_h5_tk='undefine', session=None, logger=None):
    '''
    Fetch data from a taobao endpoint that requires a ``sign`` parameter.

    Note: ``headers`` and ``params`` are modified in place (``Host`` is set on
    ``headers``; ``t``, ``sign`` and ``data`` are added to ``params``).

    :param base_url: endpoint url; the ``Host`` header is taken from the text
        between ``://`` and the next ``/``
    :param headers: request headers
    :param params: query-string parameters
    :param data: payload that is signed and sent as the ``data`` parameter
    :param timeout: request timeout passed to ``session.get``
    :param _m_h5_tk: token passed to ``calculate_right_sign``
    :param session: session to reuse; a new ``requests`` session is created when None
    :param logger: logger used to report a failed request; when None a module
        logger is used instead
    :return: (_m_h5_tk, session, body); ``_m_h5_tk`` and ``body`` are '' when
        the request fails
    '''
    if logger is None:
        # Without a fallback the except handler below would itself raise
        # AttributeError on None and mask the real request error.
        import logging
        logger = logging.getLogger(__name__)

    sign, t = await calculate_right_sign(data=data, _m_h5_tk=_m_h5_tk)
    headers['Host'] = re.compile(r'://(.*?)/').findall(base_url)[0]
    params.update({
        # query-string entries added for the signed request
        't': t,
        'sign': sign,
        'data': data,
    })

    # Proxy ip.
    proxy = MyIpPools()._get_random_proxy_ip()      # False on failure
    tmp_proxies = {
        'http': proxy,
    }

    if session is None:
        session = requests.session()

    try:
        response = session.get(
            url=base_url,
            headers=headers,
            params=params,
            proxies=tmp_proxies,
            timeout=timeout,
        )
        # Keep only the part of the cookie before the first '_'.
        _m_h5_tk = response.cookies.get('_m_h5_tk', '')
        _m_h5_tk = _m_h5_tk.split('_')[0]
        body = response.content.decode('utf-8')
    except Exception as e:
        logger.exception(e)
        _m_h5_tk = ''
        body = ''

    return (_m_h5_tk, session, body)
async def get_proxy(self):
    '''
    Asynchronously get a proxy.

    :return: a random proxy from ``MyIpPools``, format: 'http://ip:port'
        (the pool returns False on failure)
    '''
    return MyIpPools()._get_random_proxy_ip()
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80') chrome_options.add_argument('--headless') # 注意: 设置headless无法访问网页 chrome_options.add_argument('--disable-gpu') # 设置无图模式 prefs = {'profile.managed_default_content_settings.images': 2} chrome_options.add_experimental_option("prefs", prefs) '''无法打开https解决方案''' # 配置忽略ssl错误 capabilities = webdriver.DesiredCapabilities.CHROME.copy() capabilities['acceptSslCerts'] = True capabilities['acceptInsecureCerts'] = True # 方法1: 设置代理 ip_object = MyIpPools() proxy_ip = ip_object._get_random_proxy_ip().replace( 'http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else '' if proxy_ip != '': chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip)) # 方法2: # ip_object = MyIpPools() # proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else '' # # Change the proxy properties of that copy. # capabilities['proxy'] = { # "httpProxy": proxy_ip, # "ftpProxy": proxy_ip, # "sslProxy": proxy_ip, # "noProxy": None, # "proxyType": "MANUAL", # "class": "org.openqa.selenium.Proxy", # "autodetect": False,