def crawlerMain(start, over, IPaddress, path, path1, goubanjiaP, max_page):
    """Crawl tax-article pages id=start..over-1 through a rotating HTTP proxy.

    NOTE(review): relies on a module-level PhantomJS `driver`, the global
    `IPList` proxy pool, and helpers saveWebSitePage()/getIPList() defined
    elsewhere in this file — confirm they exist before reuse.
    """
    # Configure the initial manual proxy from the given address.
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = IPaddress
    # Push the proxy settings into the PhantomJS capabilities and restart the session.
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    print("第" + str(start) + "次循环开始循环:" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    # Main crawl loop, one article id per iteration.
    for i in range(start, over):
        print("开始循环-次数:", i)
        # Target article URL for this id.
        url = "http://hd.chinatax.gov.cn/guoshui/action/GetArticleView1.do?id=" + str(
            i) + "&flag=1"
        # saveWebSitePage() returns truthy when the current IP must be swapped.
        changeIP = saveWebSitePage(driver, url, i)
        while changeIP:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            # Short implicit wait (original comment said 5s but the value is 1s).
            driver.implicitly_wait(1)
            # Global pool of candidate proxy IPs.
            global IPList
            if len(IPList):
                # Pool has entries: take the next IP.
                IPaddress = IPList[0]
                del IPList[0]
            else:
                # Pool exhausted: refill it from the IP-scraping helpers.
                IPList.extend(getIPList(path, path1, goubanjiaP, max_page))
                # Still empty after refill: restart this retry loop.
                if not len(IPList):
                    continue
                IPaddress = IPList[0]
                del IPList[0]
            # Switch to the new IP and retry the same page.
            proxy.http_proxy = IPaddress
            print(str(IPaddress) + "正在重新访问-" + url)
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
            changeIP = saveWebSitePage(driver, url, i)
    print("第" + str(start) + "次循环开始循环:" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def set_proxy(self):
    """Attach a manual HTTP/SOCKS proxy (localhost:1080) to the Chrome capabilities."""
    self.capabilities = webdriver.DesiredCapabilities.CHROME
    local_proxy = webdriver.Proxy()
    local_proxy.proxy_type = ProxyType.MANUAL
    local_proxy.http_proxy = "127.0.0.1:1080"
    local_proxy.socks_proxy = "127.0.0.1:1080"
    local_proxy.add_to_capabilities(self.capabilities)
def get_web(ip_list, url_str):
    """Visit *url_str* through each proxy in *ip_list* with PhantomJS, simulating a reader.

    For every proxy IP the page is opened, scrolled down in random steps,
    then to the bottom and back to the top. Proxies that fail to load the
    page are skipped.
    """
    if not ip_list:
        print('ip_list is null!')
        return
    for ip_one in ip_list:
        driver = webdriver.PhantomJS()
        proxy = webdriver.Proxy()
        # BUG FIX: proxy_type was the *string* 'ProxyType.MANUAL', not the enum,
        # so the proxy was never actually applied. Use the real enum value.
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = ip_one
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        try:
            driver.get(url_str)
        except Exception as e:
            # This proxy cannot reach the page: skip it.
            print('bad ip' + str(e))
            driver.quit()
            continue
        # Scroll down in 5 random-sized steps to mimic human reading.
        web_scrolltop_number = 0
        for auto_read_times in range(0, 5):  # was py2 xrange
            web_scrolltop_number += random.uniform(300, 600)
            time.sleep(random.uniform(2, 3))
            js = "var q=document.documentElement.scrollTop=" + str(
                web_scrolltop_number)
            driver.execute_script(js)
        # Jump to the bottom, then back to the top.
        time.sleep(random.uniform(2, 4))
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        time.sleep(random.uniform(2, 4))
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        time.sleep(random.uniform(3, 4))
        driver.quit()
        print("success!")
def set_proxy(self):
    """Build PhantomJS capabilities with a random UA and a freshly-scraped proxy IP."""
    from selenium.webdriver.common.proxy import ProxyType
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    # Scrape candidate proxies and keep only the ones that pass the liveness check.
    checker = ProxyIPCheck()
    scraped = checker.proxyIPGet_XiciDaili()
    if len(scraped) < 1:
        # Scrape failed: fall back to two hard-coded proxies.
        print("proxyIPGet failed!!")
        candidate_proxies = ["220.189.249.80:80", "124.248.32.43:80"]
    else:
        candidate_proxies = checker.validIPGet(scraped)
    agents = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    ]
    caps = DesiredCapabilities.PHANTOMJS.copy()
    caps["phantomjs.page.settings.userAgent"] = random.choice(agents)
    # Skip image loading for speed.
    caps["phantomjs.page.settings.loadImages"] = False
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    manual.http_proxy = random.choice(candidate_proxies)
    print("Proxy IP: ", manual.http_proxy)
    manual.add_to_capabilities(caps)
    return caps
def crawlData(self, url):
    """Fetch *url* with PhantomJS, click the phone-reveal element, and return the page HTML."""
    caps = DesiredCapabilities.PHANTOMJS.copy()
    # UA comes from the project config; images disabled for speed.
    caps["phantomjs.page.settings.userAgent"] = (
        config.get_header())
    caps["phantomjs.page.settings.loadImages"] = False
    # Proxy object is prepared but no address is attached (proxy lines were commented out upstream).
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    # Open PhantomJS with the configured capabilities and restart the session with them.
    driver = webdriver.PhantomJS(desired_capabilities=caps)
    driver.start_session(caps)
    driver.implicitly_wait(5)
    # Hard timeouts so driver.get() cannot hang indefinitely.
    driver.set_page_load_timeout(20)
    driver.set_script_timeout(20)
    driver.get(url)
    driver.implicitly_wait(1)
    driver.find_element_by_xpath(
        '//div[@class="house-chat-phone"]').click()
    return driver.page_source
def getdaxiangdailiIP():
    """Poll the daxiangdaili API until a valid proxy IP is returned, then return it."""
    driver = webdriver.PhantomJS(
        executable_path=r'C:\Users\wangquan\phantomjs\bin\phantomjs.exe')
    api_url = "http://tvp.daxiangdaili.com/ip/?tid=556249540865397&num=1&protocol=http"
    driver.set_page_load_timeout(5)
    # Reset to a direct (system) connection before calling the API.
    direct = webdriver.Proxy()
    direct.proxy_type = ProxyType.DIRECT
    direct.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    time.sleep(1.5)
    fetched_ip = ""
    while True:
        driver.get(api_url)
        # Strip HTML tags; the API body is the bare ip:port.
        fetched_ip = filter_tags(driver.page_source)
        if ip_exist(fetched_ip):
            print("正确获取ip地址-开始爬虫:" + fetched_ip)
            break
        print("获取ip地址失败正在重新获取")
        time.sleep(1.5)
    return fetched_ip
def send_mail(self, data, proxie):
    """Register an account at pngtree.com through *proxie* using PhantomJS.

    *data* supplies [_, username, email-prefix, password]; a screenshot of the
    result is saved to regist.png.
    """
    username = data[1]
    email = data[2].split('@')[0] + '@027168.com'
    password = data[3]
    # Launch a PhantomJS (webkit) browser.
    browser1 = webdriver.PhantomJS(
        executable_path=r"/usr/local/bin/phantomjs", )
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    manual.http_proxy = proxie
    # Apply the proxy to the PhantomJS capabilities and restart the session.
    manual.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    browser1.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    browser1.get('https://pngtree.com/')
    # Open the registration dialog and fill the form.
    browser1.find_element_by_id('base-public-register-button').click()
    form_fields = (
        ('base-public-login-username-regiser-text', username),
        ('base-public-login-email-regiser-text', email),
        ('base-public-login-password-regiser-text', password),
    )
    for element_id, value in form_fields:
        browser1.find_element_by_id(element_id).send_keys(value)
    browser1.find_element_by_id('base-sub-regist-Btn').click()
    time.sleep(10)
    browser1.save_screenshot('regist.png')
    browser1.quit()
def goubanjiaIP(Pagesize,IPaddress):
    """Scrape up to *Pagesize* pages of free proxies from goubanjia.com via Chrome.

    Returns a list of "ip:port" strings. NOTE(review): the Proxy settings are
    added to the PHANTOMJS capabilities while the browser is Chrome — confirm
    the proxy actually takes effect.
    """
    # The two free-proxy listing sections of the site.
    goubanjia = ['http://www.goubanjia.com/free/gngn/', 'http://www.goubanjia.com/free/gnpt/']
    print("开始从《全网代理ip》----中获取免费的代理ip地址")
    # Drive the pages with chromedriver.
    chromedriver = r"C:\Users\wangquan\chromedriver\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver1 = webdriver.Chrome(chromedriver)
    # Accumulates the formatted ip:port strings.
    p_pool = []
    # Shrink and move the window off-screen so it is unobtrusive.
    driver1.set_window_size(0, 0)
    driver1.set_window_position(-200,-200)
    # Route traffic through the supplied proxy address.
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = IPaddress
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver1.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    for urlItem in goubanjia:
        Haserror=1
        opt = 1
        while opt <=Pagesize:
            # Paged listing URL: index1.shtml, index2.shtml, ...
            url=urlItem+"index"+str(opt)+".shtml"
            try:
                # Bound the page load so a dead proxy fails fast.
                driver1.set_page_load_timeout(5)
                driver1.get(url)
                time.sleep(0.5)
                pageSearch=driver1.page_source
                bobj_2 = BeautifulSoup(pageSearch, "lxml")
                # Cells carrying the obfuscated IP text.
                sibs = bobj_2.findAll("td", {"class", "ip"})
                for child in sibs:
                    # Strip markup; keep only strings that look like ip:port.
                    if ('.' in filter_tags(replicFinsh(str(child),'none'))) and (':' in filter_tags(replicFinsh(str(child),'none'))):
                        p_pool.append(filter_tags(replicFinsh(str(child), 'none')))
                opt=opt+1
            except Exception as e:
                # Count the failure, back off briefly, then retry the same page.
                Haserror=Haserror+1
                time.sleep(random.randint(1, 6) * 0.1)
                # Third failure: abandon this section by forcing the loop condition false.
                if Haserror==3:
                    opt=1000000
    driver1.close()
    driver1.quit()
    return p_pool
def get_desired_capabilities(self, spider=None): desired_capabilities = DesiredCapabilities.PHANTOMJS.copy() # 从USER_AGENTS列表中随机选一个浏览器头,伪装浏览器 desired_capabilities["phantomjs.page.settings.userAgent"] = (random.choice(self.user_agent_list)) # 不载入图片,爬页面速度会快很多 desired_capabilities["phantomjs.page.settings.loadImages"] = False # desired_capabilities["phantomjs.page.settings.resourceTimeout"] = 15000 # 利用DesiredCapabilities(代理设置)参数值,重新打开一个sessionId,我看意思就相当于浏览器清空缓存后,加上代理重新访问一次url # headers = {'Accept': 'application/json, text/javascript, */*; q=0.01', # 'Accept - Encoding': 'gzip, deflate, sdch', # 'Accept-Language': 'zh-CN,zh;q=0.8', # zh-CN,zh;q=0.8 en-US,en;q=0.8 # 'Cache-Control': 'max-age=0', # 'Connection': 'keep-alive'} # for key, value in headers.iteritems(): # desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value if not spider: return desired_capabilities if not hasattr(spider, "taskJob"): return desired_capabilities taskJob = spider.taskJob if taskJob and taskJob.proxyId: if taskJob.proxyType and taskJob.proxyType == ProxyType.DYNAMIC_FOR_URL: data = ProxyDao.contentDetail(taskJob.proxyId) proxyInfo = data['result']['proxy'] time.sleep(4) proxy_ips = getProxyList(proxyInfo.authUrl) proxy = webdriver.Proxy() proxy.proxy_type = Type.MANUAL proxy.http_proxy = random.choice(proxy_ips) proxy.add_to_capabilities(desired_capabilities) return desired_capabilities
def process_request(self, request, spider):
    """Scrapy downloader middleware: render request.url in Chrome behind a fresh proxy.

    Picks a random User-Agent, fetches one proxy IP from the data5u API, opens
    the page in Chrome through that proxy, and returns the rendered HTML.
    """
    agent = choice(AGENTS)
    request.headers['User-Agent'] = agent
    if agent:
        # data5u order number (from the user-center page).
        order = "d168f83eca5a334b2e30fa051bf424f0"
        apiUrl = "http://api.ip.data5u.com/dynamic/get.html?order=" + str(order) + '&sep=3'
        # One IP per line; take the first.
        res = urllib.urlopen(apiUrl).read().strip("\n")
        ips = res.split("\n")
        print('proxy is working ip:' + str(ips[0]))
        chromeOptions = webdriver.ChromeOptions()
        # BUG FIX: the argument was "--proxy-server =http://..." — the stray
        # space before '=' makes Chrome ignore the flag, so no proxy was used.
        # Also removed a stray, discarded webdriver.Proxy() construction.
        chromeOptions.add_argument("--proxy-server=http://{}".format(ips[0]))
        browser = webdriver.Chrome(executable_path='D:\Python27\Tools\chromedriver_win32/chromedriver.exe',
                                   chrome_options=chromeOptions)
        browser.get(request.url)
        body = browser.page_source
        print(body)
        browser.set_window_size(1400, 900)
        return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)
def set_proxy(self):
    """Point the existing PhantomJS session at a random proxy from HTTP_IPS."""
    caps = webdriver.DesiredCapabilities.PHANTOMJS
    chosen = webdriver.Proxy()
    chosen.proxy_type = ProxyType.MANUAL
    chosen.http_proxy = random.choice(HTTP_IPS)
    # Merge the proxy into the capabilities, then re-open the session with them.
    chosen.add_to_capabilities(caps)
    self.driver.start_session(caps)
def dynamic_change_proxy():
    """Demonstrate swapping the proxy between two visits to httpbin.org/ip.

    NOTE(review): the Proxy is added to the PHANTOMJS capabilities although the
    browser is Chrome — confirm the proxy actually applies.
    """
    browser = webdriver.Chrome(
        executable_path=r'C:\Users\chenjinwei\source\chromedriver.exe')
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    for attempt in range(2):
        addr = getproxy()
        print(addr)
        manual.http_proxy = addr
        if attempt == 1:
            print('second time workking')
        # Re-open the session with the new proxy baked into the capabilities.
        manual.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        browser.get('http://httpbin.org/ip')
        print('1: ', browser.session_id)
        print('2: ', browser.page_source)
        print('3: ', browser.get_cookies())
        if attempt == 0:
            # Pause before demonstrating the second proxy swap.
            time.sleep(20)
def dynamic_load(url):
    """Load *url* in a proxied PhantomJS instance and print the rendered page source.

    A random proxy from redis_conn1() and a fixed desktop-Firefox user-agent are
    baked into the desired capabilities before the browser is created.
    """
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # Spoof a desktop Firefox UA.
    desired_capabilities[
        "phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0"
    # Skip image loading for speed.
    desired_capabilities["phantomjs.page.settings.loadImages"] = False
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = random.choice(redis_conn1())
    proxy.add_to_capabilities(desired_capabilities)
    # Open PhantomJS with the configured capabilities.
    driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
    driver.implicitly_wait(5)
    # Timeouts so driver.get() cannot hang forever without raising.
    driver.set_page_load_timeout(100)
    driver.set_script_timeout(100)
    driver.get(url)
    # CONSISTENCY FIX: was a Python-2 print statement ("print driver.page_source"),
    # a syntax error under Python 3 and inconsistent with the rest of the file.
    print(driver.page_source)
def open_browser(proxy=None, download=None):
    """Open Firefox with the 'auto' profile; optionally set a manual proxy and download dir."""
    ff_profile = webdriver.FirefoxProfile(
        r'/home/xling/.mozilla/firefox/j0sto346.auto/')
    ff_profile.native_events_enabled = True
    if download:
        # Route downloads into the caller-supplied directory.
        ff_profile.set_preference("browser.download.dir", download)
    if proxy is not None:
        # Build a manual-proxy descriptor routing HTTP and SSL through `proxy`.
        manual = {
            'proxyType': {
                'ff_value': 1,
                'string': 'manual'
            },
            'httpProxy': proxy,
            'sslProxy': proxy
        }
        ff_profile.set_proxy(webdriver.Proxy(manual))
    return webdriver.Firefox(ff_profile, executable_path="/usr/bin/geckodriver")
def get_phantomjs_driver(strategy):
    """Build a PhantomJS driver configured from *strategy* (UA, referer, cookie, proxy)."""
    caps = webdriver.DesiredCapabilities.PHANTOMJS
    caps['phantomjs.page.settings.resourceTimeout'] = '60000'
    caps['phantomjs.page.settings.loadImages'] = True
    # Default UA; overridden below when the strategy supplies one.
    caps['phantomjs.page.settings.userAgent'] = \
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/538.1 "\
        "(KHTML, like Gecko) Safari/538.1"
    if strategy.get('user-agent'):
        caps['phantomjs.page.settings.userAgent'] = strategy.get('user-agent')
    if strategy.get('referer'):
        caps["phantomjs.page.customHeaders.Referer"] = strategy.get('referer')
    if strategy.get('cookie'):
        caps["phantomjs.page.customHeaders.Cookie"] = strategy.get('cookie')
    px = webdriver.Proxy()
    if strategy.get('proxy'):
        # Fetch a proxy endpoint from the configured proxy service.
        proxy_url = requests.get(cf.proxy_url).text
        if not proxy_url:
            raise Exception('fetch proxy empty, sleep 3s to retry')
        px.proxy_type = webdriver.common.proxy.ProxyType.MANUAL
        px.http_proxy = proxy_url
    else:
        # No proxy requested: use the system settings.
        px.proxy_type = webdriver.common.proxy.ProxyType.SYSTEM
    px.add_to_capabilities(caps)
    driver = webdriver.PhantomJS(executable_path='./phantomjs')
    driver.implicitly_wait(60)
    driver.set_page_load_timeout(60)
    #driver.set_window_size(4096, 2160)
    return driver
def browser_get(url, http_proxy):
    """Open *url* in PhantomJS through *http_proxy*, using a random header set."""
    count = len(headers_list)
    headers = headers_list[random.randint(0, count - 1)]
    caps = webdriver.DesiredCapabilities.PHANTOMJS
    # Copy every header except the UA into PhantomJS custom headers.
    for key, value in headers.items():
        if key != 'User-Agent':
            caps['phantomjs.page.customHeaders.{}'.format(key)] = value
    caps['phantomjs.page.settings.userAgent'] = headers['User-Agent']
    driver = webdriver.PhantomJS(executable_path=browser_path)
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(10)
    # Attach the manual proxy and restart the session with it.
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    manual.http_proxy = http_proxy
    manual.add_to_capabilities(caps)
    driver.start_session(caps)
    log.info('starting open {}'.format(url))
    driver.get(url)
    log.info('1: {}'.format(driver.session_id))
    print('2:', driver.page_source)
def selenium_proxy(self):
    """Return a Selenium WebDriver Proxy routing HTTP and SSL through this proxy."""
    details = {"httpProxy": self.proxy()}
    details["sslProxy"] = self.proxy()
    return webdriver.Proxy(details)
def setProxy(self, proxyStr):
    """Re-open the PhantomJS session with *proxyStr* as the manual HTTP proxy."""
    caps = webdriver.DesiredCapabilities.PHANTOMJS
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    manual.http_proxy = proxyStr
    # Bake the proxy into the capabilities, then start a fresh session with them.
    manual.add_to_capabilities(caps)
    self.driver.start_session(caps)
def use_proxy(self, url):
    """Set *url* as both the HTTP and SSL proxy on self.dcap; returns self for chaining."""
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    manual.http_proxy = url
    manual.ssl_proxy = url
    manual.add_to_capabilities(self.dcap)
    return self
def getXiCiIP(PageSize, IPaddress): driver = webdriver.PhantomJS( executable_path=r'C:\Users\wangquan\phantomjs\bin\phantomjs.exe') # 设置系统代理 proxy = webdriver.Proxy() proxy.proxy_type = ProxyType.MANUAL # 代理ip地址 proxy.http_proxy = IPaddress # 将代理设置添加到webdriver.DesiredCapabilities.PHANTOMJS中 proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS) driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS) #设置爬虫页面超时 driver.set_page_load_timeout(5) #从西次进行爬虫 p_pool = [] xici_page = 1 while xici_page <= PageSize: new_count = 0 xici_url = 'http://www.xicidaili.com//wt/' + str(xici_page) try: max_wait = 5 # 20s driver.set_page_load_timeout(max_wait) driver.set_script_timeout(max_wait) driver.get(xici_url) bobj_2 = BeautifulSoup(driver.page_source, "lxml") sibs = bobj_2.findAll('table', {'id': 'ip_list'})[0].tr.next_siblings except Exception as e: try: print('error 1:', e) max_wait = 5 # 20s driver.set_page_load_timeout(max_wait) driver.set_script_timeout(max_wait) driver.get(xici_url) # 等待时长6秒,默认0.5秒询问一次 WebDriverWait(driver, 6) bobj_2 = BeautifulSoup(driver.page_source, "lxml") sibs = bobj_2.findAll('table', {'id': 'ip_list'})[0].tr.next_siblings except Exception as e: print('error 2', e) break for sib in sibs: try: #拼接ip地址 get_proxy = sib.findAll('td')[1].get_text( ) + ':' + sib.findAll('td')[2].get_text() p_pool.append(get_proxy) new_count += 1 except Exception as e: print('error 2', e) break xici_page += 1 # 第几个分页面 return p_pool
def __init__(self, proxy=None):
    """init the webdriver by setting the proxy and user-agent

    Args:
        proxy (str): proxy in the form of ip:port
    """
    self.amazon_index = r'https://www.amazon.com/'
    self.libPath = os.path.abspath("./lib/geckodriver64.exe")
    if proxy == None:
        # No proxy requested: plain Firefox.
        self.driver = webdriver.Firefox(executable_path=self.libPath)
        # self.driver = webdriver.Chrome(executable_path=self.libPath)
        # self.driver = webdriver.PhantomJS(executable_path=self.libPath)
        self.proxy = ""
    else:
        self.proxy = proxy
        ip, port = proxy.split(':')
        # Configure the Firefox profile to route HTTP and SSL through the proxy.
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.http", ip)
        profile.set_preference("network.proxy.http_port", int(port))
        profile.set_preference("network.proxy.ssl", ip)
        profile.set_preference("network.proxy.ssl_port", int(port))
        profile.set_preference("browser.tabs.remote.autostart.2", False)
        # Disable images and flash for speed.
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                               'false')
        # (Commented-out alternatives retained upstream: proxy auth extension,
        #  download-dir prefs for captcha images, and a UA override.)
        profile.update_preferences()
        # NOTE(review): chrome_options and proxy_driver below are built but never
        # used — the browser actually launched is Firefox with `profile`.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=%s' % proxy)
        proxy_driver = webdriver.Proxy()
        proxy_driver.proxy_type = ProxyType.MANUAL
        proxy_driver.http_proxy = proxy
        # self.driver = webdriver.PhantomJS(self.libPath)
        # proxy_driver.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        # self.driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        self.driver = webdriver.Firefox(executable_path=self.libPath,
                                        firefox_profile=profile)
        # self.driver = webdriver.Chrome(executable_path=self.libPath,chrome_options=chrome_options)
        log_info('current proxy: %s' % proxy)
def build_browser(agent, http_proxy):
    """Build a proxied PhantomJS browser plus its desired-capabilities dict.

    Returns (driver, desired_capabilities): capabilities carry *agent* as the
    UA, image loading disabled, and *http_proxy* as a manual HTTP proxy.
    """
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = agent
    # Skip images for speed.
    desired_capabilities["phantomjs.page.settings.loadImages"] = False
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = http_proxy
    proxy.add_to_capabilities(desired_capabilities)
    # BUG FIX: the original returned `driver` without ever defining it
    # (guaranteed NameError). Create the browser from the capabilities here.
    driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
    return driver, desired_capabilities
def parse(self, response):
    """Endlessly vote on a survey page, fetching a fresh proxy for every attempt.

    NOTE(review): `while 1 == 1` never terminates; `count` tracks successful votes.
    """
    # PhantomJS tuning: no images, disk cache on, ignore SSL errors.
    service_args = []
    service_args.append('--load-images=no')
    service_args.append('--disk-cache=yes')
    service_args.append('--ignore-ssl-errors=true')
    browser = webdriver.PhantomJS(service_args=service_args)
    count = 0
    while 1 == 1:
        print(count)
        # One fresh proxy IP per iteration from the daxiangdaili API.
        proxyIp = requests.get(
            "http://tvp.daxiangdaili.com/ip/?tid=557895172920514&num=1&filter=on"
        ).content
        # thisip = str(IPPools.get_proxy(), encoding="utf-8")
        thisip = str(proxyIp, encoding="utf-8")
        # Probe the target through the proxy before driving the browser.
        try:
            #telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
            requests.get(
                'http://vote1.qblife.com.cn/vote24/survey/7?from=timeline&isappinstalled=0',
                proxies={"http": "http://" + thisip},
                timeout=1)
        except:
            print('connect failed')
        else:
            print('success')
            # Proxy works: re-open the PhantomJS session through it and vote.
            try:
                proxy = webdriver.Proxy()
                proxy.proxy_type = ProxyType.MANUAL
                proxy.http_proxy = thisip
                proxy.add_to_capabilities(
                    webdriver.DesiredCapabilities.PHANTOMJS)
                browser.start_session(
                    webdriver.DesiredCapabilities.PHANTOMJS)
                browser.get(
                    "http://vote1.qblife.com.cn/vote24/survey/7?from=timeline&isappinstalled=0"
                )
                browser.set_page_load_timeout(3)
                #print(browser.page_source)
                # Expand the list, then click the vote button for entry 178.
                elem = browser.find_element_by_id("loadmore")
                elem.click()
                elem = browser.find_element_by_id("vote-btn-178")
                elem.click()
            except:
                print('获取不到元素')
            else:
                # Read back the vote counter to confirm success.
                num = browser.find_element_by_id("vote-num-178").text
                print('点赞成功' + num)
                count = count + 1
def _create_driver(self):
    """Create a PhantomJS driver with direct (no-proxy) networking and 13s timeouts."""
    driver = new_driver(user_agent=USER_AGENT, js_re_ignore='/PicCheckCode1/g')
    direct = webdriver.Proxy()
    direct.proxy_type = ProxyType.DIRECT
    direct.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    # Timeouts so driver.get() cannot hang forever without raising.
    driver.set_page_load_timeout(13)
    driver.set_script_timeout(13)
    return driver
def check(self):
    """Pick a random proxy from self.allip() and print baidu.com's <title> tags.

    NOTE(review): the Proxy object is configured but never attached to the
    driver (no add_to_capabilities/start_session), so the request below goes
    direct — confirm whether that is intended before "fixing" it.
    """
    driver = webdriver.PhantomJS()
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    ips = self.allip()
    # BUG FIX: random.randint(0, len(ips)) can return len(ips), raising
    # IndexError on ips[i]. random.choice picks a valid element uniformly.
    endport = random.choice(ips)
    proxy.http_proxy = endport.ip + ':' + endport.port
    driver.get("http://www.baidu.com")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    title = soup.find_all('title')
    print(title)
def driver(self):
    """Create a browser per self.headless: True -> headless Firefox,
    "PhantomJS" -> PhantomJS, anything else -> normal Firefox.

    Each branch optionally routes traffic through self.proxy() when
    self.proxies is truthy. Page-load and script timeouts come from self.timeout.
    """
    if self.headless == True:
        # Headless Firefox.
        options = webdriver.FirefoxOptions()
        options.set_headless()
        # options=None
        options.add_argument('headless')
        options.add_argument('--disable-gpu')
        if self.proxies:
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': self.proxy()  # proxy ip:port
            })
            browser_driver = webdriver.Firefox(firefox_options=options,
                                               proxy=proxy)
        else:
            browser_driver = webdriver.Firefox(firefox_options=options)
    elif self.headless == "PhantomJS":
        # PhantomJS with a random UA and images disabled.
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        desired_capabilities["phantomjs.page.settings.userAgent"] = ua()
        desired_capabilities["phantomjs.page.settings.loadImages"] = False
        if self.proxies:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = self.proxy()
            proxy.add_to_capabilities(desired_capabilities)
            browser_driver = webdriver.PhantomJS(
                executable_path=self.phantomjs_driver_path,
                desired_capabilities=desired_capabilities,
                service_args=[
                    '--ignore-ssl-errors=true', "--cookies-file=cookie.txt"
                ])
        else:
            browser_driver = webdriver.PhantomJS(
                executable_path=self.phantomjs_driver_path,
                desired_capabilities=desired_capabilities,
                service_args=[
                    '--ignore-ssl-errors=true', "--cookies-file=cookie.txt"
                ])
    else:
        # Default: ordinary (headed) Firefox.
        if self.proxies:
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': self.proxy()  # proxy ip:port
            })
            browser_driver = webdriver.Firefox(proxy=proxy)
        else:
            browser_driver = webdriver.Firefox()
    browser_driver.set_page_load_timeout(self.timeout)
    browser_driver.set_script_timeout(self.timeout)
    return browser_driver
def login_with_cookies(self, login_url, cookies_data, domain, browser='foxfire'):
    """Open *login_url* in the chosen browser and install *cookies_data* for *domain*.

    browser: 'foxfire' (Firefox), 'chrome', or 'phantomjs'; any other value
    prints an error and returns None. self.proxy ("ip:port" or None) is applied
    per-browser. Returns the driver with the cookies set.
    """
    if browser == 'foxfire':
        # Firefox branch.
        profile = webdriver.FirefoxProfile()
        if self.proxy is not None:
            # Split "ip:port" and set an HTTP proxy on the profile.
            ip = self.proxy.split(':')[0]
            port = self.proxy.split(':')[1]
            profile.set_preference("network.proxy.type", 1)
            profile.set_preference("network.proxy.http", ip)
            profile.set_preference("network.proxy.http_port", port)
            # NOTE(review): port is passed as a string here; Firefox prefs
            # usually expect an int — confirm this works as intended.
        driver = webdriver.Firefox(executable_path=self.firefox_path,
                                   firefox_profile=profile)
    elif browser == 'chrome':
        # Chrome branch.
        options = webdriver.ChromeOptions()
        if self.proxy is not None:
            options.add_argument('--proxy-server=http://' + self.proxy)
        driver = webdriver.Chrome(executable_path=self.chrome_path,
                                  chrome_options=options)
    elif browser == 'phantomjs':
        # PhantomJS branch.
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        if self.proxy is not None:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = self.proxy
            proxy.add_to_capabilities(desired_capabilities)
        # Skip image loading for speed.
        desired_capabilities[
            "phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(
            executable_path=self.phantomjs_path,
            desired_capabilities=desired_capabilities)
    else:
        print u'浏览器类型不存在'
        return None
    driver.get(login_url)
    # Replace any existing cookies with the supplied ones for *domain*.
    driver.delete_all_cookies()
    for cookie in cookies_data.items():
        driver.add_cookie({
            'domain': domain,
            'name': cookie[0],
            'value': cookie[1],
            'path': '/',
            'expires': None
        })
    return driver
def get_data(shop_id):
    """Open a fixed dianping.com shop page in proxied PhantomJS, scroll to the
    bottom, and print the comment-filter links.

    NOTE(review): *shop_id* is ignored — the URL is hard-coded to shop 6232395.
    """
    url = 'http://www.dianping.com/shop/6232395'
    # load PhantomJS
    driver = webdriver.PhantomJS()
    proxy_list = redis_conn1()
    if proxy_list:
        print proxy_list
        desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
        # Spoof a desktop Firefox UA.
        desired_capabilities[
            "phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0"
        # (A hard-coded cookie capability was kept commented out upstream.)
        # Re-open the session through a random proxy from the pool.
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = random.choice(proxy_list)
        proxy.add_to_capabilities(desired_capabilities)
        driver.start_session(desired_capabilities)
    # Install a session cookie for the dianping domain.
    driver.add_cookie({
        'path': '/',
        'name': 'JSESSIONID',
        'value': 'B6B683D3F115418041ED0AFCE0971147;',
        'domain': '.dianping.com'
    })
    # for key,value in cookies.items():
    #     driver.add_cookie({
    #         'name': key,
    #         'value': value,
    #         'path': '/',
    #         'domain': '.dianping.com'
    #     })
    # print driver.get_cookies()
    print driver.desired_capabilities
    driver.set_page_load_timeout(5)
    driver.get(url)
    print driver.get_cookies()
    # Scroll until the document height stops growing (lazy-loaded content).
    js1 = 'return document.body.scrollHeight'
    js2 = 'window.scrollTo(0, document.body.scrollHeight)'
    old_scroll_height = 0
    while (driver.execute_script(js1) > old_scroll_height):
        old_scroll_height = driver.execute_script(js1)
        driver.execute_script(js2)
        time.sleep(3)
    # Dump the page and the comment-filter link texts.
    print driver.page_source
    list1 = driver.find_elements_by_xpath(
        '//div[@class="comment-condition J-comment-condition Fix"]/div/span/a')
    for l in list1:
        print l.text
def main():
    """Demo: show the external IP direct, through a fetched proxy, then direct again.

    NOTE(review): proxy settings are added to the PHANTOMJS capabilities while
    the browser is Chrome — confirm the proxy actually applies.
    """
    # PhantomJS worked from the command line but not in PyCharm; Chrome is used instead.
    browser = webdriver.Chrome(
        r"/home/lxw/Software/chromedirver_selenium/chromedriver")

    def show_state():
        # Visit the IP-echo service and dump session id, body, and cookies.
        browser.get("http://ipecho.net/plain")
        print('session_id: ', browser.session_id)
        print('page_source: ', browser.page_source)
        print('cookie: ', browser.get_cookies())
        print("----" * 10, "\n")

    show_state()
    # Fetch one proxy address from the local proxy service.
    manual = webdriver.Proxy()
    manual.proxy_type = ProxyType.MANUAL
    # req = requests.get("http://datazhiyuan.com:60001/plain", timeout=10)
    req = requests.get("http://localhost:60001/plain", timeout=10)
    print("Get an IP proxy:", req.text)
    if req.text:
        manual.http_proxy = req.text  # e.g. '1.9.171.51:800'
        manual.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        show_state()
    # Restore the direct (system) connection.
    direct = webdriver.Proxy()
    direct.proxy_type = ProxyType.DIRECT
    direct.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    show_state()
def _create_driver(self):
    """Create a direct-connection PhantomJS driver with 13s timeouts, warmed up on the target host."""
    driver = new_driver(user_agent=USER_AGENT, js_re_ignore='/cdwsjb\/CaptchaImg.png/g')
    direct = webdriver.Proxy()
    direct.proxy_type = ProxyType.DIRECT
    direct.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    # Timeouts so driver.get() cannot hang forever without raising.
    driver.set_page_load_timeout(13)
    driver.set_script_timeout(13)
    # Visit a same-host URL so cookies can be attached for this domain afterwards.
    driver.get('https://gr.cdhrss.gov.cn:442/xxxx')
    return driver