def __init__(self):
    """Bootstrap the crawler: start logging, register this crawler's
    id/ip/ports in Redis, fetch a login account, start the local
    heartbeat thread, and launch a Chrome driver (Xvfb-backed on Linux).
    """
    # Timestamped log file; keep stdout quiet.
    log.start(logfile=time.strftime("log/%Y%m%d%H%M%S")+".log",logstdout=False)
    log.msg("initiating crawler...",level=log.INFO)
    self.crawler_id = self.get_crawler_id()
    log.msg("crawler id is %s" % self.crawler_id,level=log.INFO)
    # Advertise this crawler's endpoint in the shared Redis so peers can find it.
    self.r.set('crawler:ip:%s' % self.crawler_id,utils.get_external_ip())
    self.r.set('crawler:port:%s' % self.crawler_id,settings.REDIS_LOCAL_PORT)
    self.r.set('crawler:mapping_port:%s' % self.crawler_id,settings.REDIS_LOCAL_MAPPING_PORT)
    log.msg("crawler ip is %s, port is %d" % (utils.get_external_ip(),settings.REDIS_LOCAL_PORT),level=log.INFO)
    # Credentials for the site login; account is a (username, password) pair.
    account = self.get_account()
    self.username = account[0]
    self.password = account[1]
    log.msg("crawler account got",level=log.INFO)
    # Local (per-host) Redis keeps liveness state for the heartbeat loop.
    self.r_local.set('crawler:status:%s' % self.crawler_id, 'good')
    self.r_local.set('crawler:update_time:%s' % self.crawler_id, datetime.datetime.utcnow().strftime("%s"))
    log.msg("local crawler status set",level=log.INFO)
    heartbeat_thread = threading.Thread(target=self.maintain_local_heartbeat)
    heartbeat_thread.start()
    log.msg("local crawler heartbeat started",level=log.INFO)
    if platform.system() == "Linux":  # on linux, use virtual display
        vdisplay = Xvfb()
        vdisplay.start()
    co = ChromeOptions()
    # TODO: Disable image after log in
    # TODO: optimize memory usage
    co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1}})
    # co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1,"images":2,"media":2}})
    self.driver = webdriver.Chrome(chrome_options=co)
    self.driver.set_window_size(640,960)
def get_chrome(additional_options: ChromeOptions = None) -> webdriver.Chrome:
    """Create a Chrome driver preconfigured to download into the project
    download directory.

    :param additional_options: extra ChromeOptions to launch Chrome with;
        when omitted, the download-prefs options object is reused so the
        prefs still apply.
    :return: a ready ``webdriver.Chrome`` instance.
    """
    options = ChromeOptions()
    # FIX: use identity check `is None` rather than `== None` — the
    # correct idiom, and robust if ChromeOptions ever defines __eq__.
    if additional_options is None:
        additional_options = options
    download_option = {
        'download.default_directory': get_download_file_path(),
        'download.directory_upgrade': 'true',
        'download.extensions_to_open': '',
    }
    options.add_experimental_option('prefs', download_option)
    return webdriver.Chrome(get_chrome_exe_path(),
                            desired_capabilities=options.to_capabilities(),
                            chrome_options=additional_options)
def get_cookie(username, password, proxy):
    """Launch a (optionally proxied) Chrome, sign in, and return the
    resulting cookies; the browser is always shut down afterwards."""
    opts = ChromeOptions()
    if proxy:
        opts.add_argument('--proxy-server=%s' % proxy)
    browser = Chrome(chrome_options=opts)
    try:
        return signin(browser, username, password)
    finally:
        browser.quit()
def setup_for_test(self, test):
    """Build Chrome capabilities for *test* (flash plugin allowed,
    password manager off, optional proxy) and store them on
    ``self.capabilities``."""
    opts = ChromeOptions()
    for flag in ("test-type", "disable-infobars"):
        opts.add_argument(flag)
    prefs = {
        'credentials_enable_service': False,
        'profile.password_manager_enabled': False,
        'profile.default_content_setting_values.plugins': 1,
        'profile.content_settings.plugin_whitelist.adobe-flash-player': 1,
        'profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player': 1
    }
    opts.add_experimental_option('prefs', prefs)
    if test.use_proxy:
        opts.add_argument("--proxy-server={0}".format(test.proxy_address))
    self.capabilities = opts.to_capabilities()
    logger.debug("Chrome capabilities: {}".format(self.capabilities))
def setUpClass(cls):
    """Start the shared test browser: Firefox on Windows (Chrome hangs
    there), headless Chrome elsewhere, with browser-console logging on."""
    super(SeleniumTestCase, cls).setUpClass()
    print('Initializing browser engine...')
    if sys.platform == 'win32':  # Chrome hangs up on Windows
        caps = DesiredCapabilities.FIREFOX
        caps['loggingPrefs'] = {'browser': 'ALL'}
        cls.browser = Firefox(capabilities=caps)
    else:
        caps = DesiredCapabilities.CHROME
        caps['loggingPrefs'] = {'browser': 'ALL'}
        opts = ChromeOptions()
        for arg in ('headless', 'disable-gpu'):
            opts.add_argument(arg)
        cls.browser = Chrome(chrome_options=opts, desired_capabilities=caps)
    print('Browser engine initialized.')
def selenium_browser():
    # type: () -> Chrome
    """Return a headless, sandbox-less Chrome sized 1920x1080."""
    opts = ChromeOptions()
    for flag in ("headless", "no-sandbox", "window-size=1920,1080"):
        opts.add_argument(flag)
    return Chrome(options=opts)
def setUp(self, browser):
    """Create a Firefox or Chrome driver depending on *browser*, then
    open the Home page object on it."""
    self.browser = browser
    if "firefox" in self.browser:
        profile = FirefoxProfile()
        # Former preference tweaks, kept for reference:
        # profile.set_preference("plugin.state.silverlight", 2)
        # profile.set_preference("browser.download.folderList", 1)
        # profile.set_preference("pdfjs.disabled", False);
        # profile.set_preference("pdfjs.firstRun", True);
        self.driver = Firefox(profile)  # fresh firefox session
    if "chrome" in self.browser:
        chromedriver = "/usr/local/bin/chromedriver"
        opts = ChromeOptions()
        opts.add_experimental_option('excludeSwitches',
                                     ['disable-component-update'])
        opts.add_argument("--user-data-dir=./browser_resources/chrome_data_dir/")
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = Chrome(executable_path=chromedriver, chrome_options=opts)
    self.home_page = home.Home(self.driver)
def login(account, passwd, url):
    """Log in to Sina Weibo with a headless Chrome and return the session
    cookies on success.

    :param account: login name typed into the username field.
    :param passwd: password typed into the password field.
    :param url: login page URL to open.
    :return: list of cookie dicts from ``driver.get_cookies()`` when the
        page source passes ``is_login``; returns None implicitly otherwise.
    """
    # If chromedriver is not on PATH, its location must be given explicitly.
    # Verified on 2017-04-11. Logs straight into Sina Weibo.
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome('/root/qk_python/python/data/collect/weibo_spider/priv/chromedriver', chrome_options=chrome_options)
    driver.maximize_window()
    driver.set_page_load_timeout(30)
    driver.set_window_size(1124, 850)
    # locator = (By.)
    driver.get(url)
    print('开始登陆')
    # Fill in the username.
    name_field = driver.find_element_by_id('loginname')
    name_field.clear()
    name_field.send_keys(account)
    # Fill in the password.
    password_field = driver.find_element_by_class_name('password').find_element_by_name('password')
    password_field.clear()
    password_field.send_keys(passwd)
    # Double-click the submit button (single click was presumably unreliable
    # — TODO confirm), then wait for the logged-in page container.
    submit = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span')
    ActionChains(driver).double_click(submit).perform()
    time.sleep(5)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'WB_miniblog')))
    source = driver.page_source
    if is_login(source):
        print('登录成功')
        sina_cookies = driver.get_cookies()
        driver.quit()
        return sina_cookies
def setup_browser():
    """Instantiate ``world.browser`` according to the configured engine
    flags and record the default path / wait settings on ``world``."""
    if use_firefox:
        world.browser = MyFirefox()
        world.browser.set_window_size(450, 1200)
        world.browser.set_window_position(0, 0)
        # world.browser.maximize_window()
    elif use_phantomjs:
        world.browser = MyPhantomJS()
    elif use_headless_chrome:
        opts = ChromeOptions()
        opts.add_argument("--window-size=1005,9999")
        opts.add_argument("--headless")
        world.browser = MyChrome(
            executable_path=os.path.join('..', '..', 'chromedriver'),
            chrome_options=opts)
    else:
        opts = ChromeOptions()
        opts.add_argument("--start-maximized")
        world.browser = MyChrome(
            executable_path=os.path.join('..', '..', 'chromedriver'),
            chrome_options=opts)
    world.da_path = default_path
    world.wait_seconds = default_wait_seconds
def create_download_dir_capabilities_for_chrome(path_to_download, **extensions_files):
    """
    Example use
    | ${capabilities} | create_download_dir_capabilities_for_chrome | Artifacts |
    | Open Browser Extension | https://support.spatialkey.com/spatialkey-sample-csv-data/ | gc | desired_capabilities=${capabilities} |
    | Click Element | //a[contains(@href,'sample.csv.zip')] |
    """
    path_to_download_check = validate_create_artifacts_dir(path_to_download)
    chrome_options = ChromeOptions()
    prefs = {"download.default_directory": path_to_download_check,
             "directory_upgrade": "true"}
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument("--disable-web-security")
    # BUG FIX: iterating a **kwargs dict yields its KEYS (the keyword
    # argument names), so add_extension() received the argument name
    # instead of the extension file path. Iterate the values instead.
    for single_extension in extensions_files.values():
        chrome_options.add_extension(single_extension)
    logger.info("Chrome Capabilities set download dir '" + path_to_download_check + "'")
    return chrome_options.to_capabilities()
def set_chrome_options(self):
    """Build a selenium ChromeOptions object from the global ``T`` config.

    Boolean flags come from T['true_opts'] / T['false_opts'] (passed as
    ``name=1`` / ``name=0``); valued flags from a fixed whitelist of keys.

    :return: configured ChromeOptions instance.
    """
    from selenium.webdriver import ChromeOptions
    opts = ChromeOptions()
    ### Add Boolean Arguments
    # BUG FIX: dict.has_key() was removed in Python 3; membership via
    # `in` is equivalent and works on Python 2 as well.
    if 'true_opts' in T:
        for it in T['true_opts']:
            opts.add_argument('%s=1' % it)
    if 'false_opts' in T:
        for it in T['false_opts']:
            opts.add_argument('%s=0' % it)
    value_opts = [
        'profile-directory',
        'log-level',             # 0 to 3: INFO = 0, WARNING = 1, LOG_ERROR = 2, LOG_FATAL = 3
        'net-log-capture-mode',  # "Default" "IncludeCookiesAndCredentials" "IncludeSocketBytes"
        'register-font-files',   # might be windows only
        'remote-debugging-port',
        'user-agent',
        'user-data-dir',         # don't use b/c it negates no-extension options
    ]
    ### Add Value Arguments
    for it in value_opts:
        if it in T:
            opts.add_argument('%s=%s' % (it, T[it]))
    ### OTHER CHROME OPTIONS NOT YET FULLY CONFIGURED
    # -extensions list str
    # -localState dict
    # -prefs dict                      # set_profile()
    # -detach bool
    # -debuggerAddress str
    # -excludeSwitches list str
    # -minidumpPath str
    # -mobileEmulation dict
    # -perfLoggingPrefs OBJECT (dict)  # set_performance_logging()
    return opts
def set_spider_option(self, use_proxy: bool = False) -> Chrome:
    """Build and return a configured Chrome driver for the spider.

    :param use_proxy: use a proxy <disabled in the current version>:
        some providers block domestic IPs.
    :return: a Chrome instance; in "anti" mode images/JS are disabled and
        page-load strategy is 'none' for faster page switching.
    """
    options = ChromeOptions()
    # Run with highest privileges (required in containers).
    options.add_argument('--no-sandbox')
    # Incognito mode.
    options.add_argument('-incognito')
    # Load without cache.
    # NOTE(review): '--disk-cache-' looks truncated — possibly meant
    # '--disk-cache-size=0'; confirm against the original source.
    options.add_argument('--disk-cache-')
    # Chinese locale.
    options.add_argument('lang=zh_CN.UTF-8')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Rotate the user-agent header.
    options.add_argument(f'user-agent={get_header()}')
    if use_proxy:
        proxy_ip = get_proxy(True)
        if proxy_ip:
            options.add_argument(f'proxy-server={proxy_ip}')
    # Silent (headless) start.
    if self.silence is True:
        options.add_argument('--headless')

    # No-anti-crawler mode: high-performance start — disable image loading
    # and JS rendering to speed up selenium page switching.
    def load_anti_module():
        chrome_pref = {
            "profile.default_content_settings": {
                "Images": 2,
                'javascript': 2
            },
            "profile.managed_default_content_settings": {
                "Images": 2
            }
        }
        options.experimental_options['prefs'] = chrome_pref
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        d_c = DesiredCapabilities.CHROME
        # Don't wait for full page load.
        d_c['pageLoadStrategy'] = 'none'
        return Chrome(options=options,
                      executable_path=CHROMEDRIVER_PATH,
                      desired_capabilities=d_c)

    if self.anti is False:
        return load_anti_module()
    else:
        # Site has anti-crawler measures (default): normal-mode start.
        return Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
from selenium.webdriver import Chrome, ChromeOptions
import time

EMAIL_ID = "<email id here>"


def slow_typing(element, text):
    """Send *text* one character at a time to mimic human typing."""
    for character in text:
        element.send_keys(character)
        time.sleep(0.3)


# Visit chrome://version/ and copy profile path in place of '<chrome user profile>'
# BUG FIX: add_argument() returns None, so chaining it off the constructor
# (`options = ChromeOptions().add_argument(...)`) left `options` as None and
# Chrome received no options. Build the object first, then configure it.
options = ChromeOptions()
options.add_argument("--user-data-dir=<chrome user profile>")
browser = Chrome(chrome_options=options)
browser.get('https://www.browserstack.com')
time.sleep(2)
# to accept cookie notification so that it doesn't interfare
cookie_cta = browser.find_element_by_id('accept-cookie-notification')
cookie_cta.click()
# Navigate to Signup Page
button = browser.find_element_by_id('signupModalButton')
button.click()
time.sleep(2)
# Fill user's full name
def create(**kwargs: Any):
    """Build a selenium driver honouring the module headless flag and
    register it in ``drivers`` for later teardown."""
    opts = ChromeOptions()
    opts.headless = driver_is_headless
    new_driver = create_simple_selenium_web_driver(driver_options=opts, **kwargs)
    drivers.append(new_driver)
    return new_driver
# Connect to MongoDB and load every 'hotspot' document that needs refreshing.
client = pymongo.MongoClient(database_ip, database_port)
db = client[database_name]
update = UpdateCrawler()
new_data = []
data2update = list(db['hotspot'].find())
# Pool of user-agent arguments; one is picked at random per run.
header_list = [
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"',
    'user-agent="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"',
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"'
]
# Settings for the Toutiao section: no images/stylesheets, hide automation.
option = ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    'permissions.default.stylesheet': 2
}
option.add_experimental_option("prefs", prefs)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
header = random.choice(header_list)
option.add_argument(header)
option.add_argument('--headless')
option.add_argument('--disable-gpu')  # headless browser
driver = Chrome(options=option)
num = 0
# Async loop used to fan out the per-document update tasks.
loop = asyncio.get_event_loop()
tasks = []
def __init__(self):
    """Headless-Chrome driver variant: forwards preconfigured
    ChromeOptions to the parent driver constructor.

    Uses ``add_argument('--headless')`` instead of the deprecated (and
    removed in Selenium 4) ``ChromeOptions.set_headless()`` helper;
    the resulting option set is the same.
    """
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    super().__init__(chrome_options=chrome_options)
def create_driver():
    """Create a headless Chrome driver with the prefs loaded from the
    JSON file at ``chrome_prefs_path`` and automation banners disabled.

    :return: a configured ``Chrome`` instance.
    """
    # FIX: use a context manager so the prefs file is closed
    # deterministically (it was previously left open).
    with open(chrome_prefs_path, 'r') as chrome_pref_file:
        prefs = json.load(chrome_pref_file)
    options = ChromeOptions()
    options.headless = True
    options.add_experimental_option('prefs', prefs)
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    # Hide the "controlled by automated software" banner / extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    return Chrome(options=options, executable_path=chrome_driver_path)
class CrackTouClick():
    """Automate cnblogs sign-in and solve its Geetest captcha (slider or
    word-click variant) using the Chaojiying recognition service."""

    def __init__(self):
        self.url = 'https://passport.cnblogs.com/user/signin'
        self.option = ChromeOptions()
        # Hide the "Chrome is being controlled" automation banner.
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.browser = Chrome(options=self.option)
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD,
                                     CHAOJIYING_SOFT_ID)

    def open(self):
        """Open the page, enter username/password, click the verify
        button, then solve whichever captcha variant appears."""
        self.browser.get(self.url)
        email = self.wait.until(
            EC.presence_of_element_located(
                (By.ID, 'input1'))).send_keys(self.email)
        password = self.wait.until(
            EC.presence_of_element_located(
                (By.ID, 'input2'))).send_keys(self.password)
        button = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'button'))).click()
        button2 = self.wait.until(
            EC.element_to_be_clickable(
                (By.CLASS_NAME, 'geetest_btn'))).click()
        try:
            # Presence of a slider button distinguishes the slider captcha
            # from the word-click captcha.
            slider = self.wait.until(
                EC.element_to_be_clickable(
                    (By.CLASS_NAME, 'geetest_slider_button')))
            CHAOJIYING_KIND = 9202  # captcha type: slider
            print('这是滑块验证码')
            image = self.get_touclick_image()  # grab the captcha image
            # BUG FIX: `bytes_array` was never created in this branch, so
            # the post_pic() call below raised NameError (and the flow fell
            # through to the word-captcha handler). Serialize the image to
            # PNG bytes exactly as the other branch does.
            bytes_array = BytesIO()
            image.save(bytes_array, format='PNG')
            # Submit the image for recognition.
            result = self.chaojiying.post_pic(bytes_array.getvalue(),
                                              CHAOJIYING_KIND)
            print('验证码位置', result['pic_str'])
            print(result)
            locations = self.get_points(result)
            self.touch_click_words(locations)
            print('正在检测错误,此处延迟3秒,以便等待页面加载')
            time.sleep(3)
            self.img_error(result)
        except Exception as e:
            print('这是字体验证码', e)
            CHAOJIYING_KIND = 9103  # captcha type: word click
            # Grab the captcha image.
            image = self.get_touclick_image()
            bytes_array = BytesIO()
            image.save(bytes_array, format='PNG')  # keep a PNG copy for reference
            # Submit the image for recognition.
            result = self.chaojiying.post_pic(bytes_array.getvalue(),
                                              CHAOJIYING_KIND)
            print(result)
            locations = self.get_points(result)
            self.touch_click_words2(locations)
            print('正在检测错误,此处延迟3秒,以便等待页面加载')
            time.sleep(3)
            self.img_error(result)

    def touch_click_verify(self):
        """Return the slider button element.

        :return: None
        """
        slider = self.wait.until(
            EC.element_to_be_clickable(
                (By.CLASS_NAME, 'geetest_slider_button')))
        return slider

    def get_touclick_element(self):
        """Return the captcha image element.

        :return: image element
        """
        element = self.wait.until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'geetest_canvas_slice')))
        return element

    def get_position(self):
        """Return the captcha's position on the page.

        :return: (top, bottom, left, right) tuple
        """
        element = self.get_touclick_element()
        time.sleep(2)
        location = element.location
        size = element.size
        top, bottom, left, right = location['y'], location['y'] + size[
            'height'], location['x'], location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """Return a PIL screenshot of the whole page.

        :return: screenshot image object
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_touclick_image(self, name='captcha.png'):
        """Crop the captcha out of a page screenshot and save it.

        :return: captcha image object
        """
        top, bottom, left, right = self.get_position()
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def get_points(self, captcha_result):
        """Parse the recognizer's result string into point coordinates.

        :param captcha_result: recognition result
        :return: list of [x, y] points
        """
        groups = captcha_result.get('pic_str').split('|')
        locations = [[int(number) for number in group.split(',')]
                     for group in groups]
        return locations

    def touch_click_words(self, locations):
        """Drag the slider by each recognized offset (slider captcha).

        :param locations: click positions
        :return: None
        """
        for location in locations:
            print(location)
            ActionChains(self.browser).drag_and_drop_by_offset(
                self.touch_click_verify(), location[0], location[1]).perform()
            time.sleep(1)

    def touch_click_words2(self, locations):
        """Click each recognized point on the image (word-click captcha).

        :param locations: click positions
        :return: None
        """
        for location in locations:
            print(location)
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0],
                location[1]).click().perform()
            time.sleep(1)

    def img_error(self, result):
        """Detect whether login succeeded by parsing the page; on failure
        report the bad captcha id to Chaojiying and retry from open().
        (Checked via page data because direct detection kept raising; the
        slider-captcha coordinate recognition rate is very low.)"""
        test = etree.HTML(self.browser.page_source)
        title = test.xpath('//*[@id="app_ing"]/text()')
        print('爬取登陆前后的数据变化', title)
        if title == []:
            img_id = result['pic_id']
            self.chaojiying.report_error(img_id)
            print('登录失败,已发送错误验证码')
            self.open()
        else:
            print('登录成功')
def set_spider_option(self, header=None) -> Chrome: """ :param header: :return: """ # 实例化Chrome可选参数 options = ChromeOptions() # 最高权限运行 options.add_argument('--no-sandbox') # 隐身模式 options.add_argument('-incognito') # 无缓存加载 options.add_argument('--disk-cache-') # 设置中文 options.add_argument('lang=zh_CN.UTF-8') # 禁用 DevTools listening options.add_experimental_option('excludeSwitches', ['enable-logging']) options.add_argument('--log-level=3') # 更换头部 if header: options.add_argument(f"user-agent={header}") else: options.add_argument(f'user-agent={get_header()}') # 静默启动 if self.silence is True: options.add_argument('--headless') options.add_argument('--disable-gpu') options.add_argument("--disable-software-rasterizer") # 抑制自动化控制特征 options.add_argument('--disable-blink-features=AutomationControlled') options.add_experimental_option('useAutomationExtension', False) options.add_experimental_option('excludeSwitches', ['enable-automation']) # 加速模式,增加Selenium渲染效率 if self.assault: chrome_pref = {"profile.default_content_settings": {"Images": 2, 'javascript': 2}, "profile.managed_default_content_settings": {"Images": 2}} options.experimental_options['prefs'] = chrome_pref d_c = DesiredCapabilities.CHROME d_c['pageLoadStrategy'] = 'none' _api = Chrome( options=options, executable_path=CHROMEDRIVER_PATH, desired_capabilities=d_c ) else: _api = Chrome(options=options, executable_path=CHROMEDRIVER_PATH) # 进一步消除操作指令头,增加隐蔽性 _api.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { "source": """ Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) """ }) return _api
def set_driver(isHeadless=False, isManager=False, isSecret=False,
               isExtension=False, extension_path='', profile_path=''):
    """Create a Chrome driver with a randomized user-agent and a large
    set of hardening/compat flags.

    :param isHeadless: run headless (adds --headless/--single-process).
    :param isManager: obtain chromedriver via ChromeDriverManager instead
        of a local executable.
    :param isSecret: incognito mode (profile settings cannot be used).
    :param isExtension: load the extension at *extension_path*.
    :param extension_path: path to a packed extension.
    :param profile_path: Chrome user-data directory for profile reuse.
    :return: a Chrome instance, or None when driver creation fails.
    """
    options = ChromeOptions()
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    ]
    # NOTE(review): if os.name is neither 'nt' nor 'posix', driver_path is
    # never bound and the non-manager branch below raises NameError.
    if os.name == 'nt':  # Windows
        driver_path = 'chromedriver.exe'
    elif os.name == 'posix':  # Mac
        driver_path = 'chromedriver'
    if isHeadless:
        options.add_argument('--headless')
        options.add_argument('--single-process')
    # NOTE(review): source was collapsed to one line — this else is assumed
    # to pair with `if isExtension` (extensions disabled when none is
    # requested); confirm against the original file.
    if isExtension:
        if extension_path:
            options.add_extension(extension_path)
    else:
        options.add_argument('--disable-extensions')
    if isSecret:
        options.add_argument('--incognito')  # enable incognito mode
    else:
        # Using a profile lets a manual first-run login / extension install
        # be reused on later runs.
        # Profile settings cannot be used in incognito mode.
        # Headless mode cannot use profile settings or Chrome extensions.
        # If enabling an extension yields the error
        #   "failed to wait for extension background page to load"
        # that extension is unusable; in that case add it manually under the
        # profile and run headless with extensions enabled.
        if (not isHeadless) or (not isExtension):
            options.add_argument('--user-data-dir=' + profile_path)
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('log-level=3')
    options.add_argument('--ignore-ssl-errors')
    # Random user-agent from the pool above.
    options.add_argument(f'--user-agent={user_agent[random.randrange(0, len(user_agent), 1)]}')
    options.add_argument('--start-maximized')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--allow-running-insecure-content')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-desktop-notifications')
    options.add_argument('--disable-application-cache')
    options.add_argument('--lang=ja')
    if isManager:
        # Automatic driver download via webdriver-manager.
        try:
            driver = Chrome(ChromeDriverManager().install(), options=options)
        except InvalidArgumentException as err:
            logger.error(err)
            logger.error('既存のブラウザを閉じで実行してください。')
            return None
        except Exception as err:
            # NOTE(review): this branch only logs — `driver` stays unbound
            # and the final return raises NameError; confirm intent.
            logger.error(err)
    else:
        # Manually provided driver executable in the working directory.
        try:
            path = os.getcwd() + '/' + driver_path
            driver = Chrome(executable_path=path, options=options)
        except InvalidArgumentException as err:
            logger.error(err)
            logger.error('既存のブラウザを閉じで実行してください。')
            return None
        except WebDriverException as err:
            logger.error(err)
            logger.error('Chromeと同じバージョンのChrome Driverをダウンロードしてください。')
            return None
    return driver
# NOTE(review): this chunk begins mid-function — the enclosing `def` and
# the openings of the try/for blocks are not visible here, so the
# indentation below is reconstructed and must be checked against the
# full original file.
            # Join co-author names into one comma-separated string.
            paperinfo['corppersons'] = ''
            for j in paperco:
                if j != paperco[-1]:
                    j_text = j.text + ', '
                else:
                    j_text = j.text
                paperinfo['corppersons'] += j_text
            print(paperinfo)
            paperinfolist.append(paperinfo)
            mainplist.append('')
    except:
        pass
    return [
        info_field, info_cited, info_achi, info_h, info_g, '', '', '', per_json,
        namelist, paperinfolist, mainplist, ''
    ]


if __name__ == '__main__':
    # Use ChromeOptions.
    option = ChromeOptions()  # create a configuration instance
    option.add_argument('--headless')  # headless mode, runs in background
    # Create the browser.
    # NOTE(review): `option` is built but not passed to Chrome() — the
    # commented-out `options=option` suggests headless mode was
    # intentionally (or accidentally) left disabled; confirm.
    browser = Chrome()  # (options=option)
    Url = 'https://www.researchgate.net/profile/Jodie_Abbatangelo-Gray2'
    print('开始爬取')
    info = spid(Url, browser)
    print('爬取完毕:\ninfo:\n', info)
    # browser.quit()
def get_browser(headless: bool = True, browser_class: int = 1) -> Firefox:
    """Return a browser instance.

    :param headless: run the browser headless when True.
    :param browser_class: browser kind — 0 is Chrome, 1 is Firefox; Chrome
        cannot be used on the server side.
    :return: a Firefox or Chrome driver; raises (after emailing an alert)
        when the browser fails to open.
    """
    """
    firefox的headless浏览器
    因为headless的浏览器的语言跟随操作系统,为了保证爬回来的数据是正确的语言,
    这里必须设置浏览器的初始化参数,
    注意,使用headless必须先安装对应浏览器正常的版本,然后再安装headless版本
    比如火狐的headless
    下载火狐的geckodriver驱动。(当前文件夹下已经有一个了)地址是:
    https://github.com/mozilla/geckodriver/releases
    下载后解压是一个geckodriver 文件。拷贝到/usr/local/bin目录下,然后加上可执行的权限
    sudo chmod +x /usr/local/bin/geckodriver
    chrome的headless浏览器
    https://chromedriver.storage.googleapis.com/index.html?path=2.35/
    你也可以自行搜索chromedriver的下载地址,解压是个可执行文件,放到chrome的目录即可.
    一般ubuntu下面,chrome的目录是/opt/google/chrome/
    据说使用root权限运行的话,chrome的headless浏览器会报异常.而firefox的headless浏览器不会!
    """
    if browser_class == 1:
        # Headless language follows the OS; force zh-cn so scraped data
        # comes back in the expected language.
        profile = FirefoxProfile()
        profile.set_preference("intl.accept_languages", "zh-cn")
        options = FirefoxOptions()
        options.add_argument("--headless")
        if headless:
            try:
                browser = Firefox(firefox_profile=profile,
                                  executable_path=firefox_driver,
                                  firefox_options=options)
            except Exception as e:
                # Email an alert before propagating the failure.
                title = "{} Firefox headless浏览器打开失败".format(
                    datetime.datetime.now())
                content = "错误原因是:{}".format(e)
                send_mail(title=title, content=content)
                logger.exception(e)
                raise e
        else:
            try:
                browser = Firefox(
                    firefox_profile=profile,
                    executable_path=firefox_driver,
                )
            except Exception as e:
                title = "{} Firefox headless浏览器打开失败".format(
                    datetime.datetime.now())
                content = "错误原因是:{}".format(e)
                send_mail(title=title, content=content)
                logger.exception(e)
                raise e
    else:
        options = ChromeOptions()
        options.add_experimental_option("excludeSwitches",
                                        ["ignore-certificate-errors"])
        if headless:
            options.add_argument("--headless")
            try:
                browser = Chrome(executable_path=chrome_driver,
                                 chrome_options=options)
            except Exception as e:
                title = "{} Chrome headless浏览器打开失败".format(
                    datetime.datetime.now())
                content = "错误原因是:{}".format(e)
                send_mail(title=title, content=content)
                logger.exception(e)
                raise e
        else:
            try:
                browser = Chrome(executable_path=chrome_driver,
                                 chrome_options=options)
            except Exception as e:
                title = "{} Chrome headless浏览器打开失败".format(
                    datetime.datetime.now())
                content = "错误原因是:{}".format(e)
                send_mail(title=title, content=content)  # custom helper of mine
                logger.exception(e)
                raise e
    return browser
class Profiler:
    """Scrape TokenSniffer / Poocoin / BSCScan with a shared headless
    Chrome to build a risk profile of a BSC token (liquidity, holders,
    transactions, scam flags)."""

    def __init__(self):
        self.chrome_options = ChromeOptions()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--window-size=1920x1080")
        self.chrome_options.add_argument("--log-level=3")
        # Use the chrome driver in the same directory as this file, regardless
        # of what the current working directory is.
        filepath = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
        self.chrome_driver = filepath + "/chromedriver_win.exe"
        self.driver = webdriver.Chrome(options=self.chrome_options, executable_path=self.chrome_driver)

    """Dispose of the driver window correctly when code exits"""
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.driver.close()

    def query_token_sniffer(self, address):
        """Return 'SCAM', '404' or 'OKAY' based on the TokenSniffer page."""
        #TODO: Refactor this to a simple HTTP request
        url = "https://tokensniffer.com/token/" + address
        self.driver.get(url)
        sleep(1)
        if "WARNING" in self.driver.page_source:
            return "SCAM"
        if "This page could not be found" in self.driver.page_source:
            return "404"
        return "OKAY"

    def query_poocoin(self, address):
        """Scrape Poocoin for LP addresses, BNB holdings, market cap and
        whether any Sell transaction exists; returns a dict (zeroed on
        page-load timeout)."""
        #Direct driver to Poocoin URL
        url = 'https://poocoin.app/tokens/' + address
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 10
        try:
            myElem = WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'px-3')))
        except TimeoutException:
            print("Loading took too much time!")
            return {
                "sell_exists": False,
                "v1_lp_address": "",
                "v2_lp_address": "",
                "v1_bnb_holdings": 0,
                "v2_bnb_holdings": 0,
                "market_cap": "$0"
            }
        sleep(1)
        #Get links to BSCScan for Liquidity Providers
        v1_lp_address = self.driver.find_element_by_xpath(
            "//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small/a[2]"
        ).get_attribute('href')
        v2_lp_address = self.driver.find_element_by_xpath(
            "//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small/a[4]"
        ).get_attribute('href')
        #Disgustingly parse out the BNB holdings in V1 and V2 LPs
        bnb_lp_values = self.driver.find_element_by_xpath("//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small").text
        values = bnb_lp_values.split('BNB')
        v1_bnb = float(re.sub("[^0-9.]", "", values[0].replace("V1","")))
        v2_bnb = float(re.sub("[^0-9.]", "", values[1].replace("V2","").split(":")[1]))
        market_cap = self.driver.find_element_by_xpath("//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/span[1]").text
        #Determine if any Sell transactions have taken place
        try:
            tx_table = self.driver.find_element_by_xpath(
                "//*[@id='root']/div/div[1]/div[2]/div/div[2]/div[2]/div/div[3]/div[1]/div/div[2]")
            sell_txs = bool(tx_table.text.count("Sell"))
        except:
            sell_txs = False
        return {
            "sell_exists": sell_txs,
            "v1_lp_address": v1_lp_address,
            "v2_lp_address": v2_lp_address,
            "v1_bnb_holdings": v1_bnb,
            "v2_bnb_holdings": v2_bnb,
            "market_cap": market_cap
        }

    def query_bscscan_token(self, address):
        """Scrape BSCScan for transaction count, holder count, token age
        (first tx) and the last page of the tx table as a DataFrame."""
        # Direct driver to given token URL
        url = 'https://bscscan.com/token/' + address
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 25
        try:
            myElem = WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.ID, 'totaltxns')))
        except TimeoutException:
            print("FAIL - Loading took too much time!")
            return {
                "num_transactions": 0,
                "num_holders" : 0,
                "age" : datetime.now(),
                "tx_df": pd.DataFrame(),
            }
        sleep(0.5)
        # Extract total number of transactions
        transactions = self.driver.find_element_by_id("totaltxns").text
        num_transactions = int(re.sub("[^0-9]", "", transactions))
        # Extract number of token holders
        holders = self.driver.find_element_by_class_name("mr-3").text
        num_holders = int(re.sub("[^0-9]", "", holders))
        # Focus TX table
        WebDriverWait(self.driver, 15).until(EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, "//*[@id='tokentxnsiframe']")))
        age_col = self.driver.find_element_by_xpath("//*[@id='lnkTokenTxnsAgeDateTime']").text
        if age_col == "Age":
            # Switch DateTime format
            age_elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH,
                "//*[@id='lnkTokenTxnsAgeDateTime']"))).click()
        # Select Last Page of TXs (may not exist if 1 page only)
        try:
            WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH,
                "//*[@id='maindiv']/div[1]/nav/ul/li[5]/a/span[1]"))).click()
        except:
            pass
        # Parse raw HTML with BeautifulSoup
        soup = BeautifulSoup(self.driver.page_source, features="html.parser")
        # Scrape HTML table
        table_data = soup.find(
            "table", {"class": "table table-md-text-normal table-hover mb-4"})
        tx_df = pd.read_html(str(table_data))[0]
        tx_df.dropna(axis=1, how='all', inplace=True)
        #Get Age of token (first tx datetime)
        # print(df)
        tx_df["Date Time (UTC)"] = pd.to_datetime(tx_df["Date Time (UTC)"])
        earliest_tx = tx_df["Date Time (UTC)"].min()
        #TODO: Hunt for Whales
        #Switch to Holders table tab
        # self.driver.get(url+"#balances")
        # WebDriverWait(self.driver, 25).until(EC.frame_to_be_available_and_switch_to_it(
        #     (By.XPATH, "//*[@id='tokeholdersiframe']")))
        # WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH,"//*[@id='ContentPlaceHolder1_tabHolders']"))).click()
        # Focus Holders table
        # WebDriverWait(self.driver, 15).until(EC.frame_to_be_available_and_switch_to_it(
        #     (By.XPATH, "//*[@id='tokeholdersiframe']")))
        # sleep(5)
        # #Holy shit this is gross
        # #Find contract icon by <i> -> <span> -> <td> -> <tr> -> <td>rowKey</td>
        # icons = self.driver.find_elements_by_class_name("fa-file-alt")
        # icons = [i.find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
        #          .get_attribute('innerHTML')[:10] for i in icons]
        # #Then parse out the <td></td> HTML tags to get the row number
        # contract_rows = [int(re.sub("[^0-9]", "", i))-1 for i in icons]
        # # Parse raw HTML with BeautifulSoup
        # soup = BeautifulSoup(self.driver.page_source, features="html.parser")
        # # Scrape HTML table
        # table_data = soup.find(
        #     "table", {"class": "table table-md-text-normal table-hover"})
        # holders_df = pd.read_html(str(table_data))[0]
        # holders_df.dropna(axis=1, how='all', inplace=True)
        # # Boolean for IsContractAddress, indicated by the icon on BSCscan
        # holders_df["is_contract_address"] = False
        # holders_df.loc[contract_rows, "is_contract_address"] = True
        # "holders_df": holders_df
        return {
            "num_transactions": num_transactions,
            "num_holders" : num_holders,
            "age" : earliest_tx,
            "tx_df": tx_df,
        }

    def query_bscscan_liquidity_providers(self, url):
        """Scrape a BSCScan LP-holders page into a DataFrame annotated
        with is_contract_address and num_lp_holders columns."""
        # Direct driver to given LP URL
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 25
        try:
            myElem = WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'mr-3')))
        except TimeoutException:
            # NOTE(review): returns the pd.DataFrame CLASS (missing call
            # parens); callers checking `.empty` happen to get a truthy
            # property object — confirm whether `pd.DataFrame()` was meant.
            return pd.DataFrame
        sleep(1)
        # Extract number of token holders
        holders = self.driver.find_element_by_class_name("mr-3").text
        num_lp_holders = int(re.sub("[^0-9]", "", holders))
        # Focus holders table
        WebDriverWait(self.driver, 15).until(EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, "//*[@id='tokeholdersiframe']")))
        sleep(1)
        #Holy shit this is gross
        #Find contract icon by <i> -> <span> -> <td> -> <tr> -> <td>rowKey</td>
        icons = self.driver.find_elements_by_class_name("fa-file-alt")
        icons = [i.find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                 .get_attribute('innerHTML')[:10] for i in icons]
        #Then parse out the <td></td> HTML tags to get the row number
        contract_rows = [int(re.sub("[^0-9]", "", i))-1 for i in icons]
        # Parse raw HTML with BeautifulSoup
        soup = BeautifulSoup(self.driver.page_source, features="html.parser")
        # Scrape HTML table
        table_data = soup.find(
            "table", {"class": "table table-md-text-normal table-hover"})
        df = pd.read_html(str(table_data))[0]
        df.dropna(axis=1, how='all', inplace=True)
        # Boolean for IsContractAddress, indicated by the icon on BSCscan
        df["is_contract_address"] = False
        df.loc[contract_rows, "is_contract_address"] = True
        #Stick on the number of holders lol
        df["num_lp_holders"] = num_lp_holders
        return df

    def profile_token(self, address):
        """Combine all the per-site queries into one profile dict for the
        token at *address* (early-exit when both LPs hold < 1 BNB)."""
        #Start by querying Poocoin to get BSCScan LP links
        poocoin_stats = self.query_poocoin(address)
        # TODO: Exit early if no liquidity
        if poocoin_stats["v1_bnb_holdings"] < 1 and poocoin_stats["v2_bnb_holdings"] < 1:
            poocoin_stats["locked_liquidity"] = 0
            poocoin_stats["tx_df"] = pd.DataFrame()
            poocoin_stats["stats"] = { "age": pd.Timestamp.now()}
            poocoin_stats["token_sniffer"] = "404"
            return poocoin_stats
        #Query token on BSCScan
        bscscan_stats = self.query_bscscan_token(address)
        # Query Liquidity Provider holders on BSCScan
        # [Rank, Address, Quantity, Percentage, is_contract_address]
        v1_lp_holders = self.query_bscscan_liquidity_providers(poocoin_stats["v1_lp_address"])
        v2_lp_holders = self.query_bscscan_liquidity_providers(poocoin_stats["v2_lp_address"])

        def check_locked_liquidty(df, liquidity_value):
            # Sum the BNB value held by the burn address (locked liquidity).
            if "There are no matching entries" == df["Percentage"].iloc[0]:
                return 0
            #Find real value of liquidty per address (in BNB)
            df["percent_float"] = df["Percentage"].apply(lambda x: float(''.join(i for i in x if i not in '%,'))/100)
            df = df[df["percent_float"] <= 100]
            df["bnb_value"] = df["percent_float"] * liquidity_value
            total_locked = 0
            # Check if liquidity is sufficient + locked
            dead_address = "0x000000000000000000000000000000000000dead"
            if dead_address in df["Address"]:
                total_locked += sum(df[df["Address"]==dead_address]["bnb_value"])
            # contract_addresses = df[df["is_contract_address"]==True]
            # total_locked += sum(contract_addresses["bnb_value"])
            return total_locked

        #Calculate locked liquidity
        total_locked = 0
        if not v1_lp_holders.empty:
            total_locked += check_locked_liquidty(v1_lp_holders, poocoin_stats["v1_bnb_holdings"])
        if not v2_lp_holders.empty:
            total_locked+= check_locked_liquidty(v2_lp_holders, poocoin_stats["v2_bnb_holdings"])
        # Return full dictionary
        profile = poocoin_stats
        profile['v1_lp_holders'] = v1_lp_holders
        profile['v2_lp_holders'] = v2_lp_holders
        profile['stats'] = bscscan_stats
        profile['token_sniffer'] = self.query_token_sniffer(address)
        profile['locked_liquidity'] = total_locked
        return profile
import random
from selenium.webdriver.chrome.options import Options
import requests

chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# keep the crawler from being blocked/detected (translated from Chinese)
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
# hide the "Chrome is being controlled by automated software" infobar/extension
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)
# NOTE(review): a bare UA string is passed as a switch — Chrome expects
# 'user-agent=<ua>', so this argument is likely ignored. Confirm upstream.
option.add_argument(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
)
# NOTE(review): `option` is built but never passed to Chrome() below, so none
# of the settings above take effect; `webdriver` must be imported elsewhere.
bro = webdriver.Chrome()
# Inject stealth.min.js before any page script runs to mask webdriver fingerprints.
with open('C:\\Users\Administrator\Desktop/stealth.min.js') as f:
    js = f.read()
bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
# NOTE(review): snippet is truncated here — the payload of this second
# addScriptToEvaluateOnNewDocument call is cut off in the source.
bro.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument", {
def _pwd_login(self) -> bool:
    """Log into Aliyun mail with the task's account/password via headless Chrome.

    Drives the login form inside the 'alibaba-login-box' iframe, harvests the
    session cookies into self._ha, then defers to self._cookie_login().

    Returns:
        bool: True when the subsequent cookie login succeeds, else False.
    """
    res = False
    try:
        # Pick the chromedriver binary for the current platform.
        # NOTE(review): no fallback for darwin/other platforms — driver_path
        # would be unbound there and raise NameError (caught below).
        if platform == 'linux' or platform == 'linux2':
            driver_path = webdriver_path_debian
        elif platform == 'win32':
            driver_path = webdriver_path_win
        chrome_options = ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # Skip image loading to speed up the login page.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        chrome_options.add_argument('--no-sandbox')
        # FIX: was '--disable-dev-shm-usagenmsbsohu123' (stray characters
        # appended), which Chrome silently ignored; a duplicate
        # '--disable-gpu' argument was also dropped.
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)
        driver.get('https://mail.aliyun.com/alimail/auth/login')
        wait = WebDriverWait(driver, 10)
        # Wait for the embedded login iframe, then switch into it.
        wait.until(lambda d: driver.find_element_by_xpath('//iframe[@id="alibaba-login-box"]'),
                   message='load login page fail!')
        driver.switch_to.frame('alibaba-login-box')
        account = self.task.account.split('@')[0]
        driver.find_element_by_xpath('//input[@id="fm-login-id"]').send_keys(account)
        driver.find_element_by_xpath('//input[@id="fm-login-password"]').send_keys(self.task.password)
        driver.find_element_by_xpath('//input[@id="fm-login-submit"]').click()
        # The landing page shows '我的邮箱' ("My Mailbox") once login completed.
        wait.until(lambda d: driver.find_element_by_xpath("//*[text()='我的邮箱']"),
                   message='enter homepage fail!')
        cookies = ''
        for cookie in driver.get_cookies():
            cookies = cookies + cookie['name'] + '=' + cookie['value'] + ';'
        driver.quit()
        self._ha._managedCookie.add_cookies('aliyun.com', cookies)
        res = self._cookie_login()
    except Exception as ex:
        self._logger.error("Pwd login error, err: {}".format(ex))
        self._write_log_back("账密登录失败: {}".format(ex.args))
    return res
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import math
import time


def calc(x):
    # Formula the exercise page asks us to evaluate for its answer field.
    return str(math.log(abs(12 * math.sin(int(x)))))


try:
    link = "http://suninjuly.github.io/explicit_wait2.html"
    options = ChromeOptions()
    options.add_argument("--start-maximized")
    browser = webdriver.Chrome(options=options)
    browser.get(link)
    # Poll for up to 12 seconds until the price element's text contains "100".
    # (Original Russian comment said "5 seconds"/"clickable" — the code waits 12s
    # for text presence.)
    price = WebDriverWait(browser, 12).until(
        EC.text_to_be_present_in_element((By.XPATH, "//*[@id='price']"), "100")
    )
    button = browser.find_element_by_xpath("//*[@id='book']")
    button.click()
    x_element = browser.find_element_by_xpath("//*[@id='input_value']")
    x = x_element.text
    # NOTE(review): snippet truncated — the matching except/finally is missing.
def _start_browser(self):
    """Spin up a Chrome session with a randomized fingerprint.

    Loads the normal user profile, then flips a set of 1-in-3 coins to vary
    switches, picks a random user agent, and stores the driver on
    self.browser.
    """
    assert self.browser is None, "Browser must not exist in order to call _start_browser!"

    # Reuse the regular desktop Chrome profile.
    user_profile = "C:\\Users\\Alex Thiel\\AppData\\Local\\Google\\Chrome\\User Data\\Default"

    opts = Options()
    opts.add_argument("user-data-dir={}".format(user_profile))
    opts.add_experimental_option("excludeSwitches", [
        "ignore-certificate-errors",
        "safebrowsing-disable-download-protection",
        "safebrowsing-disable-auto-update",
        "disable-client-side-phishing-detection"
    ])
    os.environ["webdriver.chrome.driver"] = self.driver_path

    # Simple on/off switches, each enabled with 1-in-3 probability.
    coin_flip_args = [
        ("--incognito", "Option: Incognito"),
        ("--disable-extensions", "Option: Disabling Extensions"),
        ("--disable-plugins-discovery", "Option: Disabling plugins discovery"),
        ("--no-referrers", "Option: No Referrers"),
        ("--disable-web-security", "Option: Disabled web security"),
        ("--allow-running-insecure-content", "Option: Allowing running insecure content"),
    ]
    for arg, note in coin_flip_args:
        if randint(0, 2) == 1:
            opts.add_argument(arg)
            print(note)

    # Occasionally disable the password manager via prefs.
    if randint(0, 2) == 1:
        opts.add_experimental_option(
            'prefs', {
                'credentials_enable_service': False,
                'profile': {
                    'password_manager_enabled': False
                }
            })
        print("Options: Disabled Password Manager")

    # Random user agent for this session.
    agent = UserAgent().random
    opts.add_argument("user-agent=" + agent)
    self.current_agent = agent
    print("Option: Agent:", agent)

    # Launch the browser window and reset cookies.
    self.browser = Driver(executable_path=self.driver_path, chrome_options=opts)
    self.browser.set_page_load_timeout(self.cfg.browser_timeout)
    self.browser.delete_all_cookies()

    # Randomize the window geometry as a final variation.
    if randint(0, 2) == 1:
        print("Option: Start Maximized")
        self.browser.maximize_window()
    else:
        self.browser.set_window_size(randint(700, 1080), randint(700, 1080))
        self.browser.set_window_position(randint(0, 300), randint(0, 300))
def get_driver(self, name='chrome', type='headless'):
    """Return a lazily created, cached Selenium driver.

    Args:
        name: 'phantomjs', 'chrome' or 'firefox'.
        type: 'headless' runs chrome/firefox without a visible window.

    Returns:
        The cached webdriver instance for *name* (None for unknown names).
    """
    # todo: memory-leak concerns; centralize per-browser configuration
    # FIX: the original acquired the lock and returned the cached driver
    # WITHOUT releasing it (release only happened on the cache-miss path),
    # so the second call for a cached browser deadlocked. The `with` block
    # always releases the lock, even on early return.
    with self._instance_lock:
        if name in self._driver:
            return self._driver[name]
    # NOTE: creation happens outside the lock (as in the original flow), so
    # two racing first calls may both build a driver; the last one wins.
    deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
    if name == 'phantomjs':
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Random UA per session; skip images to save bandwidth.
        dcap["phantomjs.page.settings.userAgent"] = (random.choice(
            consts.USER_AGENTS))
        dcap["phantomjs.page.settings.loadImages"] = False
        driver_phantomjs = webdriver.PhantomJS(
            desired_capabilities=dcap,
            executable_path=deploy_home + '/src/config/phantomjs')
        self._driver[name] = driver_phantomjs
        return driver_phantomjs
    elif name == 'chrome':
        opts = ChromeOptions()
        opts.add_argument('--no-sandbox')
        opts.add_argument('--disable-dev-shm-usage')
        dcap = dict(DesiredCapabilities.CHROME)
        dcap["chrome.page.settings.loadImages"] = False
        if type == 'headless':
            opts.add_argument("--headless")
        chrome_driver = webdriver.Chrome(
            desired_capabilities=dcap,
            executable_path=deploy_home +
            ConfigInit().get_config_by_option('chrome_path'),
            chrome_options=opts)
        self._driver[name] = chrome_driver
        return chrome_driver
    elif name == 'firefox':
        opts = FirefoxOptions()
        if type == 'headless':
            opts.add_argument("--headless")
        firefox_driver = webdriver.Firefox(
            executable_path=deploy_home + '/src/config/geckodriver_mac',
            firefox_options=opts)
        self._driver[name] = firefox_driver
        return firefox_driver
from selenium.webdriver import Chrome, ChromeOptions
import time
import pymysql
import sys
import spider_foreign_sl
from time import *

# Headless-Chrome configuration (comments translated from Chinese).
option = ChromeOptions()  # create the options instance
option.add_argument('--headless')  # run in the background
option.add_argument('--no-sandbox')
option.add_argument('--disable-dev-shm-usage')
option.add_argument('blink-settings=imagesEnabled=false')  # skip images
option.add_argument('--disable-gpu')
# create the browser
schoolbrowser = Chrome(executable_path="/home/baize/Chrome/chromedriver",
                       options=option)  # (options=option)
# the school URL to crawl
schoolUrl = 'https://www.researchgate.net/institution/University_of_Chicago/departments'
schoolbrowser.get(schoolUrl)  # open the page
sleep(2)
# connect to the database
conn = pymysql.connect(host="39.106.96.175",
                       port=3306,
                       db="scholar_info",
                       user="******",
                       password="******",
                       charset="utf8")
cls = conn.cursor()
# table name derived from the URL's second-to-last path segment
schoolget = schoolUrl.split("/")[-2].replace('%20', '_')
# create one table per school — NOTE(review): snippet truncated here
import time
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

# Demo: drive calculator.net and exercise <select> drop-down handling.
url = "http://www.calculator.net/interest-calculator.html"
chrome_path = '/home/xuananh/data/Downloads/chromedriver_linux64/chromedriver'  # chrome driver download from link: https://chromedriver.storage.googleapis.com/index.html?path=2.44/
chrome_option = ChromeOptions()
driver = Chrome(executable_path=chrome_path, chrome_options=chrome_option)
driver.maximize_window()
driver.get(url)
# selecting an item from Drop Down list Box
drop_down = Select(driver.find_element_by_id("ccompound"))
drop_down.select_by_visible_text("monthly")
# alternative way to pick the item: click the <option> directly
# driver.find_element_by_id("ccompound").click()
# driver.find_element_by_xpath('//option[@value="monthly"]').click()
# you can also use these methods:
# drop_down.select_by_index(1)
# drop_down.select_by_value("continuously")
print('Is Selected: ', driver.find_element_by_id('ccompound').is_selected())
print('Is Enabled: ', driver.find_element_by_id('ccompound').is_enabled())
print('Is Displayed: ', driver.find_element_by_id('ccompound').is_displayed())
print('text: ', drop_down.first_selected_option.text)
def get_amzn_driver(email, password, headless=False, session_path=None):
    """Create a Chrome driver logged into Amazon's order-history page.

    Downloads a matching chromedriver on first use, then walks the Amazon
    account-switcher login flow (password re-entry, "Add account", optional
    email-then-continue variant) and finally waits — up to five minutes,
    with manual-intervention hints — for the 'report-confirm' element that
    marks a completed login.

    Args:
        email: Amazon account email to select or enter.
        password: passed through get_password() before being typed.
        headless: run Chrome without a visible window.
        session_path: Chrome user-data-dir for session reuse (optional).

    Returns:
        The logged-in Chrome driver.
    """
    zip_type = ""
    executable_path = os.path.join(os.getcwd(), 'chromedriver')
    if _platform in ['win32', 'win64']:
        executable_path += '.exe'

    zip_type = CHROME_ZIP_TYPES.get(_platform)

    # Fetch and unpack chromedriver if it is not already next to the script.
    if not os.path.exists(executable_path):
        zip_file_url = CHROME_DRIVER_BASE_URL.format(
            CHROME_DRIVER_VERSION, zip_type)
        request = requests.get(zip_file_url)

        if request.status_code != 200:
            raise RuntimeError(
                'Error finding chromedriver at {}, status = {}'.format(
                    zip_file_url, request.status_code))

        zip_file = zipfile.ZipFile(io.BytesIO(request.content))
        zip_file.extractall()
        os.chmod(executable_path, 0o755)

    chrome_options = ChromeOptions()
    if headless:
        chrome_options.add_argument('headless')
        chrome_options.add_argument('no-sandbox')
        chrome_options.add_argument('disable-dev-shm-usage')
        chrome_options.add_argument('disable-gpu')
        # chrome_options.add_argument("--window-size=1920x1080")
    if session_path is not None:
        chrome_options.add_argument("user-data-dir=" + session_path)

    logger.info('Logging into Amazon.com')

    driver = Chrome(chrome_options=chrome_options,
                    executable_path=executable_path)

    driver.get(ORDER_HISTORY_URL_VIA_SWITCH_ACCOUNT_LOGIN)

    driver.implicitly_wait(2)

    def get_element_by_id(driver, id):
        # Lookup helper: None instead of raising when the element is absent.
        try:
            return driver.find_element_by_id(id)
        except NoSuchElementException:
            pass
        return None

    def get_element_by_xpath(driver, xpath):
        # Same as above, but by XPath.
        try:
            return driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            pass
        return None

    # Go straight to the account switcher, and look for the given email.
    # If present, click on it! Otherwise, click on "Add account".
    desired_account_element = get_element_by_xpath(
        driver, "//div[contains(text(), '{}')]".format(email))
    if desired_account_element:
        desired_account_element.click()
        driver.implicitly_wait(2)

        # It's possible this account has already authed recently. If so, the
        # next block will be skipped and the login is complete!
        if not get_element_by_id(driver, 'report-confirm'):
            driver.find_element_by_id('ap_password').send_keys(
                get_password(password))
            driver.find_element_by_name('rememberMe').click()
            driver.find_element_by_id('signInSubmit').submit()
    else:
        # Cannot find the desired account in the switch. Log in via Add Account
        driver.find_element_by_xpath(
            '//div[text()="Add account"]').click()
        driver.implicitly_wait(2)

        driver.find_element_by_id('ap_email').send_keys(email)

        # Login flow sometimes asks just for the email, then a
        # continue button, then password.
        if get_element_by_id(driver, 'continue'):
            driver.find_element_by_id('continue').click()
            driver.implicitly_wait(2)

        driver.find_element_by_id('ap_password').send_keys(
            get_password(password))
        driver.find_element_by_name('rememberMe').click()
        driver.find_element_by_id('signInSubmit').submit()

    driver.implicitly_wait(2)

    # If the report page still isn't up, fall back to letting the user finish
    # the login (captcha/MFA) by hand and wait for up to 5 minutes.
    if not get_element_by_id(driver, 'report-confirm'):
        logger.warning('Having trouble logging into Amazon. Please see the '
                       'browser and complete login within the next 5 minutes. '
                       'This script will continue automatically on success. '
                       'You may need to manually navigate to: {}'.format(
                           ORDER_HISTORY_REPORT_URL))
        if get_element_by_id(driver, 'auth-mfa-otpcode'):
            logger.warning('Hint: Looks like an auth challenge! Maybe check '
                           'your email')
        try:
            wait_cond = EC.presence_of_element_located((By.ID, 'report-confirm'))
            WebDriverWait(driver, 60 * 5).until(wait_cond)
        except TimeoutException:
            logger.critical('Cannot complete login!')
            exit(1)

    return driver
def spiderweibo(keywords, timepara, province, city, district, locpara, resultfile):
    """Scrape Weibo search results for *keywords* filtered by time and location.

    Uses a logged-in Chrome profile, walks every result page, extracts the
    post plus (when present) the forwarded original post, and appends one CSV
    row per post to *resultfile*. Returns "N" when not logged in, else "".

    All original Chinese comments below are translated to English; all
    runtime strings are left untouched.
    """
    # Set up the browser for automatic Weibo login.
    option = ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 1
        }
    }
    # Block image loading (2) while keeping JavaScript enabled (1).
    option.add_experimental_option('prefs', prefs)
    #option.add_argument('--headless')  # hide the browser window
    #options.headless = True
    #option.add_argument('--disable-gpu')  #for windows not for macos
    option.add_argument(
        r"user-data-dir=/Users/Yokimsu/Library/Caches/Google/Chrome/User Data/Profile 1"
    )  # browser profile/cache location (holds the login session)
    browser = Chrome("/Users/YokimSu/chromedriver", 0, options=option)
    # NOTE(review): '×cope' below looks like HTML-entity mangling of
    # '&timescope' — verify against the upstream source before relying on it.
    url0 = "https://s.weibo.com/weibo?q=" + keywords + locpara + "&typeall=1&suball=1×cope=custom:" + timepara
    #browser.minimize_window()  # minimize the window
    browser.get(url0)
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    loginYN = ""
    # A visible login prompt means the session is not logged in.
    try:
        loginYN = soup.find('div', {
            'class': 'm-hint'
        }).find('a', {'action-type': 'login'})
        loginYN = "N"
        print("未登录,程序终止,请重新登录后在操作!")
    except:
        pass
    # Determine the number of result pages from the pager widget.
    try:
        pagenum = len(soup.find('ul', {'class': 's-scroll'}).find_all('li'))
        #print(pagenum)
        print("-------------------------")
        print("总页数%d" % pagenum)
        # If exactly 50 pages, results may exceed Weibo's display cap; log the
        # query so it can be re-run later with a narrower province filter.
        if pagenum == 50:
            with open('超过50页.csv', 'a', encoding='utf8', newline='') as f:
                current = time.time()
                current = time.localtime(current)
                timestr = "" + str(current.tm_year) + "/" + str(
                    current.tm_mon) + "/" + str(current.tm_mday) + " " + str(
                        current.tm_hour) + ":" + str(
                            current.tm_min) + ":" + str(current.tm_sec)
                writer = csv.writer(f)
                writer.writerow((timestr, pagenum, url0, keywords, timepara,
                                 province, city, district, locpara))
    except:
        # The pager is absent when there is only one page.
        pagenum = 1
        print("-------------------------")
        print("总页数%d" % pagenum)
    # Check whether the search returned any results at all.
    yn = ""
    try:
        yn = soup.find('div', {
            'class': 'card card-no-result s-pt20b40'
        }).find('p').get_text().strip()
        pagenum = 1
        # This disabled section would also write no-result queries to the csv,
        # to make it easy to see which parameter combinations had no hits.
        """ with open(resultfile, 'a', encoding='utf8', newline='') as f: current = time.time() current = time.localtime(current) timestr = "" + str(current.tm_year) + "/" + str(current.tm_mon) + "/" + str(current.tm_mday) + " " + str(current.tm_hour) + ":" + str(current.tm_min) + ":" + str(current.tm_sec) writer = csv.writer(f) writer.writerow((timestr, url0,timepara, province,city,district,yn,"", "","","", "", "", "", "","","", "","","","","","","","","","","")) """
        print(url0 + "----此页无结果!")
    except:
        yn = ""
    if pagenum <= 50:
        for page in range(0, pagenum):
            try:
                # Open this result page in the browser.
                url = url0 + "&Refer=g&page=" + str(page + 1)
                browser.get(url)
                # Variant URL with only the time filter (disabled).
                """ browser.get( "https://s.weibo.com/weibo?q=" + keywords + "&typeall=1&suball=1×cope=custom:" + timepara + "&Refer=g&page=" + str(page + 1)) """
                time.sleep(1)
                # Switch back to the page content.
                browser.switch_to_default_content()
                # Parse the rendered page.
                soup = BeautifulSoup(browser.page_source, 'lxml')
                # Each post (user info + content) is one element of this list.
                allinfo = soup.find_all('div', {'class': 'card'})
                for eachitem in allinfo:
                    if eachitem.find('a', {'class': 'name'}):
                        # Username.
                        try:
                            username = eachitem.find(
                                'a', {
                                    'class': 'name'
                                }).get_text().strip()  # username
                        except:
                            username = "******"
                        # User homepage link.
                        try:
                            userlink = "https://www." + eachitem.find(
                                'a', {
                                    'class': 'name'
                                }).get('href').replace(
                                    "/member/", "").replace(
                                        "//", "").strip()  # user homepage
                        except:
                            userlink = ("None")
                        # Post text.
                        try:
                            contents = eachitem.find(
                                'p', {
                                    'class': 'txt'
                                }).get_text().strip()  # post content
                        except:
                            contents = "None"
                        # The geotag is the last <a> inside the text node.
                        try:
                            location_temp = len(
                                eachitem.find('p', {
                                    'class': 'txt'
                                }).find_all('a'))
                        except:
                            location_temp = 1
                        try:
                            # NOTE(review): the replace("2", "") below strips a
                            # literal '2' from the geotag text — presumably an
                            # icon artifact; confirm against live markup.
                            location = eachitem.find(
                                'p', {
                                    'class': 'txt'
                                }).find_all('a')[location_temp - 1].get_text().replace(
                                    "2", "").strip()  # geotag
                            location_link = eachitem.find(
                                'p', {
                                    'class': 'txt'
                                }).find_all('a')[location_temp - 1].get(
                                    'href').strip()  # geotag link
                        except:
                            location = "None"
                            location_link = "None"
                        #print(location,location_link)
                        # Forward-source detection, currently not needed.
                        """ try: pattern = re.compile(r'//@.*?:', re.I) resulttemp = pattern.findall(contents) source = resulttemp[0] source = source.replace("//@", "").replace(":", "") #print(source) except: source = "None" if source == "None": try: source = eachitem.find('div', {'class': 'card-comment'}).find('div', { 'node-type': 'feed_list_forwardContent'}).find('a').get_text().strip() # forward source except: pass """
                        # Post time: the second 'from' paragraph when present.
                        try:
                            tempnum = len(
                                eachitem.find_all('p', {'class': 'from'}))
                        except:
                            tempnum = 1
                        if tempnum > 1:
                            try:
                                post_date = eachitem.find_all(
                                    'p', {'class': 'from'})[1].find_all(
                                        "a")[0].get_text().strip()  # time
                                contents_link = "https://www." + eachitem.find_all(
                                    'p', {'class': 'from'
                                          })[1].find_all("a")[0].get(
                                              'href').strip().replace(
                                                  "//", "")
                                try:
                                    post_date_date = post_date.split(
                                        " ", 1)[0].replace("年", "/").replace(
                                            "月", "/").replace("日", "")
                                    post_date_time = post_date.split(
                                        " ", 1)[1]
                                except:
                                    post_date_date = "None"
                                    post_date_time = "None"
                            except:
                                post_date = "None"
                                contents_link = "None"
                                post_date_date = "None"
                                post_date_time = "None"
                        else:
                            try:
                                post_date = eachitem.find_all(
                                    'p', {'class': 'from'})[0].find_all(
                                        "a")[0].get_text().strip()  # time
                                contents_link = "https://www." + eachitem.find_all(
                                    'p', {'class': 'from'
                                          })[0].find_all("a")[0].get(
                                              'href').strip().replace(
                                                  "//", "")
                                try:
                                    post_date_date = post_date.split(
                                        " ", 1)[0].replace("年", "/").replace(
                                            "月", "/").replace("日", "")
                                    post_date_time = post_date.split(
                                        " ", 1)[1]
                                except:
                                    post_date_date = "None"
                                    post_date_time = "None"
                            except:
                                post_date = "None"
                                contents_link = "None"
                                post_date_date = "None"
                                post_date_time = "None"
                        # Skip posts from "today" (今天) — incomplete day.
                        if post_date.find("今天") > -1:
                            continue
                        # Posting platform ("via ...").
                        try:
                            pingtai = "【来自】" + eachitem.find(
                                'p', {
                                    'class': 'from'
                                }).find_all(
                                    "a")[1].get_text().strip()  # via
                        except:
                            pingtai = "None"
                        # Favorites count.
                        try:
                            favorite_num = eachitem.find(
                                'div', {
                                    'class': 'card-act'
                                }).find_all("a")[0].get_text().replace(
                                    "收藏", "").strip()
                        except:
                            favorite_num = "None"
                        # Repost count.
                        try:
                            repost_num = eachitem.find(
                                'div', {
                                    'class': 'card-act'
                                }).find_all("a")[1].get_text().replace(
                                    "转发", "").strip()
                        except:
                            repost_num = "None"
                        # Comment count.
                        try:
                            comments_num = eachitem.find(
                                'div', {
                                    'class': 'card-act'
                                }).find_all("a")[2].get_text().replace(
                                    "评论", "").strip()
                        except:
                            comments_num = "None"
                        # Like count.
                        try:
                            reward_num = eachitem.find(
                                'div', {
                                    'class': 'card-act'
                                }).find_all("a")[3].get_text().strip()
                        except:
                            reward_num = "None"
                        # Parse the forwarded original post, when present.
                        try:
                            username2 = eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('a').get('nick-name').strip()  # original author
                        except:
                            username2 = "None"
                        try:
                            userlink2 = "https://www." + eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('a').get('href').strip().replace(
                                    "//", "")  # original author homepage
                        except:
                            userlink2 = "None"
                        #print(username2,userlink2)
                        try:
                            contents2 = eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('p', {
                                    'class': 'txt'
                                }).get_text().strip()  # original post text
                        except:
                            contents2 = "None"
                        try:
                            post_date2 = eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('p', {
                                    'class': 'from'
                                }).find_all(
                                    "a")[0].get_text().strip()  # original post time
                        except:
                            post_date2 = "None"
                        try:
                            contents2_link = "https://www." + eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('p', {
                                    'class': 'from'
                                }).find_all("a")[0].get(
                                    'href').strip().replace("//", "")
                        except:
                            contents2_link = "None"
                        try:
                            pingtai2 = "【来自】" + eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('p', {
                                    'class': 'from'
                                }).find_all(
                                    "a")[1].get_text().strip()  # via
                        except:
                            pingtai2 = "None"
                        # Engagement counters of the original post.
                        try:
                            tempinfo2 = eachitem.find(
                                'div', {
                                    'class': 'card-comment'
                                }).find('div', {
                                    'class': 'func'
                                }).find('ul', {
                                    'class': 'act s-fr'
                                }).find_all('li')
                        except:
                            tempinfo2 = "None"
                        try:
                            repost_num2 = tempinfo2[0].get_text().replace(
                                "转发", "").strip()
                        except:
                            repost_num2 = "None"
                        try:
                            comments_num2 = tempinfo2[1].get_text(
                            ).replace("评论", "").strip()
                        except:
                            comments_num2 = "None"
                        try:
                            reward_num2 = tempinfo2[2].get_text().strip()
                        except:
                            reward_num2 = "None"
                        # Append one CSV row per post.
                        with open(resultfile, 'a', encoding='utf8',
                                  newline='') as f:
                            current = time.time()
                            current = time.localtime(current)
                            timestr = "" + str(
                                current.tm_year
                            ) + "/" + str(current.tm_mon) + "/" + str(
                                current.tm_mday) + " " + str(
                                    current.tm_hour) + ":" + str(
                                        current.tm_min) + ":" + str(
                                            current.tm_sec)
                            writer = csv.writer(f)
                            writer.writerow(
                                (timestr, url, timepara, province, city,
                                 district, yn, username, userlink, contents,
                                 location, location_link, post_date,
                                 post_date_date, post_date_time, pingtai,
                                 favorite_num, repost_num, comments_num,
                                 reward_num, contents_link, username2,
                                 userlink2, contents2, post_date2, pingtai2,
                                 repost_num2, comments_num2, reward_num2,
                                 contents2_link))
            except Exception as e:
                print("error", str(e))
            print("----------")
            #print(page+1,pagenum,timepara)
            print("第%d/%d页爬取完成!时间参数:%s" % ((page + 1), pagenum, timepara))
    browser.quit()
    return loginYN
def launch_application(browser_name, app_url):
    """Launch the requested browser, store it in the global `driver`, and open *app_url*.

    Args:
        browser_name: "chrome", "firefox" or "ie" (anything else is logged as an error).
        app_url: URL opened after the browser starts.
    """
    global driver
    log.info("in init method of selenium base")
    try:
        if browser_name == "chrome":
            option = ChromeOptions()
            option.add_argument("start-maximized")
            option.add_argument("--ignore-certificate-errors")
            option.add_argument("--disable-extensions")
            option.add_argument("--disable-infobars")
            option.add_argument("disable-notifications")
            driver = Chrome(executable_path="./drivers/chromedriver.exe",
                            options=option)
            log.info("chrome browser is launch successfully")
        elif browser_name == "firefox":
            profile = FirefoxProfile()
            profile.accept_untrusted_certs = True
            options = FirefoxOptions()
            options.add_argument("start-maximized")
            # FIX: the profile and options were built but never handed to the
            # driver, so they had no effect; pass them to Firefox().
            driver = Firefox(executable_path="./drivers/geckodriver.exe",
                             firefox_profile=profile,
                             options=options)
            log.info("firefox browser is launch successfully")
        elif browser_name == "ie":
            driver = Ie(executable_path="./drivers/IEDriverServer.exe")
        else:
            log.error("browser name is incorrect", browser_name)
    except WebDriverException:
        log.critical("exception", WebDriverException)
    # NOTE(review): if no branch assigned `driver` (bad name, or startup
    # raised), the lines below fail with NameError — matches original flow.
    driver.implicitly_wait(5)
    driver.get(app_url)
def run(self):
    """Open Chrome with --test-type, execute the scripted steps, then close it."""
    chrome_opts = ChromeOptions()
    chrome_opts.add_argument('--test-type')
    # Keep the driver on self so perform_steps() can use it.
    self.driver = Chrome(chrome_options=chrome_opts)
    self.perform_steps()
    self.driver.close()
import sys
from time import sleep
from selenium import webdriver as drv
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException

from config import conf

# init, talk about "eyes"
opts = ChromeOptions()
br = drv.Chrome(chrome_options=opts)
br.implicitly_wait(15)
# Share the live driver with the helper modules imported below.
# NOTE(review): the late imports appear intentional — presumably utils/
# dom_tools read conf["dom"] at import time; confirm before regrouping them.
conf["dom"] = br

###################################
###################################

from utils import get_xpath, wait_for_xpath, wait_for_path, \
    tracer, chat, err, gen_url, click_xpath, wait_for_xpath, \
    wait_for_url, log
from dom_tools import where_in_nextbox, goto_nextbox_nav, \
    goto_nextbox, nextbox_sub_ensure
from test_storages import storages_roundtrip, storages_backup_avail

import pysnooper
def chromeoptions(self):
    """Build the ChromeOptions this scraper uses: headless, quiet logging, no extensions."""
    opts = ChromeOptions()
    for flag in ('headless', '--log-level=3', '--disable-extensions'):
        opts.add_argument(flag)
    return opts
import pymongo
import time
import json
import random
import hashlib

from config import *

# MongoDB handles (connection settings come from config).
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
collec_mogu = db[COLLECTION_NAME]

# Desktop UA for plain HTTP requests.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
}

# Headless browser for JS-rendered pages.
# NOTE(review): ChromeOptions/Chrome/requests/etree are not imported in this
# snippet — presumably imported elsewhere in the file; confirm.
opt = ChromeOptions()
opt.headless = True
browser = Chrome(options=opt)


def request_html(url):
    # Fetch *url* with the desktop UA and parse it into an lxml element tree.
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    return html


def kind_ls(html):
    # Category links; the slice drops the trailing non-category entry.
    shop_ls = html.xpath(
        '//div[@class="item-wrap"]/div[1]/a[@class="cate-item-link"]/@href'
    )[:-1]
    # NOTE(review): snippet truncated — the loop body is missing in source.
    for shop_url in shop_ls:
# LINE Notify credentials (comments translated from Japanese).
# NOTE(review): SECURITY — a live-looking access token is hard-coded here;
# revoke it and load it from the environment or a config file instead.
line_notify_token = '1NS7LLBpds6tZRmSEIlBH0m5YhQ3aUQit9nfJd6KveW'
line_notify_api = 'https://notify-api.line.me/api/notify'
# Put the text into `message` and send it. Start with a newline because the
# text would otherwise run straight into the token name in the notification.
# NOTE(review): `mes`, `user` and `passward` are defined earlier in the file —
# this fragment starts mid-script.
message = '\n' + mes
payload = {'message': message}
headers = {'Authorization': 'Bearer ' + line_notify_token}
line_notify = requests.post(line_notify_api, data=payload, headers=headers)
# Notify myself when an exception occurs.
try:
    # Absolute path of the working directory.
    cwd = os.getcwd()
    # Scrape with a webdriver.
    options = ChromeOptions()
    # Enable headless mode (comment the next line out to show the window).
    options.add_argument('--headless')
    # cron would not run without no-sandbox.
    options.add_argument('--no-sandbox')
    # Create the Chrome WebDriver object.
    driver = Chrome(executable_path='/home/ubuntu/bin/chromedriver',
                    chrome_options=options)
    # Wait up to 10s for elements to load; shorter waits raised TimeoutException.
    driver.implicitly_wait(10)
    # atwiki login pages cannot be reached by direct URL, so navigate to the
    # login page from another page via Selenium.
    driver.get("https://www65.atwiki.jp/44teck/pages/1.html")
    print("サイトに接続中...")
    driver.find_element_by_link_text("ログイン").click()
    driver.find_element_by_name("user").send_keys(user)
    driver.find_element_by_name("pass").send_keys(passward)
    # NOTE(review): snippet truncated — the matching except for this try is missing.
from selenium import webdriver
from time import sleep
# For running without a visible window (comments translated from Chinese).
from selenium.webdriver.chrome.options import Options
# For automation-detection evasion.
from selenium.webdriver import ChromeOptions

# Configure headless (no-UI) operation.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Evade detection: hide the "controlled by automated software" switch.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# This is how selenium reduces the risk of being detected.
bro = webdriver.Chrome(executable_path='./chromedriver.exe',
                       chrome_options=chrome_options,
                       options=option)
# Headless browser (phantomJs is an alternative).
bro.get('https://www.baidu.com')
print(bro.page_source)
sleep(2)
bro.quit()
def set_driver(driver_path, headless_flg):
    """Create a configured Chrome WebDriver (comments translated from Japanese).

    Args:
        driver_path: chromedriver path relative to the current working directory.
        headless_flg: when truthy, run Chrome without a visible window.

    Returns:
        Chrome: the configured driver instance.
    """
    options = ChromeOptions()
    # Headless (no-UI) mode.
    # FIX: idiomatic truth test instead of '== True'.
    if headless_flg:
        options.add_argument('--headless')
    # Launch options: fixed desktop user agent, lenient TLS handling, and
    # incognito (private) mode.
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    )
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--incognito')
    # Create the Chrome WebDriver object.
    return Chrome(executable_path=os.getcwd() + "/" + driver_path, options=options)
import os
import time

from helium import *
from selenium.webdriver import ChromeOptions

# Credentials and URLs come from the environment so none are committed to source.
TD_URL = os.getenv('TD_URL')
TD_LOGIN = os.getenv('TD_LOGIN')
TD_PASSWORD = os.getenv('TD_PASSWORD')
AMEX_URL = os.getenv('AMEX_URL')
AMEX_LOGIN = os.getenv('AMEX_LOGIN')
AMEX_PASSWORD = os.getenv('AMEX_PASSWORD')

options = ChromeOptions()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_argument('--disable-notifications')
# Suppress the "Show notifications" permission popup (value 2 = block):
# https://stackoverflow.com/questions/38684175/how-to-click-allow-on-show-notifications-popup-using-selenium-webdriver
options.add_experimental_option(
    "prefs", {"profile.default_content_setting_values.notifications": 2})

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--disable-notifications")
# driver = webdriver.Chrome(options=chrome_options)
# set_driver(driver)
# get_driver()

#
# TD
#
def get_web_driver(email, password, headless=False, mfa_method=None,
                   mfa_input_callback=None, wait_for_sync=True,
                   session_path=None, imap_account=None, imap_password=None,
                   imap_server=None, imap_folder="INBOX"):
    """Create a Chrome driver logged into Mint.

    Downloads a matching chromedriver on first use, drives the Mint login
    form, loops through any MFA challenge (SMS or email code, with the email
    code optionally fetched via IMAP), and optionally waits for the account
    sync to complete before returning.

    Args:
        email/password: Mint (Intuit) credentials.
        headless: run without a window; forces mfa_method='sms' when unset.
        mfa_method: 'sms' or 'email' — selects the MFA option element.
        mfa_input_callback: callable prompting for the 6-digit code
            (defaults to input()).
        wait_for_sync: block up to 5 minutes for "Account refresh complete".
        session_path: Chrome user-data-dir for session reuse.
        imap_account/imap_password/imap_server/imap_folder: IMAP login used
            to auto-read the emailed MFA code.

    Returns:
        The logged-in Chrome driver.
    """
    if headless and mfa_method is None:
        warnings.warn("Using headless mode without specifying an MFA method"
                      "is unlikely to lead to a successful login. Defaulting --mfa-method=sms")
        mfa_method = "sms"
    zip_type = ""
    executable_path = os.getcwd() + os.path.sep + 'chromedriver'
    if _platform in ['win32', 'win64']:
        executable_path += '.exe'

    zip_type = CHROME_ZIP_TYPES.get(_platform)

    # Fetch and unpack chromedriver if it is not already present.
    if not os.path.exists(executable_path):
        zip_file_url = CHROME_DRIVER_BASE_URL % (CHROME_DRIVER_VERSION, zip_type)
        request = requests.get(zip_file_url)

        if request.status_code != 200:
            raise RuntimeError('Error finding chromedriver at %r, status = %d' %
                               (zip_file_url, request.status_code))

        zip_file = zipfile.ZipFile(io.BytesIO(request.content))
        zip_file.extractall()
        os.chmod(executable_path, 0o755)

    chrome_options = ChromeOptions()
    if headless:
        chrome_options.add_argument('headless')
        chrome_options.add_argument('no-sandbox')
        chrome_options.add_argument('disable-dev-shm-usage')
        chrome_options.add_argument('disable-gpu')
        # chrome_options.add_argument("--window-size=1920x1080")
    if session_path is not None:
        chrome_options.add_argument("user-data-dir=%s" % session_path)

    driver = Chrome(chrome_options=chrome_options,
                    executable_path="%s" % executable_path)
    driver.get("https://www.mint.com")
    driver.implicitly_wait(20)  # seconds
    try:
        element = driver.find_element_by_link_text("Log In")
    except NoSuchElementException:
        # when user has cookies, a slightly different front page appears
        driver.implicitly_wait(0)  # seconds
        element = driver.find_element_by_link_text("LOG IN")
        driver.implicitly_wait(20)  # seconds
    element.click()
    time.sleep(1)
    email_input = driver.find_element_by_id("ius-userid")
    # It's possible that the user clicked "remember me" at some point, causing
    # the email to already be present. If anything is in the input, clear it
    # and use the provided email, just to be safe.
    # email_input.setAttribute("value", "")
    email_input.clear()
    email_input.send_keys(email)
    driver.find_element_by_id("ius-password").send_keys(password)
    driver.find_element_by_id("ius-sign-in-submit-btn").submit()

    # Wait until logged in, just in case we need to deal with MFA.
    while not driver.current_url.startswith(
            'https://mint.intuit.com/overview.event'):
        # An implicitly_wait is also necessary here to avoid getting stuck on
        # find_element_by_id while the page is still in transition.
        driver.implicitly_wait(1)
        time.sleep(1)

        # bypass "Let's add your current mobile number" interstitial page
        try:
            skip_for_now = driver.find_element_by_id('ius-verified-user-update-btn-skip')
            skip_for_now.click()
        except (NoSuchElementException, StaleElementReferenceException, ElementNotVisibleException):
            pass

        driver.implicitly_wait(1)  # seconds
        try:
            # Presence of this form means an MFA challenge is showing.
            driver.find_element_by_id('ius-mfa-options-form')
            try:
                mfa_method_option = driver.find_element_by_id('ius-mfa-option-{}'.format(mfa_method))
                mfa_method_option.click()
                mfa_method_submit = driver.find_element_by_id("ius-mfa-options-submit-btn")
                mfa_method_submit.click()
                if mfa_method == 'email' and imap_account:
                    # Pull the code straight from the mailbox.
                    mfa_code = get_email_code(imap_account, imap_password, imap_server,
                                              imap_folder=imap_folder)
                else:
                    mfa_code = (mfa_input_callback or input)("Please enter your 6-digit MFA code: ")
                mfa_code_input = driver.find_element_by_id("ius-mfa-confirm-code")
                mfa_code_input.send_keys(mfa_code)
                mfa_code_submit = driver.find_element_by_id("ius-mfa-otp-submit-btn")
                mfa_code_submit.click()
            except Exception:  # if anything goes wrong for any reason, give up on MFA
                mfa_method = None
                warnings.warn("Giving up on handling MFA. Please complete "
                              "the MFA process manually in the browser.")
        except NoSuchElementException:
            pass
        finally:
            driver.implicitly_wait(20)  # seconds

    # Wait until the overview page has actually loaded, and if
    # wait_for_sync==True, sync has completed.
    if wait_for_sync:
        try:
            # Status message might not be present straight away. Seems to be due
            # to dynamic content (client side rendering).
            status_message = WebDriverWait(driver, 30).until(
                expected_conditions.visibility_of_element_located(
                    (By.CSS_SELECTOR, ".SummaryView .message")))
            WebDriverWait(driver, 5 * 60).until(
                lambda x: "Account refresh complete" in status_message.get_attribute('innerHTML')
            )
        except (TimeoutException, StaleElementReferenceException):
            warnings.warn("Mint sync apparently incomplete after 5 minutes. Data "
                          "retrieved may not be current.")
    else:
        driver.find_element_by_id("transaction")
    return driver
def set_chrome():
    """Create a Chrome driver, auto-installing a matching chromedriver binary."""
    driver_path = ChromeDriverManager().install()
    return Chrome(executable_path=driver_path, options=ChromeOptions())
def get_options(self) -> ChromeOptions:
    """Assemble the headless-Chrome option set for this scraper.

    Uses a persistent profile directory under ./selenium so cookies survive
    between runs, keeps JavaScript enabled, and runs without a window.
    """
    flags = (
        "--disable-extensions",
        "--disable-gpu",
        "--dns-prefetch-disable",
        "--enable-automation",
        "--enable-javascript",
        "--no-sandbox",
        "--page-load-strategy=normal",
        "--user-data-dir=" + os.path.abspath("./selenium"),  # keep cookies
        "--profile-directory=Default",
        "--headless",
    )
    options = ChromeOptions()
    for flag in flags:
        options.add_argument(flag)
    return options