def __init_browser(self):
    """Create (or re-create) the headless Firefox driver and its wait helper.

    Reads the geckodriver path, the Firefox binary path, the log
    destination and the wait timeout from ``self.__args``. A browser left
    over from a previous call is shut down before the new one is started.
    """
    self.__options = webdriver.FirefoxOptions()
    self.__options.headless = True
    self.__options.accept_insecure_certs = True
    self.__geckodriver_binary = self.__args.geckodriver
    self.__firefox_binary = FirefoxBinary(self.__args.firefox)

    # Set firefox profile
    self.__profile = webdriver.FirefoxProfile()
    firefox_profile(self.__profile)

    if self.__browser is not None:
        # quit() ends the whole WebDriver session (and its driver process);
        # close() would only close the current window of the old browser.
        self.__browser.quit()

    # In console mode the driver log is discarded; otherwise it goes to the
    # configured log file. Everything else is identical for both modes.
    log_path = os.path.devnull if self.__args.console else self.__args.log
    self.__browser = webdriver.Firefox(
        options=self.__options,
        firefox_binary=self.__firefox_binary,
        firefox_profile=self.__profile,
        executable_path=self.__geckodriver_binary,
        log_path=log_path)
    self.__browser.set_window_size(1920, 1080)

    self.__wait = WebDriverWait(self.__browser, self.__args.timeout)
def build(cfg, fetch_driver=True):
    """
    Build a webdriver instance from the given configuration.

    :param cfg: Configuration object
    :param fetch_driver: bool (default=True) fetches driver binaries
    :return: selenium-wire Webdriver object
    :raises NotImplementedError: when cfg.driver is neither a known Firefox
        nor a known Chrome driver name
    """
    # selenium-wire options come from the proxy object, when there is one
    wire_options = {} if cfg.proxy is None else cfg.proxy.create_options()

    use_firefox = cfg.driver in WebDriver.FIREFOX_DRIVER_NAMES
    if use_firefox:
        driver_cls = webdriver.Firefox
        browser_options = webdriver.FirefoxOptions()
        profile = (webdriver.FirefoxProfile() if cfg.profile is None
                   else webdriver.FirefoxProfile(cfg.profile))
        profile.set_preference("general.useragent.override", cfg.user_agent)
        profile.set_preference("media.volume_scale", "0.0")
    elif cfg.driver in WebDriver.CHROME_DRIVER_NAMES:
        driver_cls = webdriver.Chrome
        browser_options = webdriver.ChromeOptions()
        browser_options.add_argument("user-agent={0}".format(cfg.user_agent))
        if cfg.user_data_dir:
            browser_options.add_argument(
                "user-data-dir={0}".format(cfg.user_data_dir))
        profile = None
    else:
        raise NotImplementedError

    if fetch_driver:
        Loader.fetch(cfg.executable_path, cfg.debug, cfg.driver)

    browser_options.binary_location = cfg.executable_path
    browser_options.headless = cfg.headless

    # seleniumwire_options is only handed over when a proxy is configured
    if use_firefox:
        if cfg.proxy is None:
            return driver_cls(profile, cfg.binary, options=browser_options)
        return driver_cls(profile, cfg.binary, options=browser_options,
                          seleniumwire_options=wire_options)

    if cfg.proxy is None:
        return driver_cls(options=browser_options)
    return driver_cls(options=browser_options,
                      seleniumwire_options=wire_options)
def get_driver():
    """Return a Firefox driver with two preferences applied.

    The user agent is overridden with the ``user_agent`` value taken from
    the enclosing scope, and ``dom.webdriver.enabled`` is set to False.
    """
    firefox_options = webdriver.FirefoxOptions()
    preferences = (
        ("general.useragent.override", user_agent),
        ("dom.webdriver.enabled", False),
    )
    for pref_name, pref_value in preferences:
        firefox_options.set_preference(pref_name, pref_value)
    # Headless mode is currently not enabled.

    # NOTE(review): 'path_to_geckodriver' looks like a placeholder - confirm.
    return webdriver.Firefox(executable_path='path_to_geckodriver',
                             options=firefox_options)
def launch_browser(headers=None, user_agent=None, proxy=None, browser_type="Firefox"):
    """Start a selenium-wire browser using a driver binary next to this file.

    The driver binary is looked up in ``sys._MEIPASS`` when running from a
    frozen bundle, otherwise in the directory of this module.

    :param headers: optional header overrides, applied through
        ``set_header_overrides`` on the driver's selenium-wire client
    :param user_agent: user agent for Firefox; when falsy one is produced by
        ``generate_user_agent()``. Not used on the Chrome path.
    :param proxy: optional proxy address, used for both http and https
    :param browser_type: ``"Firefox"`` selects Firefox, any other value
        selects Chrome
    :return: the started driver with a 1920x1080 window
    :raises SystemExit: (status 0) after prompting, when no driver could be
        started
    """
    wire_options = {}
    if proxy:
        wire_options["proxy"] = {"http": proxy, "https": proxy}

    if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
        directory = sys._MEIPASS
    else:
        directory = os.path.dirname(__file__)

    driver = None
    if browser_type == "Firefox":
        candidates = (
            os.path.join(directory, name)
            for name in ("geckodriver.exe", "geckodriver")
        )
        driver_path = next(
            (path for path in candidates if os.path.exists(path)), None)
        if driver_path is not None:
            opts = webdriver.FirefoxOptions()
            profile = webdriver.FirefoxProfile()
            if not user_agent:
                user_agent = generate_user_agent()
            profile.set_preference("general.useragent.override", user_agent)
            driver = webdriver.Firefox(
                firefox_profile=profile,
                executable_path=driver_path,
                options=opts,
                seleniumwire_options=wire_options,
            )
        else:
            message = f"Download geckodriver from https://github.com/mozilla/geckodriver/releases/tag/v0.27.0 and paste it in {directory}"
            input(message)
    else:
        driver_path = os.path.join(directory, "chromedriver.exe")
        opts = webdriver.ChromeOptions()
        # No "--proxy-server" argument here: the previous code formatted the
        # options object itself into that flag, which is never a valid proxy
        # address. The proxy is already passed via seleniumwire_options.
        driver = webdriver.Chrome(executable_path=driver_path,
                                  options=opts,
                                  seleniumwire_options=wire_options)

    if driver is None:
        input("DRIVER NOT FOUND")
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist in frozen builds.
        sys.exit(0)

    driver.set_window_size(1920, 1080)
    if headers:
        driver._client.set_header_overrides(headers=headers)
    return driver
def get_firefox_options(heroku=False):
    """Return Firefox options for a headless 500x1024 window with a fixed user agent.

    :param heroku: accepted for compatibility; it currently has no effect on
        the returned options
    :return: a configured ``webdriver.FirefoxOptions`` instance
    """
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    # Firefox sizes its window through width/height switches; a bare
    # "window-size=..." argument is not a Firefox option.
    options.add_argument("--width=500")
    options.add_argument("--height=1024")
    # The user agent is a preference, not a command-line argument.
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    )
    return options
def get_driver(config_params):
    """
    Create a new hidden Firefox driver that sends HTTPS traffic through a proxy.

    :param config_params: mapping with the keys 'proxy_host' and 'proxy_port'
    :return: driver with a 7 second implicit wait
    :raises KeyError: if a proxy key is missing from config_params
    :raises ValueError: if 'proxy_port' cannot be converted to an integer
    """
    options = webdriver.FirefoxOptions()
    options.headless = True  # make it hidden

    proxy_host = config_params['proxy_host']
    # the port preference is an integer preference in Firefox
    proxy_port = int(config_params['proxy_port'])

    profile = webdriver.FirefoxProfile()
    # 1 = manual proxy configuration; without it the host/port are ignored
    profile.set_preference('network.proxy.type', 1)
    # Firefox names its HTTPS proxy preferences "ssl" / "ssl_port"
    profile.set_preference('network.proxy.ssl', proxy_host)
    profile.set_preference('network.proxy.ssl_port', proxy_port)

    # ``options=`` replaces the deprecated ``firefox_options=`` keyword
    created_driver = webdriver.Firefox(options=options, firefox_profile=profile)
    created_driver.implicitly_wait(7)
    return created_driver
def __init__(self, proxy=None, headless=True, wait_increment=WAIT_INCREMENT, id=None):
    """Store the settings and start a selenium-wire Firefox driver.

    :param proxy: optional mapping with 'username', 'password', 'host' and
        'port'; when given, an authenticated proxy is configured
    :param headless: add the '--headless' argument when true
    :param wait_increment: sleep increment kept on the instance
    :param id: identifier of the streamer this instance is associated with
    """
    self.wait_increment = wait_increment
    self.id = id

    # browser start-up arguments, '--headless' last and only on request
    startup_flags = [
        '--no-sandbox',
        '--window-size=1420,1080',
        '--disable-gpu',
        '--disable-notifications',
        '--dns-prefetch-disable',
        '--disable-dev-shm-usage',
    ]
    if headless:
        startup_flags.append('--headless')

    browser_options = webdriver.FirefoxOptions()
    for flag in startup_flags:
        browser_options.add_argument(flag)

    # selenium-wire proxy settings stay None when no proxy was supplied
    wire_options = None
    if proxy:
        user, password = proxy['username'], proxy['password']
        host, port = proxy['host'], proxy['port']
        endpoint = f"{user}:{password}@{host}:{port}"  # authenticated
        wire_options = {
            'proxy': {
                'http': f"http://{endpoint}",
                'https': f"https://{endpoint}",
                'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
            }
        }

    self.driver = webdriver.Firefox(options=browser_options,
                                    seleniumwire_options=wire_options)
def init_driver(self, driver_path):
    """Start Firefox, install the anti-captcha add-on and hand it the API key.

    :param driver_path: path of the driver executable passed to Firefox
    """
    # Proxy lookup is currently switched off.
    # upstream_proxy = get_proxy()
    upstream_proxy = None

    wire_options = {}
    if upstream_proxy:
        wire_options['proxy'] = {
            'http': upstream_proxy,
            'https': upstream_proxy,
            'no_proxy': '',
        }

    firefox_options = webdriver.FirefoxOptions()
    # True - the window is hidden, False - the window is shown
    firefox_options.headless = config.headless

    self.driver = webdriver.Firefox(options=firefox_options,
                                    seleniumwire_options=wire_options,
                                    executable_path=driver_path)

    # put the window in the top-left corner and give it a fixed size
    self.driver.set_window_position(0, 0)
    self.driver.set_window_size(100, 300)

    # install the plugin that solves the captcha
    addon_path = os.path.abspath('extensions/anticaptcha-plugin_v0.52.xpi')
    self.driver.install_addon(addon_path)

    # pass the API key to the captcha solver
    self.driver.get('https://antcpt.com/blank.html')
    solver_settings = {'options': {'antiCaptchaApiKey': config.CAPTCHA_KEY}}
    acp_api_send_request(self.driver, 'setOptions', solver_settings)
def login():
    """Log in with a requests session and copy the resulting cookies into Firefox.

    Starts a headless Firefox (stored in the module-level ``driver``), opens
    the site, mirrors the browser cookies into a ``requests`` session, reads
    the CSRF token from the landing page, posts the login form and finally
    adds the cookies of the login response to the browser.
    """
    global driver

    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('-headless')
    driver = webdriver.Firefox(options=firefox_options)

    driver.get('https://www.instacart.com')
    browser_cookies = driver.get_cookies()

    # carry the browser cookies over into the HTTP session
    session = requests.Session()
    for cookie in browser_cookies:
        session.cookies.set(cookie['name'], cookie['value'])

    landing = session.get('https://www.instacart.com/',
                          headers={'user-agent': 'Mozilla/5.0'})
    page = BeautifulSoup(landing.text, 'lxml')
    csrf_token = page.select_one("[name='csrf-token']").get('content')

    payload = {
        "user": {
            "email": "*****@*****.**",
            "password": "******"
        },
        "authenticity_token": csrf_token
    }
    ajax_headers = {
        'user-agent': 'Mozilla/5.0',
        'x-requested-with': 'XMLHttpRequest'
    }
    login_response = session.post("https://www.instacart.com/accounts/login",
                                  json=payload,
                                  headers=ajax_headers)

    # hand the cookies of the login response back to the browser
    login_cookies = [{'name': name, 'value': value}
                     for name, value in login_response.cookies.get_dict().items()]
    for cookie in login_cookies:
        driver.add_cookie(cookie)
# Locations of the input (main XHR responses) and output (lat/long lookup) files.
data_dir = os.path.join(BASE_DIR, 'data', 'taipei_shop_rent_price')
data_info_path = os.path.join(data_dir, 'data_info.csv')
data_info = DataInfo(data_info_path)
download_dirpath = data_info.get_download_dirpath()
main_xhr_response_filepath = os.path.join(download_dirpath,
                                          '591_xhr_responses.json')
output_filename = '591_lat_long_lookup.json'
output_filepath = os.path.join(download_dirpath, output_filename)
list_post_id = get_listing_list_id(main_xhr_response_filepath)

# set webdriver, request interceptor scope, and wait object
print("note: this scrapping will take hours (there are some brakes "
      "to respect the website). The program heavily depend on your internet connection")
print("INFO: setup crawler, use Firefox driver")
webdriver_options = webdriver.FirefoxOptions()
# 'hide' runs the browser headless, 'show' with a window; any other value
# leaves the option at its default.
if option == 'hide':
    webdriver_options.headless = True
elif option == 'show':
    webdriver_options.headless = False
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                           options=webdriver_options)
driver.set_page_load_timeout(60)
# Raw string: the pattern contains backslash escapes that are invalid in a
# normal string literal. The resulting pattern text is unchanged.
# NOTE(review): in "maps?" the "?" makes the "s" optional rather than
# matching a literal question mark - confirm this is intended.
url_regex = r'.*maps\.google\.com\.tw\/maps?.*'
driver.scopes = [url_regex]
start_url = 'https://www.591.com.tw/'
web_explorer = WebExplorer591(driver, start_url)
check_page(driver)
def bot(id):
    """Worker loop: open a URL through a freshly picked proxy, wait, quit, repeat.

    Each iteration picks a URL and a proxy, starts a Chrome or Firefox
    selenium-wire driver routed through that proxy, loads the URL and, when
    the page title ends with 'YouTube', waits for a duration before the
    driver is quit again. The loop has no normal exit; it ends through
    ``exit(0)`` on KeyboardInterrupt or ``exit(1)`` on any unexpected error.

    :param id: worker identifier, used in the log messages
    """
    global args, locks, urls, user_agents, referers, proxies, drivers, watched_videos
    while True:
        try:
            url = choice(urls)
            # locks[0] guards the shared proxy list: refill it when empty,
            # then take one proxy out so it is not handed out twice.
            with locks[0]:
                if len(proxies) == 0:
                    proxies.extend(get_proxies())
                proxy = choice(proxies)
                proxies.remove(proxy)
            log('[INFO][%d] Connecting to %s' % (id, proxy))
            # user_agents is a sequence when args.user_agent is set,
            # otherwise it is called to produce a user agent.
            user_agent = choice(
                user_agents) if args.user_agent else user_agents(
                    os=('win', 'android'))
            log('[INFO][%d] Setting user agent to %s' % (id, user_agent))
            # With slow start, locks[1] is held while the driver starts.
            # NOTE(review): the matching release() below is skipped when
            # driver start-up raises - confirm whether that is intended.
            if args.slow_start:
                locks[1].acquire()
            # Driver executables live in a per-user 'drivers' directory.
            if system() == 'Windows':
                executable_dir = path_join(environ['APPDATA'], 'DeBos',
                                           'drivers')
            else:
                executable_dir = path_join(environ['HOME'], '.DeBos',
                                           'drivers')
            seleniumwire_options = {
                'proxy': {
                    'http': 'http://%s' % proxy,
                    'https': 'https://%s' % proxy,
                    'no_proxy': 'localhost,127.0.0.1'
                }
            }
            if args.driver == 'chrome':
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_argument(
                    '--user-agent={}'.format(user_agent))
                chrome_options.add_argument('--mute-audio')
                chrome_options.add_experimental_option('excludeSwitches',
                                                       ['enable-logging'])
                if args.headless:
                    chrome_options.add_argument('--headless')
                if is_root():
                    chrome_options.add_argument('--no-sandbox')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'chromedriver.exe')
                else:
                    executable_path = path_join(executable_dir,
                                                'chromedriver')
                driver = webdriver.Chrome(
                    options=chrome_options,
                    seleniumwire_options=seleniumwire_options,
                    executable_path=executable_path)
            else:
                # Any driver name other than 'chrome' selects Firefox.
                firefox_options = webdriver.FirefoxOptions()
                firefox_options.preferences.update({
                    'media.volume_scale': '0.0',
                    'general.useragent.override': user_agent
                })
                if args.headless:
                    firefox_options.add_argument('--headless')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'geckodriver.exe')
                else:
                    executable_path = path_join(executable_dir,
                                                'geckodriver')
                driver = webdriver.Firefox(
                    options=firefox_options,
                    seleniumwire_options=seleniumwire_options,
                    service_log_path=devnull,
                    executable_path=executable_path)
            driver.header_overrides = {'Referer': choice(referers)}
            # Record the pids of the driver service process and its children
            # in the shared drivers list (presumably for cleanup elsewhere -
            # verify against the code that reads it).
            process = driver.service.process
            pid = process.pid
            cpids = [x.pid for x in Process(pid).children()]
            pids = [pid] + cpids
            drivers.extend(pids)
            if args.slow_start:
                locks[1].release()
            log('[INFO][%d] Successully started webdriver!' % id)
            driver.set_page_load_timeout(45)
            log('[INFO][%d] Opening %s' % (id, url))
            driver.get(url)
            # A title ending in 'YouTube' is treated as a successful load;
            # anything else is logged as a dead proxy.
            if driver.title.endswith('YouTube'):
                log('[INFO][%d] Video successfully loaded!' % id)
                # Click the large play button if it becomes clickable within
                # 3 seconds; any failure here is ignored.
                try:
                    WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable(
                            (By.CLASS_NAME, 'ytp-large-play-button'))).click()
                except:
                    pass
                if args.duration:
                    sleep(args.duration)
                else:
                    # No fixed duration: wait for a random 35%-85% share of
                    # the duration reported by the video element.
                    video = WebDriverWait(driver, 3).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, 'html5-main-video')))
                    video_duration = driver.execute_script(
                        'return arguments[0].getDuration()', video)
                    sleep(float(video_duration) * uniform(0.35, 0.85))
                log('[INFO][%d] Video successfully viewed!' % id)
                if not args.verbose:
                    watched_videos += 1
            else:
                log('[INFO][%d] Dead proxy eliminated!' % id)
        except WebDriverException as e:
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
        except NoSuchProcess:
            log('[WARNING][%d] NoSuchProcess' % id)
        except KeyboardInterrupt:
            exit(0)
        # NOTE(review): every other error ends in exit(1) without being
        # logged.
        except:
            exit(1)
        finally:
            log('[INFO][%d] Quitting webdriver!' % id)
            # 'driver' and 'pids' may not be bound yet when the failure
            # happened before the driver was created; probing the name and
            # catching NameError covers that case.
            try:
                driver
            except NameError:
                pass
            else:
                driver.quit()
            # locks[2] guards the removal of this worker's pids from the
            # shared drivers list; pids that are already gone are skipped.
            with locks[2]:
                try:
                    pids
                except NameError:
                    pass
                else:
                    for pid in pids:
                        try:
                            drivers.remove(pid)
                        except:
                            pass
def bot(id):
    """Worker loop: open a URL through a freshly picked proxy, click, quit, repeat.

    Each iteration picks a URL and a proxy, starts a Chrome or Firefox
    selenium-wire driver routed through that proxy and loads the URL. When
    the page title matches the expected one, it waits for the element with
    the id 'skip_bu2tton' to become clickable and clicks it. The loop has no
    normal exit; it ends through ``exit(0)`` on KeyboardInterrupt or
    ``exit(1)`` on any unexpected error.

    :param id: worker identifier, used in the log messages
    """
    global args, locks, urls, user_agents, referers, proxies, drivers, watched_ads
    while True:
        try:
            url = choice(urls)
            # locks[0] guards the shared proxy list: refill it when empty,
            # then take one proxy out so it is not handed out twice.
            with locks[0]:
                if len(proxies) == 0:
                    proxies.extend(get_proxies())
                proxy = choice(proxies)
                proxies.remove(proxy)
            log('[INFO][%d] Connecting to %s' % (id, proxy))
            # user_agents is a sequence when args.user_agent is set,
            # otherwise it is called to produce a user agent.
            user_agent = choice(
                user_agents) if args.user_agent else user_agents()
            log('[INFO][%d] Setting user agent to %s' % (id, user_agent))
            # With slow start, locks[1] is held while the driver starts.
            # NOTE(review): the matching release() below is skipped when
            # driver start-up raises - confirm whether that is intended.
            if args.slow_start:
                locks[1].acquire()
            # Driver executables live in a per-user 'drivers' directory.
            if system() == 'Windows':
                executable_dir = path_join(environ['APPDATA'], 'DeBos',
                                           'drivers')
            else:
                executable_dir = path_join(environ['HOME'], '.DeBos',
                                           'drivers')
            seleniumwire_options = {
                'proxy': {
                    'http': 'http://%s' % proxy,
                    'https': 'https://%s' % proxy,
                    'no_proxy': 'localhost,127.0.0.1'
                }
            }
            if args.driver == 'chrome':
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_argument(
                    '--user-agent={}'.format(user_agent))
                chrome_options.add_argument('--mute-audio')
                chrome_options.add_argument("--disable-extensions")
                chrome_options.add_argument("--disable-gpu")
                chrome_options.add_argument("--disable-dev-shm-usage")
                chrome_options.add_argument("--no-sandbox")
                chrome_options.add_experimental_option('excludeSwitches',
                                                       ['enable-logging'])
                if args.headless:
                    chrome_options.add_argument('--headless')
                # '--no-sandbox' was already added above, so this adds the
                # same argument a second time when running as root.
                if is_root():
                    chrome_options.add_argument('--no-sandbox')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'chromedriver.exe')
                else:
                    executable_path = path_join(executable_dir,
                                                'chromedriver')
                driver = webdriver.Chrome(
                    options=chrome_options,
                    seleniumwire_options=seleniumwire_options,
                    executable_path=executable_path)
            else:
                # Any driver name other than 'chrome' selects Firefox.
                firefox_options = webdriver.FirefoxOptions()
                firefox_options.preferences.update({
                    'media.volume_scale': '0.0',
                    'general.useragent.override': user_agent
                })
                if args.headless:
                    firefox_options.add_argument('--headless')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'geckodriver.exe')
                else:
                    executable_path = path_join(executable_dir,
                                                'geckodriver')
                driver = webdriver.Firefox(
                    options=firefox_options,
                    seleniumwire_options=seleniumwire_options,
                    service_log_path=devnull,
                    executable_path=executable_path)
            driver.header_overrides = {'Referer': choice(referers)}
            # Record the pids of the driver service process and its children
            # in the shared drivers list (presumably for cleanup elsewhere -
            # verify against the code that reads it).
            process = driver.service.process
            pid = process.pid
            cpids = [x.pid for x in Process(pid).children()]
            pids = [pid] + cpids
            drivers.extend(pids)
            if args.slow_start:
                locks[1].release()
            log('[INFO][%d] Successully started webdriver!' % id)
            driver.set_page_load_timeout(60)
            log('[INFO][%d] Opening %s' % (id, url))
            driver.get(url)
            # An exact title match is treated as a successful load; anything
            # else is logged as a dead proxy.
            if driver.title == 'Shrink your URLs and get paid!':
                log('[INFO][%d] Website successfully loaded!' % id)
                # Wait up to 10 seconds for the element to become clickable.
                # NOTE(review): the id 'skip_bu2tton' looks like it may be a
                # typo - confirm against the page markup.
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.ID, 'skip_bu2tton'))).click()
                log('[INFO][%d] Ad successfully viewed!' % id)
                if not args.verbose:
                    watched_ads += 1
            else:
                log('[WARNING][%d] Dead proxy eliminated!' % id)
        except WebDriverException as e:
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
        except KeyboardInterrupt:
            exit(0)
        # NOTE(review): every other error ends in exit(1) without being
        # logged.
        except:
            exit(1)
        finally:
            log('[INFO][%d] Quitting webdriver!' % id)
            # 'driver' and 'pids' may not be bound yet when the failure
            # happened before the driver was created; probing the name and
            # catching NameError covers that case.
            try:
                driver
            except NameError:
                pass
            else:
                driver.quit()
            # locks[2] guards the removal of this worker's pids from the
            # shared drivers list; pids that are already gone are skipped.
            with locks[2]:
                try:
                    pids
                except NameError:
                    pass
                else:
                    for pid in pids:
                        try:
                            drivers.remove(pid)
                        except:
                            pass
def csdn(url):
    """Open a CSDN blog post in Firefox and save its content as a PDF file.

    The page is loaded with overridden request headers, the "read more"
    button is clicked when present, and the ``blog-content-box`` element
    (or, failing that, the whole page source) is rendered to
    ``pdffile<N>.pdf`` with pdfkit, where N is a random integer from 0 to
    10. Progress is printed along the way. The browser is always quit, even
    when loading the page fails.

    :param url: address of the blog post to convert
    """
    print('CSDN')
    option = webdriver.FirefoxOptions()
    time.sleep(3)
    # ``options=`` replaces the deprecated ``firefox_options=`` keyword.
    driver = webdriver.Firefox(options=option)
    try:
        driver.implicitly_wait(15)
        driver.header_overrides = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Cache-Control': 'max-age=0',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
            'Connection': 'keep-alive',
            'Referer':
            'https://blog.csdn.net/haibo0668/article/details/80025077'
        }
        driver.get(url)
        print("a1")

        cookie_ori = driver.get_cookies()
        print(cookie_ori)
        print(type(cookie_ori))
        print(len(cookie_ori))
        # pdfkit expects the cookies as (name, value) pairs
        cookie_pairs = []
        for cookie in cookie_ori:
            pair = (cookie['name'], cookie['value'])
            print(pair)
            cookie_pairs.append(pair)
        print(cookie_pairs)

        options = {
            'encoding': 'UTF-8',
            'custom-header': [
                ('Accept', '*/*'),
                ('Accept-Language', 'zh-CN,zh;q=0.9'),
                ('Cache-Control', 'max-age=0'),
                ('User-Agent',
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
                 ),
                ('Connection', 'keep-alive'),
                ('Referer',
                 'https://blog.csdn.net/haibo0668/article/details/80025077'),
                ('Accept-Encoding', 'gzip, deflate, br'),
                ('Host', 'img-blog.csdn.net')
            ],
            'cookie': cookie_pairs
        }
        locator = (By.ID, "btn-readmore")
        randomInt = random.randint(0, 10)
        fileName = 'pdffile' + str(randomInt) + '.pdf'
        print('filename + ' + str(fileName))

        # Wait for the "read more" button; a timeout is reported, not fatal.
        try:
            WebDriverWait(driver, 15, 0.5).until(
                EC.presence_of_element_located(locator))
            print('阅读更多按钮找到')
        except Exception as e:
            print(e)
            print("等待错了")

        try:
            time.sleep(2)
            print("开始爬取网页")
            logging.warning("开始爬取网页")
            time.sleep(3)
            html = driver.page_source
            print(type(html))
            print(html)
            time.sleep(2)
            print("找阅读更多按钮")
            # Expand the article when the button exists; otherwise continue
            # with the page source captured above.
            try:
                # find_element(By.ID, ...) is used because the
                # find_element_by_id helper is gone in current Selenium.
                target = driver.find_element(By.ID, "btn-readmore")
                driver.execute_script("arguments[0].scrollIntoView();",
                                      target)
                target.click()
                print("找阅读更多按钮OK")
                html = driver.page_source
                print(type(html))
            except Exception as e:
                print(e)
            # Narrow the HTML down to the article body when it can be found.
            try:
                time.sleep(1)
                print("找main Body")
                content_div = etree.HTML(html).xpath(
                    '//div[@class="blog-content-box"]')[0]
                content_byte = etree.tostring(content_div)
                content_str = bytes.decode(content_byte)
                html = content_str
                print("找main Body OK")
                time.sleep(1)
            except Exception as e:
                print(e)
            print('test' + str(fileName))
            pdfkit.from_string(html, fileName, options=options)
            print(fileName)
        except Exception as e:
            print(e)
            print("大概率按钮没找到")
            # Second attempt at writing the PDF with whatever HTML is
            # available at this point.
            try:
                print('test' + str(fileName))
                print("pdf2")
                pdfkit.from_string(html, fileName, options=options)
                print(fileName)
                print("pdf")
            except Exception as e:
                print(e)
                print("OK4")
    finally:
        # Always shut the browser down, including when driver.get() raises.
        driver.quit()
def generate_webdriver(headless=False, log_file=None):
    """Return a new Firefox driver, optionally running headless.

    :param headless: when true, Firefox is started with ``-headless`` and
        that fact is reported through ``log_info``
    :param log_file: forwarded to ``log_info`` as ``log_file``
    :return: the started ``webdriver.Firefox`` instance
    """
    options = webdriver.FirefoxOptions()
    if headless:
        log_info("Started in headless mode", log_file=log_file)
        options.add_argument("-headless")
    # ``options=`` replaces the deprecated ``firefox_options=`` keyword.
    return webdriver.Firefox(options=options)