def scrap(to_sql=False):
    """Scrape NASDAQ's "most active" share-volume table and render it as HTML.

    A headless Firefox session loads the page; every table row becomes a dict
    with the keys ``symbol``, ``name``, ``last``, ``change`` and ``volume``
    (raw cell text). The rows are then rendered through the
    ``scraper/index.html`` template as an HTML table.

    Args:
        to_sql: When true, each row is additionally stored as a
            ``MostActive`` record and committed through ``db.session``.

    Returns:
        The value returned by ``render_template`` for ``scraper/index.html``.

    Raises:
        IndexError: If a row holds fewer than five matching cells.
        ValueError: If ``to_sql`` is set and a numeric cell cannot be parsed.
    """
    cell_selector = "td.most-active__cell.most-active__cell--heading"
    columns = ("symbol", "name", "last", "change", "volume")

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    driver = webdriver.Firefox(options=opts)
    most_active_list = []
    try:
        most_active_url = "https://www.nasdaq.com/market-activity/most-active"
        driver.get(most_active_url)
        container = driver.find_element_by_css_selector(
            'div.most-active__data-container--share-volume')
        rows = container.find_elements_by_css_selector("tr.most-active__row")
        for row in rows:
            # One DOM query per row instead of one per column.
            cells = row.find_elements_by_css_selector(cell_selector)
            most_active = {
                column: cells[position].text
                for position, column in enumerate(columns)
            }
            if to_sql:
                db.session.add(MostActive(
                    most_active["symbol"],
                    most_active["name"],
                    # The first character of "last" is skipped
                    # (presumably a currency sign -- verify against the page).
                    float(most_active["last"][1:]),
                    float(most_active["change"]),
                    float(most_active["volume"].replace(",", "")),
                ))
                db.session.commit()
            most_active_list.append(most_active)
    finally:
        # Always end the browser session, even when scraping fails.
        driver.quit()

    df = pd.DataFrame(most_active_list)
    return render_template('scraper/index.html',
                           tables=[df.to_html(classes='data')],
                           titles=df.columns.values)
def __init__(self, browser, user_data=''):
    """
    Start the web driver selected by ``browser`` and keep it on
    ``self.driver``.

    Recognised values are "ff", "ff_headless", "chrome", "chrome_headless",
    "chrome_user_data", "internet explorer" / "ie", "opera" and "edge".
    ``user_data`` is only used by "chrome_user_data", where it is handed to
    Chrome as a command-line argument.

    Any other value raises NameError.
    """
    if browser == "ff":
        driver = webdriver.Firefox()
    elif browser == "ff_headless":
        firefox_opts = FirefoxOptions()
        firefox_opts.set_headless()
        driver = webdriver.Firefox(firefox_options=firefox_opts)
    elif browser == "chrome":
        driver = webdriver.Chrome()
    elif browser in ("internet explorer", "ie"):
        driver = webdriver.Ie()
    elif browser == "opera":
        driver = webdriver.Opera()
    elif browser == "chrome_headless":
        headless_opts = ChromeOptions()
        headless_opts.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=headless_opts)
    elif browser == 'chrome_user_data':
        # Chrome's own details can be inspected at chrome://version/
        profile_opts = ChromeOptions()
        profile_opts.add_argument(user_data)
        driver = webdriver.Chrome(chrome_options=profile_opts)
    elif browser == 'edge':
        driver = webdriver.Edge()
    else:
        raise NameError(
            "Not found %s browser,You can enter 'ie', 'ff', 'opera', 'edge', 'chrome' or 'chrome_headless'."
            % browser)
    self.driver = driver
def login1():
    """Log in through a headless Firefox session and return the session cookie.

    The login form is filled in and submitted, the function waits five
    seconds, and the browser's cookies are then searched for ``JSESSIONID``.

    Returns:
        dict: ``{'JSESSIONID': <cookie value>}``.

    Raises:
        RuntimeError: If the browser holds no ``JSESSIONID`` cookie after the
            login attempt (previously this surfaced as an unbound-variable
            error).
    """
    opt = FirefoxOptions()
    opt.headless = True  # no visible window; works on Windows and Linux
    driver = Firefox(options=opt)
    try:
        # Selenium login against the test deployment.
        driver.get("http://192.168.6.27:6030/passports/login?service=http%3A%2F%2F192.168.6.27%3A6030%2Fportals%2Fcas&tenantCode=cqsh&trial=false")
        driver.find_element(By.ID, "username").send_keys("test")
        driver.find_element(By.ID, "pwd1").send_keys("1")
        driver.find_element(By.CSS_SELECTOR, ".justUse").click()
        time.sleep(5)  # fixed pause after submitting the form
        browser_cookies = driver.get_cookies()
    finally:
        # quit() ends the session and the driver process, also on failure.
        driver.quit()

    # Pick the JSESSIONID cookie; as before, the last match wins.
    session_cookie = None
    for cookie in browser_cookies:
        if cookie['name'] == 'JSESSIONID':
            session_cookie = cookie
    if session_cookie is None:
        raise RuntimeError("JSESSIONID cookie not found after login")
    return {'JSESSIONID': session_cookie['value']}
def scrape_video_no_protection(url: str) -> str:
    """
    Gets video url directly from page.

    Opens ``url`` in a headless Firefox, clicks the play button so the page
    loads the video, and reads the ``src`` attribute of the video element.
    The browser session is always shut down before returning or raising.

    Raises a NoVideoAvailableException if no player is found
    """
    options = FirefoxOptions()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        try:
            # Clicking play makes the page load the video url.
            play_button = driver.find_element_by_xpath(
                "//div[@class = '{}']".format(PLAY_BUTTON_CLASS))
            play_button.click()
            # Read the video url from the page once it is loaded.
            video_player_element = driver.find_element_by_xpath(
                "//video[@class = '{}']".format(VIDEO_ELEMENT_CLASS))
            video_url = video_player_element.get_attribute('src')
        except NoSuchElementException as err:
            raise NoVideoAvailableException(NO_VIDEO_MESSAGE) from err
        return video_url
    finally:
        # Runs on success and on failure, so no browser is left behind.
        driver.quit()
def __enter__(self):
    """Start a headless Firefox with a new profile object, store it on
    ``self.browser`` and return ``self``."""
    headless_options = FirefoxOptions()
    headless_options.set_headless(True)
    fresh_profile = FirefoxProfile()
    self.browser = Firefox(
        firefox_profile=fresh_profile,
        firefox_options=headless_options,
    )
    return self
def __init__(
    self,
    firefox_options: FirefoxOptions = None,
    desired_capabilities: dict = None,
    token: str = None,
    project_name: str = None,
    job_name: str = None,
    disable_reports: bool = False,
    report_type: ReportType = ReportType.CLOUD_AND_LOCAL,
):
    """Resolve the capabilities to use and forward everything to the parent
    initializer.

    Capability resolution:
      * ``firefox_options`` given -> its ``to_capabilities()`` result
        (takes precedence over ``desired_capabilities``);
      * otherwise ``desired_capabilities`` if given;
      * otherwise the capabilities of a default ``FirefoxOptions()``.

    All remaining arguments are passed to the parent ``__init__`` unchanged.
    """
    if firefox_options is not None:
        caps = firefox_options.to_capabilities()
    elif desired_capabilities is not None:
        caps = desired_capabilities
    else:
        caps = FirefoxOptions().to_capabilities()

    super().__init__(
        capabilities=caps,
        token=token,
        project_name=project_name,
        job_name=job_name,
        disable_reports=disable_reports,
        report_type=report_type,
    )
def write_tf(self, filesize, threadnum, num_tfrecords=10):
    '''This function writes tfrecords.

    Input parameters are:
        filesize: number of images in one tfrecord
        threadnum: thread id (used in the file name and progress messages)
        num_tfrecords: how many tfrecord files to generate

    For every file, tables are generated through ``self.generate_tables``
    with a shared headless Firefox driver. A file is only written when
    exactly ``filesize`` entries were generated; otherwise that file is
    skipped. Files are GZIP-compressed and placed in ``self.outtfpath``.
    The driver is shut down even if generation or writing fails.
    '''
    options = tf.io.TFRecordOptions(tf.io.TFRecordCompressionType.GZIP)
    opts = FirefoxOptions()
    opts.headless = True
    driver = Firefox(executable_path='./geckodriver', options=opts)
    try:
        for idx in range(num_tfrecords):
            starttime = time.time()
            output_file_name = '{:02d}_{:08d}.tfrecord'.format(threadnum, idx)
            print('+Thread {} [{}/{}] generating ...'.format(threadnum, idx+1, num_tfrecords))

            # The second return value (table categories) is not used here.
            data_arr, _ = self.generate_tables(driver, filesize, output_file_name)
            if data_arr is None or len(data_arr) != filesize:
                continue

            record_path = os.path.join(self.outtfpath, output_file_name)
            with tf.io.TFRecordWriter(record_path, options=options) as writer:
                for imgindex, subarr in enumerate(data_arr):
                    arr = subarr[0]
                    # Only channel 0 of the image is kept.
                    img = np.asarray(subarr[1][0], np.int64)[:, :, 0]
                    colmatrix = np.array(arr[1], dtype=np.int64)
                    cellmatrix = np.array(arr[2], dtype=np.int64)
                    rowmatrix = np.array(arr[0], dtype=np.int64)
                    bboxes = np.array(arr[3])
                    tablecategory = arr[4][0]
                    seq_ex = self.generate_tf_record(
                        img, cellmatrix, rowmatrix, colmatrix, bboxes,
                        tablecategory, imgindex, output_file_name)
                    writer.write(seq_ex.SerializeToString())
                print('--- thread {} [{}/{}] completed in {:.3f}'.format(threadnum, idx+1, num_tfrecords, time.time() - starttime))
    finally:
        # Release the browser no matter how the loop ended.
        driver.stop_client()
        driver.quit()
def __init__(self, browser='Chrome'):
    """Prepare headless browser options, capabilities and Selenoid settings.

    ``FirefoxOptions`` are used when ``browser`` is ``'Firefox'``; any other
    value selects ``ChromeOptions``. The Selenoid host and port are read
    from the ``SELENOID_HOST`` / ``SELENOID_PORT`` environment variables
    (port defaults to ``"4444"``). No driver is started here.
    """
    options_cls = FirefoxOptions if browser == 'Firefox' else ChromeOptions
    self.opts = options_cls()
    self.opts.headless = True
    for flag in (
        '--no-sandbox',
        '--disable-extensions',
        '--ignore-certificate-errors',
        '--disable-gpu',
        "--start-maximized",
        "--enable-logging",
        "--enable-automation",
    ):
        self.opts.add_argument(flag)

    # Work on a copy so the options object's own capabilities stay untouched.
    capabilities = self.opts.capabilities.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True
    capabilities['goog:loggingPrefs'] = {
        'browser': 'ALL',
        'performance': 'ALL'
    }
    self.capabilities = capabilities

    environ = os.environ
    self.selenoid = {
        'host': environ.get("SELENOID_HOST"),
        'port': environ.get("SELENOID_PORT", "4444")
    }

    self.driver = None
    self.ui = None
    self.adcm = None
    self._client = None
def __init__(self, headless=True, options=(), path=r'myengine\geckodriver'):
    """Start Firefox with the given command-line arguments.

    Args:
        headless: Value assigned to the options' ``headless`` flag.
        options: Iterable of command-line arguments added to the Firefox
            options one by one. ``None`` is treated like an empty iterable.
        path: Location of the geckodriver executable. The default is the
            same text as before, now written as a raw string so the
            backslash is not read as an (invalid) escape sequence.

    Both base initializers, ``Firefox.__init__`` and ``Browser.__init__``,
    are called explicitly, in that order.
    """
    browser_options = FirefoxOptions()
    # The immutable default avoids sharing one list object across calls.
    for argument in options or ():
        browser_options.add_argument(argument)
    browser_options.headless = headless
    Firefox.__init__(self, options=browser_options, executable_path=path)
    Browser.__init__(self)
def prepare_browsers(headless: bool, driver_path: str,
                     twitter_profile_path: str) -> Browsers:
    """
    Sets up the three Firefox instances used to search accounts.

    :param headless: Should search be performed in headless mode
    :param driver_path: Path to geckodriver; a falsy value falls back to
        ``"geckodriver"``
    :param twitter_profile_path: Path to twitter profile folder
    :return: ``Browsers`` built from the LinkedIn, Xing and Twitter drivers.
        Login data is applied to the LinkedIn and Xing drivers via
        ``set_login_data``; the Twitter driver runs on the given profile.
    """
    logging.info("Running Twitter scraper from profile in %s",
                 twitter_profile_path)
    gecko_path = driver_path or "geckodriver"

    shared_profile = FirefoxProfile()
    twitter_profile = FirefoxProfile(twitter_profile_path)
    # NOTE(review): presumably these two preferences keep the profile's
    # add-ons enabled -- confirm against the Firefox preference docs.
    frozen_prefs = twitter_profile.DEFAULT_PREFERENCES["frozen"]
    frozen_prefs["extensions.autoDisableScopes"] = 0
    twitter_profile.set_preference("extensions.enabledScopes", 15)

    logins = social_media_logins(gecko_path, shared_profile)

    driver_options = FirefoxOptions()
    driver_options.headless = headless

    def start_firefox(firefox_profile):
        # All three browsers share the same options and geckodriver path.
        return Firefox(options=driver_options,
                       firefox_profile=firefox_profile,
                       executable_path=gecko_path)

    linked_in_driver = start_firefox(shared_profile)
    xing_driver = start_firefox(shared_profile)
    twitter_driver = start_firefox(twitter_profile)

    set_login_data(linked_in_driver, logins[0])
    set_login_data(xing_driver, logins[1])
    retoggleAllTheAddons(twitter_driver)

    return Browsers(linked_in_driver, xing_driver, twitter_driver)
def bake_chapters(start, stop):
    """
    Use Selenium to get the live javascript rendered webpage and then save it

    requires a geckodriver to be somewhere in the PATH

    For every chapter the inner HTML of the element with class ``container``
    is written to ``<ESTORIA_LOCATION>/edition/critical/<chapter>.html``.
    The browser is shut down when the task finishes, also if a chapter fails.

    :param start: start with this chapter
    :param stop: stop at this chapter (inclusive)
    """
    task_id = current_task.request.id
    logger.info('{}: bake_chapters task started'.format(task_id))
    logger.debug('{}: Baking chapters: {} to {}'.format(task_id, start, stop))

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    driver = webdriver.Firefox(firefox_options=opts)
    try:
        for i in range(start, stop + 1):
            logger.debug('{}: Bake chapter: {}'.format(task_id, i))
            url = settings.BAKING_WEBPAGES_BASEURL + 'chapter/?chapter={}'.format(i)
            driver.get(url)
            container = driver.find_element_by_class_name(
                'container').get_attribute('innerHTML')
            target = os.path.join(settings.ESTORIA_LOCATION,
                                  'edition/critical', str(i) + '.html')
            with open(target, 'w', encoding='utf-8') as f:
                f.write(container)
    finally:
        # Without this every task run would leave a Firefox process behind.
        driver.quit()

    logger.info('{}: complete'.format(task_id))
def launch_application(browser_name, app_url):
    """Start the requested browser, store it in the global ``driver`` and
    open ``app_url``.

    Args:
        browser_name: ``"chrome"``, ``"firefox"`` or ``"ie"``.
        app_url: Address loaded once the browser is running.

    Raises:
        ValueError: If ``browser_name`` is not one of the supported names.
        WebDriverException: Re-raised after logging when the browser cannot
            be started. Previously the error was swallowed and the code went
            on to use an unset (or stale) global ``driver``.
    """
    global driver
    log.info("in init method of selenium base")
    try:
        if browser_name == "chrome":
            option = ChromeOptions()
            option.add_argument("start-maximized")
            option.add_argument("--ignore-certificate-errors")
            option.add_argument("--disable-extensions")
            option.add_argument("--disable-infobars")
            option.add_argument("disable-notifications")
            driver = Chrome(executable_path="./drivers/chromedriver.exe",
                            options=option)
            log.info("chrome browser is launch successfully")
        elif browser_name == "firefox":
            profile = FirefoxProfile()
            profile.accept_untrusted_certs = True
            # The profile was built before but never handed to the driver.
            # NOTE(review): the original also built FirefoxOptions with a
            # "start-maximized" argument that was never passed on; it is
            # left out here -- confirm whether the window should be
            # maximized (e.g. via driver.maximize_window()).
            driver = Firefox(firefox_profile=profile,
                             executable_path="./drivers/geckodriver.exe")
            log.info("firefox browser is launch successfully")
        elif browser_name == "ie":
            driver = Ie(executable_path="./drivers/IEDriverServer.exe")
        else:
            # The value needs a placeholder, otherwise logging cannot format it.
            log.error("browser name is incorrect: %s", browser_name)
            raise ValueError(
                "browser name is incorrect: %r" % (browser_name,))
    except WebDriverException:
        # exc_info records the actual exception instance and traceback.
        log.critical("exception", exc_info=True)
        raise
    driver.implicitly_wait(5)
    driver.get(app_url)
def test4():
    """Open baidu.com in a headless Firefox and print the resulting URL.

    The browser session is always ended, also when loading the page fails.
    """
    options = FirefoxOptions()
    options.add_argument('--headless')
    dr = webdriver.Firefox(firefox_options=options)
    try:
        dr.get("https://www.baidu.com")
        print(dr.current_url)
    finally:
        # quit() ends the whole session, not just the current window.
        dr.quit()
def __init__(self):
    """
    Start a headless Firefox, keep it on ``self.driver`` and then run
    ``self.initiate()``.
    """
    headless_options = FirefoxOptions()
    headless_options.add_argument("--headless")
    firefox = webdriver.Firefox(firefox_options=headless_options)
    self.driver = firefox
    self.initiate()
def test_passing_firefox_options(self):
    """Options given to ``get_webdriver_for`` must be visible in the
    driver's ``moz:headless`` capability."""
    headless_options = FirefoxOptions()
    headless_options.add_argument("--headless")

    self.driver = get_webdriver_for("firefox", options=headless_options)

    reported = self.driver.capabilities
    self.assertTrue(reported["moz:headless"])
def launch_browser(self, browser_name, url):
    """Start the requested browser, store it in the global ``driver`` and
    open ``url``.

    Args:
        browser_name: ``"chrome"``, ``"firefox"`` or ``"ie"``.
        url: Address loaded once the browser is running.

    Raises:
        ValueError: If ``browser_name`` is not supported.
        WebDriverException: Re-raised after logging when the browser cannot
            be started. Previously the error was swallowed and the code went
            on to use an unset (or stale) global ``driver``.
    """
    global driver
    try:
        if browser_name == "chrome":
            chromeoptions = ChromeOptions()
            chromeoptions.add_argument("start-maximized")
            chromeoptions.add_argument("disable-notifications")
            chromeoptions.add_argument("--ignore-certificate-errors")
            chromeoptions.add_argument("--disable-infobars")
            chromeoptions.add_argument("--disable-extensions")
            driver = webdriver.Chrome(
                executable_path="./drivers/chromedriver.exe",
                options=chromeoptions)
            log.info("chrome browser launch successfully")
        elif browser_name == "firefox":
            firefoxoptions = FirefoxOptions()
            # NOTE(review): "start-maximize" looks like a typo of Chrome's
            # "start-maximized" switch; confirm Firefox/IE accept it or
            # switch to driver.maximize_window().
            firefoxoptions.add_argument("start-maximize")
            driver = webdriver.Firefox(
                executable_path="./drivers/geckodriver.exe",
                options=firefoxoptions)
            log.info("firefox browser launch successfully")
        elif browser_name == "ie":
            ieoptions = IeOptions()
            ieoptions.add_argument("start-maximize")
            driver = webdriver.Ie(
                executable_path="./drivers/IEDriverServer.exe",
                options=ieoptions)
            log.info("ie browser launch successfully")
        else:
            log.error("invalid browser name")
            raise ValueError("invalid browser name: %r" % (browser_name,))
    except WebDriverException as e:
        # A placeholder is required; otherwise logging fails to format
        # the record and the message is lost.
        log.error("exception %s", e)
        raise
    driver.implicitly_wait(10)
    driver.get(url)
def __init__(self, browser="Chrome", downloads: Optional[Union[os.PathLike, str]] = None):
    """Prepare headless browser options, capabilities, Selenoid settings
    and the download configuration.

    ``FirefoxOptions`` are used when ``browser`` is ``"Firefox"``; any other
    value selects ``ChromeOptions``. The window size is 1366x768, expressed
    as one ``--window-size`` flag for ``"Chrome"`` and as ``--width`` /
    ``--height`` otherwise. ``browser`` and ``downloads`` are handed to
    ``self._configure_downloads``. No driver is started here.
    """
    self.opts = FirefoxOptions() if browser == "Firefox" else ChromeOptions()
    self.opts.headless = True

    flags = [
        "--no-sandbox",
        "--disable-extensions",
        "--ignore-certificate-errors",
        "--disable-gpu",
        "--start-maximized",
        "--enable-logging",
        "--enable-automation",
    ]
    if browser == "Chrome":
        flags.append("--window-size=1366,768")
    else:
        flags.extend(("--width=1366", "--height=768"))
    for flag in flags:
        self.opts.add_argument(flag)

    # Work on a copy so the options object's own capabilities stay untouched.
    capabilities = self.opts.capabilities.copy()
    capabilities["acceptSslCerts"] = True
    capabilities["acceptInsecureCerts"] = True
    capabilities["goog:loggingPrefs"] = {
        "browser": "ALL",
        "performance": "ALL"
    }
    self.capabilities = capabilities

    environ = os.environ
    self.selenoid = {
        "host": environ.get("SELENOID_HOST"),
        "port": environ.get("SELENOID_PORT", "4444"),
    }
    self._configure_downloads(browser, downloads)
    self.driver = None
    self.adcm = None
def __init__(self, config: dict):
    """
    Constructor

    Reads the logger name, the mode and the firefox settings from
    ``config`` and prepares headless options plus a profile. The driver
    itself is not started here.

    @param config the configuration to load options from
    """
    self.logger = logging.getLogger(config["log"]["name"])

    # Headless mode avoids the graphical overhead.
    headless_options = FirefoxOptions()
    headless_options.set_headless(True)

    firefox_profile = FirefoxProfile()
    if config["mode"] == "tor":
        # Send traffic (including DNS) through the SOCKS proxy on
        # 127.0.0.1:9050.
        tor_proxy_prefs = (
            ("network.proxy.type", 1),
            ("network.proxy.socks", "127.0.0.1"),
            ("network.proxy.socks_port", 9050),
            ("network.proxy.socks_remote_dns", True),
        )
        for pref_name, pref_value in tor_proxy_prefs:
            firefox_profile.set_preference(pref_name, pref_value)

    firefox_config = config["firefox"]
    self.retries = firefox_config["retries"]
    self.page_timeout = firefox_config["page_timeout"]
    self.options = headless_options
    self.profile = firefox_profile

    # No driver yet.
    self.driver = None
def driver_open(url, the_encoding="utf-8", timeout=3):
    """Load ``url`` in a headless Firefox and return the parsed page.

    The page load is limited to ``timeout`` seconds. If loading raises
    (for instance on a page-load timeout), the function waits five seconds
    and still reads whatever HTML the document holds at that point.

    Args:
        url: Address to open.
        the_encoding: Encoding applied to the HTML before it is given to
            ``BS``.
        timeout: Page-load timeout in seconds.

    Returns:
        The object produced by ``BS`` from the document's inner HTML.
    """
    from selenium.webdriver import FirefoxOptions
    from selenium import webdriver
    import time

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    driver = webdriver.Firefox(firefox_options=opts)
    try:
        driver.set_page_load_timeout(timeout)
        try:
            driver.get(url)
        except Exception:
            # Best effort: the load may time out although the scripts have
            # just finished. Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit through.
            time.sleep(5)
        print("++++++++++++++++++++++++++++++++++++++++")
        print("++++++++++++ run finnaly +++++++++++++++")
        print("++++++++++++++++++++++++++++++++++++++++")
        html2 = driver.execute_script(
            "return document.documentElement.innerHTML;")
        # NOTE(review): no parser is named for BS; behavior kept as is.
        return BS(html2.encode(the_encoding))
    finally:
        # Ends the session even if script execution or parsing fails.
        driver.quit()
def __init__(self, firefox_config: dict, tor_port: int):
    """
    Constructor

    Stores headless Firefox options and the numeric settings from the
    configuration. The driver itself is not started here.

    @param firefox_config mapping that provides "retries", "wait_tag",
                          "load_images", "clean_frequency" and a "timeout"
                          mapping with "page" and "element"
    @param tor_port       stored unchanged on self.tor_port
    """
    self.logger = logging.getLogger()

    # Headless mode avoids the graphical overhead.
    headless_options = FirefoxOptions()
    headless_options.set_headless(True)
    self.options = headless_options

    # Values from the config; numeric ones are coerced with int().
    self.retries = int(firefox_config["retries"])
    self.wait_tag = firefox_config["wait_tag"]
    self.load_images = int(firefox_config["load_images"])
    self.clean_frequency = int(firefox_config["clean_frequency"])
    timeouts = firefox_config["timeout"]
    self.page_timeout = int(timeouts["page"])
    self.element_timeout = int(timeouts["element"])

    self.tor_port = tor_port

    # Filled in later.
    self.driver = None
    self.mode = None
    self.profile = None
def __init__(self, position, lit, time):
    """Set up the search state, the detail CSV file and a headless browser.

    Args:
        position: Job title keyword.
        lit: Education requirement keyword.
        time: Work experience keyword.

    The three keywords are kept in ``self.key_words`` and also form the
    name of the detail CSV file, which is (re)created here with its header
    row. The header used to be written twice; it is now written once.
    """
    # Start page of the search.
    self.start_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html'
    # Search keywords: [position, education requirement, work experience].
    # Examples from the original notes: position "accounting"; education
    # "junior college", "bachelor", "master"; experience "fresh graduate",
    # "3-5 years".
    self.key_words = [position, lit, time]
    self.df = pd.DataFrame(columns=['职位', '日期', '地点', '网址'])

    detail_csv = '职位详情{0}_{1}_{2}.csv'.format(*self.key_words)
    # newline='' is what the csv module expects for files it writes to.
    with open(detail_csv, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerow(['公司简介', '职位名称', '职位信息'])

    # Headless Firefox plus a 10 second explicit-wait helper.
    options = FirefoxOptions()
    options.add_argument('-headless')
    self.browser = Firefox(options=options)
    self.wait = WebDriverWait(self.browser, 10)
def driver_factory(browser, executor_url, test_name):
    """Create a web driver for ``browser``.

    * ``"chrome"``: a remote session at ``executor_url + "/wd/hub"``,
      wrapped in ``EventFiringWebDriver`` with ``MyListener``; ``test_name``
      is sent as the session's ``name`` capability.
    * ``"firefox"``: a local headless Firefox that accepts untrusted
      certificates (``executor_url`` and ``test_name`` are not used).

    Any other value raises ``Exception("Driver not supported")``.
    """
    if browser not in ("chrome", "firefox"):
        raise Exception("Driver not supported")

    if browser == "firefox":
        profile = FirefoxProfile()
        profile.accept_untrusted_certs = True
        options = FirefoxOptions()
        options.headless = True
        return webdriver.Firefox(options=options, firefox_profile=profile)

    logger = logging.getLogger('chrome_fixture')
    logger.setLevel(LOG_LEVEL)
    caps = {
        "browserName": browser,
        "version": "83.0",
        "enableVnc": True,
        "enableVideo": True,
        "enableLog": True,
        "screenResolution": "1280x720",
        "name": test_name
    }
    remote = webdriver.Remote(command_executor=executor_url + "/wd/hub",
                              desired_capabilities=caps)
    driver = EventFiringWebDriver(remote, MyListener())
    logger.info(f"Start session {driver.session_id}")
    return driver
def setUp(self):
    """Create the database test helper and a headless Firefox with a
    5 second implicit wait."""
    self.db = DBCreatorTester()

    headless_options = FirefoxOptions()
    headless_options.add_argument('-headless')

    self.selenium = Firefox(options=headless_options)
    self.selenium.implicitly_wait(5)
def __init__(self, folder, profile, username, password):
    """Store the upload settings and start a headless Firefox.

    ``profile`` is the profile directory name below
    ``/home/aniquetahir/.mozilla/firefox/``; the geckodriver and Firefox
    binary locations are fixed paths. The driver is kept on
    ``self.webdriver``.
    """
    self.folder = folder
    self.username = username
    self.password = password

    headless_options = FirefoxOptions()
    profile_dir = '/home/aniquetahir/.mozilla/firefox/' + profile
    stored_profile = FirefoxProfile(profile_dir)
    headless_options.add_argument('-headless')

    self.webdriver = Firefox(
        stored_profile,
        executable_path='/home/aniquetahir/youtube-upload-folder/geckodriver',
        firefox_options=headless_options,
        firefox_binary='/home/aniquetahir/firefox/firefox',
    )
def setUp(self):
    """Start a headless Firefox on ``self.browser``.

    When the ``STAGING_SERVER`` environment variable is set (and not
    empty), ``self.live_server_url`` is pointed at that host.
    """
    headless_options = FirefoxOptions()
    headless_options.add_argument("--headless")
    self.browser = webdriver.Firefox(firefox_options=headless_options)

    staging_host = os.environ.get("STAGING_SERVER")
    if not staging_host:
        return
    self.live_server_url = f'http://{staging_host}'
def browser():
    """Yield a headless Firefox driver and shut it down afterwards.

    The teardown runs in ``finally`` and uses ``quit()``, so the whole
    session is ended rather than only the current window, also when the
    generator is closed early.
    """
    options = FirefoxOptions()
    options.headless = True
    driver = Firefox(
        executable_path="/Users/amir/makmal/geckodriver/geckodriver",
        options=options,
    )
    try:
        yield driver
    finally:
        driver.quit()
def get_mozilla_browser(self):
    """Return a headless Firefox driver whose geckodriver is installed via
    ``GeckoDriverManager``."""
    firefox_options = FirefoxOptions()
    # NOTE(review): "no-sandbox" (without dashes) and "--disable-infobars"
    # look like Chrome switches -- confirm Firefox accepts them.
    firefox_options.add_argument("no-sandbox")
    # NOTE(review): these two attributes are set on the options object;
    # verify that Selenium reads them from there.
    firefox_options.accept_untrusted_certs = True
    firefox_options.assume_untrusted_cert_issuer = True
    for flag in ("--disable-infobars", "--headless"):
        firefox_options.add_argument(flag)

    gecko_path = GeckoDriverManager().install()
    return webdriver.Firefox(executable_path=gecko_path,
                             options=firefox_options)
def __init__(self, testConf):
    """Load the JSON test configuration and start a headless Firefox.

    Args:
        testConf: Path of the JSON configuration file. The path is printed,
            and the parsed content is stored on ``self.testConf``.

    The configuration is read before the browser is launched, so a missing
    or invalid file no longer leaves a running Firefox behind.
    """
    print(testConf)
    with open(testConf) as testConfFp:
        self.testConf = json.load(testConfFp)

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    self.driver = webdriver.Firefox(
        executable_path=GeckoDriverManager().install(),
        firefox_options=opts)
def test_empty_reset(self):
    """Typing blanks and pressing "reset" must leave the input empty.

    The previous check, ``assertIn("", text)``, could never fail because
    the empty string is contained in every string.
    """
    opts = FirefoxOptions()
    opts.add_argument("--headless")
    driver = webdriver.Firefox(firefox_options=opts)
    # Ends the browser session whether the test passes or fails.
    self.addCleanup(driver.quit)

    driver.get("http://127.0.0.1:8000/verificacion/")
    element = driver.find_element_by_id("id_input")
    element.send_keys("   ")
    button = driver.find_element_by_id("reset")
    button.click()

    # NOTE(review): assumes "id_input" is a form field whose content is
    # exposed through its "value" attribute -- confirm against the page.
    self.assertEqual(
        "", driver.find_element_by_id("id_input").get_attribute("value"))
def test_example():
    """Open otus.ru in kiosk-mode Firefox and check the page title.

    geckodriver is downloaded and installed first. The browser is always
    shut down, also when the title assertion fails.
    """
    gdd = GeckoDriverManager()
    gdd.download_and_install()
    option = FirefoxOptions()
    option.add_argument("--kiosk")
    wd = webdriver.Firefox(options=option)
    try:
        wd.get("https://otus.ru/")
        assert wd.title == 'Онлайн‑курсы для профессионалов, дистанционное обучение современным профессиям'
    finally:
        wd.quit()