def get_browser_instance(self):
    """Create, configure and return a WebDriver for ``self.browser``.

    ``self.browser`` is matched case-insensitively against ``firefox``,
    ``chrome``, ``ie``, ``edge`` and ``browserstack``.  ``self.headless``
    switches Firefox, Chrome and Edge to headless mode.  After creation the
    window is maximized, a 5 second implicit wait and a 20 second page-load
    timeout are set, and all cookies are deleted.

    Returns:
        The configured WebDriver instance.

    Raises:
        ValueError: if ``self.browser`` is not one of the supported names.
        Exception: any other error raised while creating or configuring the
            driver is logged through ``self.cl`` and re-raised unchanged.
    """
    driver = None
    # Only the "unknown browser" case is reported as unsupported; a
    # ValueError raised by a driver manager or by Selenium is a different
    # failure and must not be logged with the same message.
    unsupported = False
    try:
        browser = self.browser.lower()
        if browser == "firefox":
            options = FirefoxOptions()
            if self.headless:
                options.add_argument("--headless")
            profile = webdriver.FirefoxProfile()
            profile.accept_untrusted_certs = True
            driver = webdriver.Firefox(
                executable_path=GeckoDriverManager().install(),
                firefox_profile=profile,
                options=options)
        elif browser == "chrome":
            chrome_options = Options()
            if self.headless:
                chrome_options.add_argument('headless')
            chrome_options.add_argument('ignore-certificate-errors')
            chrome_options.add_argument('--incognito')
            chrome_options.add_argument('--start-maximized')
            chrome_options.add_experimental_option('useAutomationExtension', False)
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_argument('--log-level=3')
            driver = webdriver.Chrome(ChromeDriverManager().install(),
                                      options=chrome_options)
        elif browser == "ie":
            driver = webdriver.Ie(IEDriverManager().install())
        elif browser == "edge":
            options = EdgeOptions()
            if self.headless:
                options.add_argument('headless')
            options.use_chromium = True
            options.add_argument('ignore-certificate-errors')
            options.add_experimental_option('useAutomationExtension', False)
            options.add_argument('--inprivate')
            options.add_argument('--log-level=3')
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            driver = webdriver.Chrome(EdgeChromiumDriverManager().install(),
                                      options=options)
        elif browser == 'browserstack':
            driver = webdriver.Remote(command_executor=bb_url,
                                      desired_capabilities=browser_config)
        else:
            unsupported = True
            raise ValueError("Unsupported browser type: " + str(self.browser))

        if self.headless:
            self.cl.info("Starting " + str(self.browser).upper() + " browser in headless mode")
        else:
            self.cl.info("Starting " + str(self.browser).upper() + " browser ")
        driver.maximize_window()
        driver.implicitly_wait(5)
        driver.delete_all_cookies()
        driver.set_page_load_timeout(20)
        return driver
    except Exception as e:
        # A driver that was started but could not be configured would
        # otherwise stay alive as an orphaned browser process.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                self.cl.error("Could not close the browser after a failed start. :: " + str(
                    quit_error.__class__.__name__) + ' ' + str(quit_error))
        detail = str(e.__class__.__name__) + ' ' + str(e)
        if unsupported:
            self.cl.error("Browser not supported :: " + str(
                self.browser) + ". Supported browser types are Chrome, Firefox, IE, Edge, "
                "BrowserStack. Exception occurred. :: " + detail)
        else:
            self.cl.error("Exception occurred. :: " + detail)
        # Bare raise keeps the original traceback intact.
        raise
def __init__(self, write_dir): self.is_installed_as_module = os.path.exists( os.path.abspath(selenium.__file__).split("selenium")[0] + "piptv_pmg") # Test to see if requests can be sent to CDN nodes self.cdn_nodes = ['peer1.ustv.to', 'peer2.ustv.to', 'peer3.ustv.to'] self.channel_codes = [ 'ABCE', 'A&E', 'AMC', 'APL', 'BBCA', 'BET', 'BOOM', 'BRVO', 'CNE', 'CBSE', 'CMT', 'CNBC', 'CNN', 'COM', 'DEST', 'DSC', 'DISE', 'DISJR', 'DXD', 'DIY', 'E!', 'ESPN', 'ESPN2', 'FOOD', 'FBN', 'FOXE', 'FNC', 'FS1', 'FS2', 'FREEFM', 'FX', 'FXM', 'FXX', 'GOLF', 'GSN', 'HALL', 'HMM', 'HBO', 'HGTV', 'HIST', 'HLN', 'ID', 'LIFE', 'LIFEMOV', 'MLBN', 'MTHD', 'MSNBC', 'MTV', 'NGW', 'NGC', 'NBA', 'NBCSN', 'NBCE', 'NFLHD', 'NIKE', 'NKTN', 'OWN', 'OXGN', 'PAR', 'PBSE', 'POP', 'SCI', 'SHO', 'STARZ', 'SUND', 'SYFY', 'TBS', 'TCM', 'TELE', 'TNNS', 'CWE', 'WEATH', 'TLC', 'TNT', 'TRAV', 'TruTV', 'TVLD', 'UNVSO', 'USA', 'VH1', 'WE' ] self.cdn_channel_codes = [ 'ABC', 'AE', 'AMC', 'Animal', 'BBCAmerica', 'BET', 'Boomerang', 'Bravo', 'CN', 'CBS', 'CMT', 'CNBC', 'CNN', 'Comedy', 'DA', 'Discovery', 'Disney', 'DisneyJr', 'DisneyXD', 'DIY', 'E', 'ESPN', 'ESPN2', 'FoodNetwork', 'FoxBusiness', 'FOX', 'FoxNews', 'FS1', 'FS2', 'Freeform', 'FX', 'FXMovie', 'FXX', 'GOLF', 'GSN', 'Hallmark', 'HMM', 'HBO', 'HGTV', 'History', 'HLN', 'ID', 'Lifetime', 'LifetimeM', 'MLB', 'MotorTrend', 'MSNBC', 'MTV', 'NatGEOWild', 'NatGEO', 'NBA', 'NBCSN', 'NBC', 'NFL', 'Nickelodeon', 'Nicktoons', 'OWN', 'Oxygen', 'Paramount', 'PBS', 'POP', 'Science', 'Showtime', 'StarZ', 'SundanceTV', 'SYFY', 'TBS', 'TCM', 'Telemundo', 'Tennis', 'CWE', 'https://weather-lh.akamaihd.net/i/twc_1@92006/master.m3u8', 'TLC', 'TNT', 'Travel', 'TruTV', 'TVLand', 'Univision', 'USANetwork', 'VH1', 'WETV' ] self.write_dir = write_dir self.profile = webdriver.FirefoxProfile() self.options = FirefoxOptions() # Need to configure a VM for macOS testing if platform.system() == "Windows" and self.is_installed_as_module: print( "\nDetected Windows...\n \nTrying to set 
environment variable for geckodriver\n" ) self.resource_dir = str(os.path.abspath(selenium.__file__)).split( "selenium")[0] + "\\piptv_pmg\\resource\\" self.set_environment_variable(self.resource_dir + "\\geckodriver_win64") elif platform.system( ) == "Windows" and not self.is_installed_as_module: self.resource_dir = os.getcwd().split( "piptv_pmg")[0] + "\\resource\\" self.set_environment_variable(self.resource_dir + "\\geckodriver_win64") elif platform.system() == "Linux" and self.is_installed_as_module: print( "\nDetected Linux...\n \nTrying to set environment variable for geckodriver\n" ) self.resource_dir = str(os.path.abspath(selenium.__file__)).split( "selenium")[0] + "piptv_pmg/resource/" self.set_environment_variable(self.resource_dir + "geckodriver_linux64") elif platform.system() == "Linux" and not self.is_installed_as_module: self.resource_dir = os.getcwd().split( "piptv_pmg")[0] + "/resource/" self.set_environment_variable(self.resource_dir + "geckodriver_linux64") self.options.add_argument("-headless") self.driver = webdriver.Firefox(self.profile, options=self.options) self.renew_token_node = 'http://ustvgo.tv/nfl-network-live-free' self.wms_auth_token = {} self.generated_links = [] self.extract_embedded_hotlink = \ """return (() => {
"Den Haag HS", "Den Haag Centraal", "Voorburg", "Delft", "Schagen, "Rotterdam Centraal" ] # for every combination of a city and a location: for city in cities: time = [] for location in locations: # starts a webbrowser which searches for the time. We use a special python module, since the ns.nl website is dynamic, so we need to use a function which not only waits for the page to load, but also to activate the dynamic features, which calculate the time. url = "https://www.ns.nl/en/journeyplanner/#/?vertrek=" + location + "&vertrektype=treinstation&aankomst=" + city + "&aankomsttype=treinstation&type=vertrek&tijd=2019-12-03T08:30&_requesttime=1574408283341" options = FirefoxOptions() options.add_argument("--headless") browser = webdriver.Firefox(options=options) browser.get(url) page = browser.page_source html = BeautifulSoup(page, "lxml") divs = html.find("time", {"data-ng-attr-datetime": "{{ summaryCtrl.durationString }}"}) try: #saving the time in an array time.append(parseResult(divs["datetime"])) print(location, " - " ,city, parseResult(divs["datetime"])) except: # catching errors which might appear due to bad network print(location, city, "Error") time.append(int(0))
def main(self):
    """Render ``self.page_url`` in headless Firefox and store what was found.

    The rendered HTML is hashed with MD5 and looked up in the ``Page`` table.
    If a page with the same hash exists, the current page is stored as
    ``DUPLICATE`` and nothing else is extracted.  Otherwise the outgoing
    links (``<a href>`` and ``onclick`` ``location.href=`` targets that
    contain ``gov.si/``) are stored as ``FRONTIER`` pages with a ``Link``
    row each, the ``<img>`` sources are stored as ``Image`` rows, and the
    page itself is stored as ``HTML``.

    The database session is closed and the browser is quit on every exit
    path, including when an exception propagates.
    """
    firefox_options = FirefoxOptions()
    # Firefox has no "user-agent=" command-line switch; the user agent is
    # overridden through this preference instead.
    firefox_options.set_preference("general.useragent.override", "fri-ieps-kslk")
    firefox_options.add_argument("--headless")

    print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")

    self.driver = webdriver.Firefox(
        options=firefox_options,
        executable_path=Config.WEB_DRIVER_LOCATION_GECKO)
    try:
        self.driver.set_page_load_timeout(10)
        self.driver.get(self.page_url)

        # Give client-side scripts time to render before reading the source.
        time.sleep(Config.RENDERING_TIMEOUT)
        self.html_content = self.driver.page_source

        # Duplicate detection by content hash.
        self.hashed_content = hashlib.md5(
            self.html_content.encode("utf-8")).hexdigest()
        is_duplicate = self.session.query(Page).filter(
            Page.content_hash == self.hashed_content).first()

        if is_duplicate:
            self.page_db.page_type_code = "DUPLICATE"
            self.page_db.http_status_code = self.status_code
            self.page_db.site_id = self.site_id
            self.page_db.url = self.page_url
            self.page_db.accessed_time = getTimestamp()
            self.page_db.content_hash = self.hashed_content
            self.session.commit()
            return

        self._store_frontier_links(self._collect_page_links())
        self._store_page_images()

        # With all the data scraped, save the page itself.
        self.page_db.html_content = self.html_content
        self.page_db.accessed_time = getTimestamp()
        self.page_db.content_hash = self.hashed_content
        self.page_db.http_status_code = self.status_code
        self.page_db.site_id = self.site_id
        self.page_db.page_type_code = "HTML"
        self.page_db.url = self.page_url
        self.session.commit()
    finally:
        # Release the session and the browser even when a step above raised
        # (e.g. a page-load timeout); quit the driver even if close() fails.
        try:
            self.session.close()
        finally:
            self.driver.quit()


def _collect_page_links(self):
    """Return the canonicalized ``gov.si`` links found on the loaded page."""
    found = []

    # Links from anchor tags.
    for anchor in self.driver.find_elements_by_tag_name("a"):
        href = anchor.get_attribute('href')
        if href is None:
            continue
        if href.startswith("/"):
            found.append(self.base_url + href)
        elif "http" in href:  # also true for every "https" link
            found.append(href)

    # Links hidden in onclick handlers of the form location.href='...'.
    for element in self.driver.find_elements_by_xpath("//*[@onclick]"):
        onclick = element.get_attribute("onclick")
        if onclick and "location.href=" in onclick:
            found.append(onclick.replace("location.href=", "")
                         .replace("\'", "")
                         .replace("\"", ""))

    canonical = []
    for link in found:
        # Drop links that point outside of gov.si.
        if "gov.si/" not in link:
            continue
        parsed_link = urlcanon.parse_url(link)
        urlcanon.whatwg(parsed_link)  # canonicalizes parsed_link in place
        canonical.append(str(parsed_link))
    return canonical


def _store_frontier_links(self, links):
    """Insert each unseen URL as a FRONTIER page plus a Link from this page."""
    for link in links:
        already_known = self.session.query(Page).filter(
            Page.url == link).first()
        if already_known is not None:
            continue

        page = Page()
        page.site_id = self.get_site_id_for_page(get_domain_name_from_url(link))
        # Pages with status == None have yet to be visited.
        page.status = None
        page.page_type_code = "FRONTIER"
        page.url = link
        self.session.add(page)
        # Commit per link so a repeated URL later in the list is seen as known.
        self.session.commit()

        link_ = Link()
        link_.from_page = self.page_id
        link_.to_page = page.id
        self.session.add(link_)
        self.session.commit()


def _store_page_images(self):
    """Insert an Image row for every usable ``<img src>`` on the loaded page."""
    for elem in self.driver.find_elements_by_tag_name("img"):
        src = elem.get_attribute("src")
        if src is None:
            continue

        if src.startswith("/"):
            url = self.base_url + src
        elif "http" in src:
            url = src
        else:
            continue

        # Longer URLs are skipped, as in the original length check.
        if len(url) > 255:
            continue

        image = Image()
        image.page_id = self.page_id
        image.filename = url
        image.content_type = "BINARY"
        image.accessed_time = getTimestamp()
        self.session.add(image)
        self.session.commit()
def dashboard():
    """Render the dashboard with the user's hot list and optional search results.

    Users that are not logged in are sent to ``home()``.  On GET the page
    shows the hot list of the logged-in user.  On POST the submitted CSC code
    is looked up on the Utah ABC inventory site with a headless Firefox, the
    resulting HTML is passed to ``soup_it`` together with the code and the
    open cursor, and the matching ``Inventory`` rows are rendered as results.
    """
    form = generalforms.searchCSCCode()
    if not session.get('logged_in'):
        return home()

    profile_pic = 'tux1.png'
    c, con = connection()

    # The e-mail comes from the session and is bound as a parameter instead
    # of being formatted into the SQL text.
    hotlist_search = (
        "select * FROM uabc.Inventory inner join uabc.HotList "
        "on Inventory.CS_CODE=HotList.CS_CODE where HotList.UserEmail = %s;")
    c.execute(hotlist_search, (session['email'],))
    # Use the column headers as the dictionary keys of each row.
    column_names = [column[0] for column in c.description]
    hotlist = [dict(zip(column_names, row)) for row in c.fetchall()]

    if request.method == "GET":
        return render_template('dashboard.html',
                               hotlist=hotlist,
                               profile_pic=profile_pic,
                               form=form)

    if request.method == "POST":
        sku = request.form['csc_val']

        options = FirefoxOptions()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        try:
            driver.get(
                "https://webapps2.abc.utah.gov/Production/OnlineInventoryQuery/IQ/InventoryQuery.aspx"
            )
            csc_code_box = driver.find_element_by_id(
                "ContentPlaceHolderBody_tbCscCode")
            csc_code_box.send_keys(sku)
            csc_code_box.send_keys(Keys.ENTER)
            # Wait for the result page to load before reading it.
            time.sleep(4)
            html = driver.page_source
        finally:
            # quit() ends the whole session (close() only closes the window)
            # and runs even when the lookup above fails.
            driver.quit()

        soup_it(html, sku, c)

        # The CSC code is user input: bind it, never format it into the SQL.
        inventorysearch = (
            "SELECT CS_CODE, CON_SIZE, CASE_PACK, PRODUCT_NAME, STATUS, CURRENT_PRICE "
            "FROM `uabc`.`Inventory` WHERE CS_CODE = %s;")
        result_cursor = con.cursor()
        try:
            result_cursor.execute(inventorysearch, (sku,))
            result_columns = [column[0] for column in result_cursor.description]
            results = [dict(zip(result_columns, row))
                       for row in result_cursor.fetchall()]
        finally:
            result_cursor.close()

        return render_template('dashboard.html',
                               results=results,
                               hotlist=hotlist,
                               profile_pic=profile_pic,
                               form=form)

    return render_template('dashboard.html', form=form)
class WebDriverFactory:
    """
    Factory that wraps a standard Selenium driver in a custom class and
    returns it for further use. The type of the standard driver is determined
    by a command-line argument.

    By default it uses the drivers located in the ``driver_executables``
    folder. If for some reason the driver cannot be found inside the project,
    the driver is looked up through the system PATH instead.
    """
    DriverConfig = NamedTuple(
        'DriverConfig',
        [('driver_class', type),
         ('options', DriverOptionsType),
         ('driver_executable', str)]
    )

    # Used when the requested browser type is not in the mapping below.
    default_driver = DriverConfig(
        webdriver.Chrome, ChromeOptions(), 'chromedriver.exe'
    )

    # The options stored here are templates shared by every factory
    # instance; they are copied before being configured for a driver.
    browser_to_driver_config_mapping = {
        'Chrome': DriverConfig(
            webdriver.Chrome, ChromeOptions(), 'chromedriver.exe'
        ),
        'Firefox': DriverConfig(
            webdriver.Firefox, FirefoxOptions(), 'geckodriver.exe'
        ),
        'Opera': DriverConfig(
            webdriver.Opera, OperaOptions(), 'operadriver.exe'
        ),
        'Edge': DriverConfig(
            Edge, EdgeOptions(), 'MicrosoftWebDriver.exe'
        )
    }

    def __init__(
            self,
            browser_type: str,
            is_headless: bool,
            base_dir: str,
            implicit_wait_in_seconds: int,
            maximize_window: bool = True
    ):
        """Store the settings used later to build the driver.

        ``base_dir`` is the directory that contains ``driver_executables``.
        """
        self.browser_type = browser_type
        self.is_headless = is_headless
        self.webdrivers_dir = os.path.join(base_dir, 'driver_executables')
        self.implicit_wait_in_seconds = implicit_wait_in_seconds
        self.maximize_window = maximize_window

    def get_webdriver_instance(self) -> SeleniumDriverWrapper:
        """Create a configured driver and return it wrapped."""
        import copy

        # Configure a private copy: mutating the class-level options would
        # add '--headless' again on every call and leak it into factories
        # that were created with is_headless=False.
        options = copy.deepcopy(self.driver_config.options)
        self._configure_driver_options(options)
        driver = self.driver_config.driver_class(
            executable_path=self.executable_path,
            options=options
        )
        self._configure_driver(driver)
        return SeleniumDriverWrapper(driver)

    def _configure_driver(self, driver: WebDriver) -> None:
        """Apply the implicit wait and, if requested, maximize the window."""
        driver.implicitly_wait(self.implicit_wait_in_seconds)
        if self.maximize_window:
            driver.maximize_window()

    def _configure_driver_options(
            self, driver_options: DriverOptionsType
    ) -> None:
        """Switch ``driver_options`` to headless mode when requested."""
        if self.is_headless:
            driver_options.use_chromium = True  # for Edge
            driver_options.add_argument('--headless')

    @property
    def driver_config(self) -> DriverConfig:
        """Config for ``browser_type``, falling back to ``default_driver``."""
        return self.browser_to_driver_config_mapping.get(
            self.browser_type, self.default_driver
        )

    @property
    def executable_path(self) -> str:
        """Path of the bundled driver, or its bare name for a PATH lookup."""
        local_path = os.path.join(
            self.webdrivers_dir, self.driver_config.driver_executable
        )
        return local_path if Path(local_path).is_file() \
            else self.driver_config.driver_executable
def geckodriver_browser():
    """Return a new Firefox WebDriver started in headless mode."""
    opts = FirefoxOptions()
    # The ``headless`` property setter is deprecated and removed in recent
    # Selenium 4 releases; passing the flag directly has the same effect and
    # works on every version.
    opts.add_argument("-headless")
    return webdriver.Firefox(options=opts)
def open_my_browser(self, setup):
    """Start the configured browser, open the resource URL and register it.

    ``setup['setup']['browser']`` selects Firefox when it equals ``firefox``
    (case-insensitive); any other value starts Chrome.
    ``setup['setup']['headless']`` enables headless mode when it equals
    ``true`` (case-insensitive).  The URL is read from
    ``setup[<value of ${RESOURCE}>]['url']``.

    Returns:
        Whatever ``self.ctx.register_driver`` returns for the new driver.
    """
    curr_path = self.get_current_path_of_project()
    settings = setup.get('setup')
    # Evaluated once so that every headless-dependent step agrees; a separate
    # case-sensitive check would skip the headless download setup for 'true'.
    headless = settings.get('headless').lower() == 'true'
    on_windows = self.get_current_os().lower() == 'windows'

    if settings.get('browser').lower() == 'firefox':
        options = FirefoxOptions()
        if headless:
            options.add_argument("-headless")
        options.set_preference(
            'pdfjs.previousHandler.alwaysAskBeforeHandling', False)
        options.set_preference('browser.download.folderList', 2)
        options.set_preference(
            'browser.download.dir',
            curr_path + self.format_os_path(self.FIREFOX_DOWNLOAD_LOCATION))
        options.set_preference('browser.download.panel.shown', False)
        # Comma-separated MIME types that are saved without a prompt.
        options.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            ",".join([
                "application/csv",
                "text/csv",
                "application/x-msexcel",
                "application/excel",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/x-excel",
                "application/vnd.ms-excel",
                "application/xml",
            ]))
        driver_location = (self.WINDOWS_FIREFOX_DRIVER_PATH if on_windows
                           else self.LINUX_FIREFOX_DRIVER_PATH)
        driver = webdriver.Firefox(
            capabilities=None,
            options=options,
            executable_path=curr_path + self.format_os_path(driver_location))
        driver.maximize_window()
    else:
        download_dir = curr_path + self.format_os_path(
            self.CHROME_DOWNLOAD_LOCATION)
        options = ChromeOptions()
        if headless:
            options.add_argument("headless")
        else:
            options.add_argument("--start-maximized")
        options.add_experimental_option("prefs", {
            "profile.default_content_settings.popups": 0,
            "download.default_directory": download_dir,
            "directory_upgrade": True,
        })
        driver_location = (self.WINDOWS_CHROME_DRIVER_PATH if on_windows
                           else self.LINUX_CHROME_DRIVER_PATH)
        # ``options=`` replaces the deprecated ``chrome_options=`` keyword and
        # matches the Firefox branch.
        driver = webdriver.Chrome(
            options=options,
            executable_path=curr_path + self.format_os_path(driver_location))
        if headless:
            self.enable_download_in_headless_chrome(driver, download_dir)

    try:
        driver.get(
            setup.get(BuiltIn().get_variable_value("${RESOURCE}")).get('url'))
    except Exception:
        # The driver is not registered yet, so nothing else would close it.
        driver.quit()
        raise
    self.debug('Opened browser with session id %s.' % driver.session_id)
    return self.ctx.register_driver(driver, None)