示例#1
0
    def get_browser_instance(self):
        """Start the browser named by ``self.browser`` and return its WebDriver.

        Supported values (case-insensitive) are ``firefox``, ``chrome``,
        ``ie``, ``edge`` and ``browserstack``. Firefox, Chrome and Edge are
        started headless when ``self.headless`` is truthy. The returned driver
        is maximised, has a 5 second implicit wait, a 20 second page-load
        timeout and all cookies deleted.

        Returns:
            The configured WebDriver instance.

        Raises:
            ValueError: if ``self.browser`` is not one of the supported names.
            Exception: any other error raised while starting or configuring
                the browser; it is logged through ``self.cl`` and re-raised.
        """
        # Tracked outside the try so a half-initialised browser can be shut
        # down if a later configuration step fails.
        driver = None
        try:
            browser = self.browser.lower()

            if browser == "firefox":
                options = FirefoxOptions()
                if self.headless:
                    options.add_argument("--headless")
                profile = webdriver.FirefoxProfile()
                profile.accept_untrusted_certs = True
                driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                                           firefox_profile=profile,
                                           options=options)

            elif browser == "chrome":
                chrome_options = Options()
                if self.headless:
                    chrome_options.add_argument('headless')
                chrome_options.add_argument('ignore-certificate-errors')
                chrome_options.add_argument('--incognito')
                chrome_options.add_argument('--start-maximized')
                chrome_options.add_experimental_option('useAutomationExtension', False)
                chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
                chrome_options.add_argument('--log-level=3')
                driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

            elif browser == "ie":
                driver = webdriver.Ie(IEDriverManager().install())

            elif browser == "edge":
                options = EdgeOptions()
                if self.headless:
                    options.add_argument('headless')
                options.use_chromium = True
                options.add_argument('ignore-certificate-errors')
                options.add_experimental_option('useAutomationExtension', False)
                options.add_argument('--inprivate')
                options.add_argument('--log-level=3')
                options.add_experimental_option("excludeSwitches", ["enable-automation"])
                # NOTE(review): Edge is driven through webdriver.Chrome with the
                # Edge driver binary, as in the original code - confirm this is
                # intentional for the Selenium version in use.
                driver = webdriver.Chrome(EdgeChromiumDriverManager().install(), options=options)

            elif browser == 'browserstack':
                # bb_url / browser_config are expected to be defined at module level.
                driver = webdriver.Remote(command_executor=bb_url, desired_capabilities=browser_config)

            else:
                raise ValueError("unsupported browser type: " + repr(self.browser))

            if self.headless:
                self.cl.info("Starting " + str(self.browser).upper() + " browser in headless mode")
            else:
                self.cl.info("Starting " + str(self.browser).upper() + " browser ")

            driver.maximize_window()
            driver.implicitly_wait(5)
            driver.delete_all_cookies()
            driver.set_page_load_timeout(20)
            return driver

        except Exception as e:
            details = str(e.__class__.__name__) + ' ' + str(e)
            if isinstance(e, ValueError):
                self.cl.error("Browser not supported :: " + str(self.browser) +
                              ". Supported browser types are Chrome, Firefox, IE, Edge, BrowserStack."
                              " Exception occurred. :: " + details)
            else:
                self.cl.error("Exception occurred. :: " + details)

            if driver is not None:
                # The browser started but could not be configured; without
                # this the browser/driver processes would be left running.
                try:
                    driver.quit()
                except Exception as quit_error:
                    self.cl.error("Could not close the browser after the failure :: " + str(
                        quit_error.__class__.__name__) + ' ' + str(quit_error))
            # Bare raise keeps the original traceback intact.
            raise
示例#2
0
文件: pmg.py 项目: zebiden/pmg
    def __init__(self, write_dir):
        self.is_installed_as_module = os.path.exists(
            os.path.abspath(selenium.__file__).split("selenium")[0] +
            "piptv_pmg")
        # Test to see if requests can be sent to CDN nodes
        self.cdn_nodes = ['peer1.ustv.to', 'peer2.ustv.to', 'peer3.ustv.to']

        self.channel_codes = [
            'ABCE', 'A&E', 'AMC', 'APL', 'BBCA', 'BET', 'BOOM', 'BRVO', 'CNE',
            'CBSE', 'CMT', 'CNBC', 'CNN', 'COM', 'DEST', 'DSC', 'DISE',
            'DISJR', 'DXD', 'DIY', 'E!', 'ESPN', 'ESPN2', 'FOOD', 'FBN',
            'FOXE', 'FNC', 'FS1', 'FS2', 'FREEFM', 'FX', 'FXM', 'FXX', 'GOLF',
            'GSN', 'HALL', 'HMM', 'HBO', 'HGTV', 'HIST', 'HLN', 'ID', 'LIFE',
            'LIFEMOV', 'MLBN', 'MTHD', 'MSNBC', 'MTV', 'NGW', 'NGC', 'NBA',
            'NBCSN', 'NBCE', 'NFLHD', 'NIKE', 'NKTN', 'OWN', 'OXGN', 'PAR',
            'PBSE', 'POP', 'SCI', 'SHO', 'STARZ', 'SUND', 'SYFY', 'TBS', 'TCM',
            'TELE', 'TNNS', 'CWE', 'WEATH', 'TLC', 'TNT', 'TRAV', 'TruTV',
            'TVLD', 'UNVSO', 'USA', 'VH1', 'WE'
        ]

        self.cdn_channel_codes = [
            'ABC', 'AE', 'AMC', 'Animal', 'BBCAmerica', 'BET', 'Boomerang',
            'Bravo', 'CN', 'CBS', 'CMT', 'CNBC', 'CNN', 'Comedy', 'DA',
            'Discovery', 'Disney', 'DisneyJr', 'DisneyXD', 'DIY', 'E', 'ESPN',
            'ESPN2', 'FoodNetwork', 'FoxBusiness', 'FOX', 'FoxNews', 'FS1',
            'FS2', 'Freeform', 'FX', 'FXMovie', 'FXX', 'GOLF', 'GSN',
            'Hallmark', 'HMM', 'HBO', 'HGTV', 'History', 'HLN', 'ID',
            'Lifetime', 'LifetimeM', 'MLB', 'MotorTrend', 'MSNBC', 'MTV',
            'NatGEOWild', 'NatGEO', 'NBA', 'NBCSN', 'NBC', 'NFL',
            'Nickelodeon', 'Nicktoons', 'OWN', 'Oxygen', 'Paramount', 'PBS',
            'POP', 'Science', 'Showtime', 'StarZ', 'SundanceTV', 'SYFY', 'TBS',
            'TCM', 'Telemundo', 'Tennis', 'CWE',
            'https://weather-lh.akamaihd.net/i/twc_1@92006/master.m3u8', 'TLC',
            'TNT', 'Travel', 'TruTV', 'TVLand', 'Univision', 'USANetwork',
            'VH1', 'WETV'
        ]

        self.write_dir = write_dir
        self.profile = webdriver.FirefoxProfile()
        self.options = FirefoxOptions()
        # Need to configure a VM for macOS testing
        if platform.system() == "Windows" and self.is_installed_as_module:
            print(
                "\nDetected Windows...\n \nTrying to set environment variable for geckodriver\n"
            )
            self.resource_dir = str(os.path.abspath(selenium.__file__)).split(
                "selenium")[0] + "\\piptv_pmg\\resource\\"
            self.set_environment_variable(self.resource_dir +
                                          "\\geckodriver_win64")
        elif platform.system(
        ) == "Windows" and not self.is_installed_as_module:
            self.resource_dir = os.getcwd().split(
                "piptv_pmg")[0] + "\\resource\\"
            self.set_environment_variable(self.resource_dir +
                                          "\\geckodriver_win64")
        elif platform.system() == "Linux" and self.is_installed_as_module:
            print(
                "\nDetected Linux...\n \nTrying to set environment variable for geckodriver\n"
            )
            self.resource_dir = str(os.path.abspath(selenium.__file__)).split(
                "selenium")[0] + "piptv_pmg/resource/"
            self.set_environment_variable(self.resource_dir +
                                          "geckodriver_linux64")
        elif platform.system() == "Linux" and not self.is_installed_as_module:
            self.resource_dir = os.getcwd().split(
                "piptv_pmg")[0] + "/resource/"
            self.set_environment_variable(self.resource_dir +
                                          "geckodriver_linux64")
        self.options.add_argument("-headless")
        self.driver = webdriver.Firefox(self.profile, options=self.options)
        self.renew_token_node = 'http://ustvgo.tv/nfl-network-live-free'
        self.wms_auth_token = {}
        self.generated_links = []

        self.extract_embedded_hotlink = \
            """return (() => {
示例#3
0
"Den Haag HS",
"Den Haag Centraal",
"Voorburg",
"Delft",
"Schagen,
"Rotterdam Centraal"
]

# For every combination of a destination city and a departure location:
for city in cities:
    time = []
    for location in locations:

        # The ns.nl journey planner is a dynamic page, so it is loaded in a
        # real (headless) browser instead of being fetched as static HTML.
        # NOTE(review): page_source is read right after get() with no explicit
        # wait - confirm the duration element is always rendered by then.
        url = "https://www.ns.nl/en/journeyplanner/#/?vertrek=" + location + "&vertrektype=treinstation&aankomst=" + city + "&aankomsttype=treinstation&type=vertrek&tijd=2019-12-03T08:30&_requesttime=1574408283341"
        options = FirefoxOptions()
        options.add_argument("--headless")
        browser = webdriver.Firefox(options=options)
        try:
            browser.get(url)
            page = browser.page_source
        finally:
            # Always shut the browser down; otherwise every iteration leaves
            # a Firefox process running.
            browser.quit()
        html = BeautifulSoup(page, "lxml")
        divs = html.find("time", {"data-ng-attr-datetime": "{{ summaryCtrl.durationString }}"})
        try:
            # Parse once and reuse the value for both storing and printing.
            duration = parseResult(divs["datetime"])
        except Exception:
            # The duration element was missing or unparsable (e.g. the page
            # did not load properly); record 0 so the row keeps its position.
            print(location, city, "Error")
            time.append(0)
        else:
            # Saving the time in the list for this city.
            time.append(duration)
            print(location, " - ", city, duration)
示例#4
0
    def main(self):
        """Render ``self.page_url`` in headless Firefox and persist the result.

        Steps:
          1. Load the page and keep its HTML in ``self.html_content``.
          2. Hash the HTML (MD5); if a ``Page`` with the same hash already
             exists, mark ``self.page_db`` as ``DUPLICATE`` and stop.
          3. Collect links from ``<a href>`` and ``onclick="location.href=..."``
             attributes, keep those containing ``gov.si/``, canonicalise them
             and store every unseen one as a ``FRONTIER`` page plus a ``Link``
             row from the current page.
          4. Store the URL of every ``<img>`` (at most 255 characters).
          5. Save the page itself as ``HTML``.

        The database session is closed and the browser is quit when the
        method finishes, whether it returns normally or raises.
        """
        # The page contains HTML, lets scrape it --------------------------------------------------
        firefox_options = FirefoxOptions()

        # Adding a specific user agent. Firefox has no "user-agent=" command
        # line switch (that is a Chrome option); the user agent is overridden
        # through this preference instead.
        firefox_options.set_preference("general.useragent.override", "fri-ieps-kslk")
        firefox_options.add_argument("--headless")

        print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")
        self.driver = webdriver.Firefox(
            options=firefox_options,
            executable_path=Config.WEB_DRIVER_LOCATION_GECKO)

        try:
            self.driver.set_page_load_timeout(10)

            self.driver.get(self.page_url)

            # Timeout needed for Web page to render
            time.sleep(Config.RENDERING_TIMEOUT)

            self.html_content = self.driver.page_source

            # Checking for duplicates ------------------------------------------------------------------
            self.hashed_content = hashlib.md5(
                self.html_content.encode("utf-8")).hexdigest()

            is_duplicate = self.session.query(Page).filter(
                Page.content_hash == self.hashed_content).first()
            if is_duplicate:
                self.page_db.page_type_code = "DUPLICATE"
                self.page_db.http_status_code = self.status_code
                self.page_db.site_id = self.site_id
                self.page_db.url = self.page_url
                self.page_db.accessed_time = getTimestamp()
                self.page_db.content_hash = self.hashed_content
                self.session.commit()
                return

            # The page is valid html and its not a duplicate, now we extract all the links on the page ---
            links = []

            # First, we extract the links with tag name "a"
            for elem in self.driver.find_elements_by_tag_name("a"):
                href = elem.get_attribute('href')
                if href is None:
                    continue
                if href.startswith("/"):
                    links.append(self.base_url + href)
                elif "http" in href:  # also covers "https"
                    links.append(href)

            # We also extract links from the onclick sections
            for el in self.driver.find_elements_by_xpath("//*[@onclick]"):
                onclick = el.get_attribute("onclick")
                if "location.href=" in onclick:
                    links.append(onclick.replace("location.href=", "")
                                 .replace("\'", "")
                                 .replace("\"", ""))

            # Remove the links that point outside of .gov
            links = [link for link in links if "gov.si/" in link]

            # Put the links in the canonical form
            links_canonical = []
            for link in links:
                parsed_link = urlcanon.parse_url(link)
                urlcanon.whatwg(parsed_link)  # canonicalises parsed_link in place
                links_canonical.append(str(parsed_link))

            links = links_canonical

            # Save the links to the DB -----------------------------------------------------------------
            for link in links:
                # Check if link is already in the DB
                is_duplicate = self.session.query(Page).filter(
                    Page.url == link).first()
                if is_duplicate is not None:
                    continue

                extracted_domain_name = get_domain_name_from_url(link)

                page = Page()
                page.site_id = self.get_site_id_for_page(extracted_domain_name)

                # Pages with status == None have yet to be visited
                page.status = None
                page.page_type_code = "FRONTIER"
                page.url = link
                self.session.add(page)
                self.session.commit()

                # Also add a Link to the DB
                link_ = Link()
                link_.from_page = self.page_id
                link_.to_page = self.session.query(Page).filter(
                    Page.url == link).first().id
                self.session.add(link_)
                self.session.commit()

            # Finding and storing the images on the page --------------------------------------------------
            for elem in self.driver.find_elements_by_tag_name("img"):
                src = elem.get_attribute("src")
                if src is None:
                    continue
                url = ""
                if src.startswith("/"):
                    url = self.base_url + src
                elif "http" in src:  # also covers "https"
                    url = src
                # Sources that are neither site-relative nor http(s), and URLs
                # longer than 255 characters, are not stored.
                if url != "" and len(url) <= 255:
                    # Save the image
                    image = Image()
                    image.page_id = self.page_id
                    image.filename = url
                    image.content_type = "BINARY"
                    image.accessed_time = getTimestamp()
                    self.session.add(image)
                    self.session.commit()

            # With all the data scraped, we can save the page to the DB -------------------------------------
            self.page_db.html_content = self.html_content
            self.page_db.accessed_time = getTimestamp()
            self.page_db.content_hash = self.hashed_content
            self.page_db.http_status_code = self.status_code
            self.page_db.site_id = self.site_id
            self.page_db.page_type_code = "HTML"
            self.page_db.url = self.page_url
            self.session.commit()
        finally:
            # Lets be responsible and close the session and the driver, even
            # when loading or storing the page failed (e.g. page-load timeout);
            # otherwise a Firefox process is leaked for every failed page.
            try:
                self.session.close()
            finally:
                self.driver.quit()
示例#5
0
def dashboard():
    """Render the dashboard: the user's hot list and, on POST, a CSC-code search.

    Users that are not logged in are sent to ``home()``.

    GET renders ``dashboard.html`` with the hot list of the user whose e-mail
    is stored in the session.

    POST reads ``csc_val`` from the submitted form, looks the code up on the
    Utah DABC online inventory page with a headless Firefox, hands the
    resulting HTML to ``soup_it`` and then renders the matching rows of the
    ``Inventory`` table together with the hot list.

    Any other HTTP method renders the bare dashboard with only the form.
    """
    form = generalforms.searchCSCCode()
    if not session.get('logged_in'):
        return home()
    # get profile picture
    profile_pic = 'tux1.png'

    c, con = connection()
    # Return a search on items from the hotlist table. Values are passed as
    # query parameters instead of being formatted into the SQL text, so
    # session/form data cannot inject SQL.
    # NOTE(review): "%s" is the placeholder style of the common MySQL drivers
    # (MySQLdb, PyMySQL, mysql-connector) - confirm against connection().
    hotlist_search = (
        "select * FROM uabc.Inventory inner join uabc.HotList "
        "on Inventory.CS_CODE=HotList.CS_CODE where HotList.UserEmail = %s;")
    c.execute(hotlist_search, (session['email'],))

    # Use the column headers as the dictionary key on the search
    columns = c.description

    hotlist = [{
        columns[index][0]: column
        for index, column in enumerate(value)
    } for value in c.fetchall()]

    if request.method == "GET":
        return render_template('dashboard.html',
                               hotlist=hotlist,
                               profile_pic=profile_pic,
                               form=form)

    if request.method == "POST":
        sku = request.form['csc_val']

        # Start webdriver
        # ----------------------------------------------------------------------
        # id of the Item CSC Code input box
        csc_code_box_id = "ContentPlaceHolderBody_tbCscCode"

        options = FirefoxOptions()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        try:
            driver.get(
                "https://webapps2.abc.utah.gov/Production/OnlineInventoryQuery/IQ/InventoryQuery.aspx"
            )
            item_id_search_box = driver.find_element_by_id(csc_code_box_id)
            item_id_search_box.send_keys(sku)
            item_id_search_box.send_keys(Keys.ENTER)
            # Give the page time to post back and render the results.
            time.sleep(4)
            html = driver.page_source
        finally:
            # quit() (not close()) ends the whole session, and doing it in
            # finally keeps a failed lookup from leaking a Firefox process.
            driver.quit()
        # End Web driver
        # ----------------------------------------------------------------------

        soup_it(html, sku, c)

        inventorysearch = "SELECT CS_CODE, CON_SIZE, CASE_PACK, PRODUCT_NAME,  STATUS, CURRENT_PRICE FROM `uabc`.`Inventory` " \
            "WHERE CS_CODE = %s;"

        results_cursor = con.cursor()
        try:
            results_cursor.execute(inventorysearch, (sku,))
            columns = results_cursor.description

            results = [{
                columns[index][0]: column
                for index, column in enumerate(value)
            } for value in results_cursor.fetchall()]
        finally:
            results_cursor.close()

        return render_template('dashboard.html',
                               results=results,
                               hotlist=hotlist,
                               profile_pic=profile_pic,
                               form=form)

    return render_template('dashboard.html', form=form)
class WebDriverFactory:
    """
    Factory that wraps a standard Selenium driver in a custom class
    (``SeleniumDriverWrapper``) and returns it for further use.

    The type of the standard driver is selected by ``browser_type``
    (according to the original author it comes from a command-line
    argument). Unknown browser types fall back to ``default_driver``.

    By default the driver executables located in the ``driver_executables``
    folder under ``base_dir`` are used. If the executable cannot be found
    there, its bare file name is passed on instead, so that it is looked up
    through the system path.
    """

    DriverConfig = NamedTuple(
        'DriverConfig',
        [('driver_class', type),
         ('options', DriverOptionsType),
         ('driver_executable', str)]
    )

    # The option objects below are shared, class-level templates. They are
    # never mutated: every driver gets its own copy (see
    # get_webdriver_instance), so one factory's settings cannot leak into
    # another factory or into a later call.
    default_driver = DriverConfig(
        webdriver.Chrome, ChromeOptions(), 'chromedriver.exe'
    )
    browser_to_driver_config_mapping = {
        'Chrome': DriverConfig(
            webdriver.Chrome, ChromeOptions(), 'chromedriver.exe'
        ),
        'Firefox':  DriverConfig(
            webdriver.Firefox, FirefoxOptions(), 'geckodriver.exe'
        ),
        'Opera': DriverConfig(
            webdriver.Opera, OperaOptions(), 'operadriver.exe'
        ),
        'Edge': DriverConfig(
            Edge, EdgeOptions(), 'MicrosoftWebDriver.exe'
        )
    }

    def __init__(
            self, browser_type: str, is_headless: bool, base_dir: str,
            implicit_wait_in_seconds: int, maximize_window: bool = True
    ):
        """
        :param browser_type: key of ``browser_to_driver_config_mapping``
            (``'Chrome'``, ``'Firefox'``, ``'Opera'`` or ``'Edge'``).
        :param is_headless: start the browser with ``--headless``.
        :param base_dir: directory containing the ``driver_executables`` folder.
        :param implicit_wait_in_seconds: implicit wait applied to the driver.
        :param maximize_window: maximize the browser window after start.
        """
        self.browser_type = browser_type
        self.is_headless = is_headless
        self.webdrivers_dir = os.path.join(base_dir, 'driver_executables')
        self.implicit_wait_in_seconds = implicit_wait_in_seconds
        self.maximize_window = maximize_window

    def get_webdriver_instance(self) -> SeleniumDriverWrapper:
        """Start a new browser and return it wrapped in SeleniumDriverWrapper."""
        # Local import keeps this block self-contained.
        import copy

        config = self.driver_config
        # Work on a private copy of the option template. Mutating the shared
        # class-level object would add '--headless' again on every call and
        # would make later non-headless factories start headless as well.
        options = copy.deepcopy(config.options)
        self._configure_driver_options(options)
        driver = config.driver_class(
            executable_path=self.executable_path,
            options=options
        )
        try:
            self._configure_driver(driver)
        except Exception:
            # Do not leave a running browser behind when it cannot be set up.
            driver.quit()
            raise
        return SeleniumDriverWrapper(driver)

    def _configure_driver(self, driver: WebDriver) -> None:
        """Apply the implicit wait and, if requested, maximize the window."""
        driver.implicitly_wait(self.implicit_wait_in_seconds)
        if self.maximize_window:
            driver.maximize_window()

    def _configure_driver_options(
            self, driver_options: DriverOptionsType
    ) -> None:
        """Add the headless switch to ``driver_options`` when requested."""
        if self.is_headless:
            # Needed for Edge; on the other option classes this only sets an
            # extra attribute.
            driver_options.use_chromium = True
            driver_options.add_argument('--headless')

    @property
    def driver_config(self) -> DriverConfig:
        """Config for ``browser_type``, or ``default_driver`` if it is unknown."""
        return self.browser_to_driver_config_mapping.get(
            self.browser_type, self.default_driver
        )

    @property
    def executable_path(self) -> str:
        """Path of the bundled driver if that file exists, else its bare name."""
        local_path = os.path.join(
            self.webdrivers_dir, self.driver_config.driver_executable
        )
        return local_path if Path(local_path).is_file() \
            else self.driver_config.driver_executable
示例#7
0
def geckodriver_browser():
    """Return a new Firefox WebDriver whose options have ``headless`` set to True."""
    firefox_options = FirefoxOptions()
    firefox_options.headless = True
    browser = webdriver.Firefox(options=firefox_options)
    return browser
 def open_my_browser(self, setup):
     """Start the configured browser, open the resource URL and register the driver.

     ``setup['setup']['browser']`` selects Firefox (``'firefox'``, any case);
     every other value starts Chrome. ``setup['setup']['headless']`` enables
     headless mode when it equals ``'true'`` (any case). Downloads are
     directed to the project's Firefox/Chrome download folder. The URL that
     is opened is ``setup[<value of ${RESOURCE}>]['url']``.

     Returns whatever ``self.ctx.register_driver(driver, None)`` returns.
     If the browser starts but cannot be prepared or cannot open the URL, it
     is quit before the error is re-raised.
     """
     curr_path = self.get_current_path_of_project()
     settings = setup.get('setup')
     is_firefox = settings.get('browser').lower() == 'firefox'
     # Evaluated once and case-insensitively, so both browsers (and the
     # headless-download hook below) agree on what "headless" means.
     headless = settings.get('headless').lower() == 'true'
     on_windows = self.get_current_os().lower() == 'windows'

     if is_firefox:
         options = FirefoxOptions()
         options.headless = headless
         options.set_preference(
             'pdfjs.previousHandler.alwaysAskBeforeHandling', False)
         options.set_preference('browser.download.folderList', 2)
         options.set_preference(
             'browser.download.dir', curr_path +
             self.format_os_path(self.FIREFOX_DOWNLOAD_LOCATION))
         options.set_preference('browser.download.panel.shown', False)
         # Comma-separated MIME types that are saved without a prompt. Every
         # entry must be separated by a comma and must not contain spaces.
         options.set_preference(
             "browser.helperApps.neverAsk.saveToDisk",
             "application/csv,text/csv,"
             "application/x-msexcel,application/excel,"
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
             "application/x-excel,application/vnd.ms-excel,"
             "application/xml")
         driver_path = (self.WINDOWS_FIREFOX_DRIVER_PATH if on_windows
                        else self.LINUX_FIREFOX_DRIVER_PATH)
         driver = webdriver.Firefox(
             capabilities=None,
             options=options,
             executable_path=curr_path + self.format_os_path(driver_path))
     else:
         download_dir = curr_path + self.format_os_path(
             self.CHROME_DOWNLOAD_LOCATION)
         options = ChromeOptions()
         if headless:
             options.add_argument("headless")
         else:
             options.add_argument("--start-maximized")
         prefs = {
             "profile.default_content_settings.popups": 0,
             "download.default_directory": download_dir,
             "directory_upgrade": True
         }
         options.add_experimental_option("prefs", prefs)
         driver_path = (self.WINDOWS_CHROME_DRIVER_PATH if on_windows
                        else self.LINUX_CHROME_DRIVER_PATH)
         # 'options' replaces the deprecated 'chrome_options' keyword and
         # matches the keyword already used for Firefox above.
         driver = webdriver.Chrome(
             options=options,
             executable_path=curr_path + self.format_os_path(driver_path))

     try:
         if is_firefox:
             driver.maximize_window()
         elif headless:
             self.enable_download_in_headless_chrome(driver, download_dir)
         driver.get(
             setup.get(BuiltIn().get_variable_value("${RESOURCE}")).get('url'))
     except Exception:
         # The driver is not registered yet, so nothing else would close it.
         driver.quit()
         raise
     self.debug('Opened browser with session id %s.' % driver.session_id)
     return self.ctx.register_driver(driver, None)