def scrape_edmunds() -> None:
    """Walk Edmunds used/CPO inventory search pages in headless Firefox.

    Pages are visited directly via the ``pagenumber`` query parameter
    (starting at 29); traffic is captured by the module-level
    ``interceptor`` / ``resp_interceptor`` seleniumwire hooks, and a
    module-global requests ``Session`` is opened for the duration of the
    scrape.  Produces no return value; output happens via side effects in
    the interceptors and ``print``.
    """
    global SESSION
    SESSION = Session()
    opts = FirefoxOptions()
    opts.add_argument("--headless")
    drv = Firefox(options=opts)
    # seleniumwire hooks defined elsewhere in this module.
    drv.request_interceptor = interceptor
    drv.response_interceptor = resp_interceptor
    next_xpath = ".//a[@aria-label='Pagination left']"
    for px in range(29, 10000):
        drv.get(
            f"https://www.edmunds.com/inventory/srp.html"
            f"?inventorytype=used,cpo&pagenumber={px}"
            f"&sort=mileage:asc&radius=500"
        )
        # Poll until the pagination control is clickable; the click itself is
        # only used as a "page finished loading" signal before moving on.
        while True:
            try:
                btn = drv.find_element(By.XPATH, next_xpath)
                btn.click()
                print(drv.current_url)
                break  # TODO detect last page properly
            except Exception:
                # NOTE(review): retries forever if the pagination link never
                # appears (e.g. on the last page) — consider a bounded wait.
                time.sleep(1)
    drv.close()
    SESSION.close()
def __init_driver(self):
    """Create the seleniumwire Firefox driver from the stored settings.

    Raises:
        NotConfigured: if the configured driver name is not 'firefox'.
    """
    if self.driver_name.lower() != "firefox":
        raise NotConfigured("SELENIUM_DRIVER_NAME must be set to 'firefox'")

    ff_options = FirefoxOptions()
    if self.browser_executable_path:
        ff_options.binary_location = self.browser_executable_path
    for argument in self.driver_arguments:
        ff_options.add_argument(argument)

    self.options = ff_options
    self.driver = Firefox(
        executable_path=self.driver_executable_path,
        firefox_options=ff_options,
    )
    self.initialized = True
def test_existing_capability_v4(self, firefox_super_kwargs):
    """A caller-supplied desired capability must survive into super()'s kwargs."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', True):
        Firefox(desired_capabilities={'test': 'capability'})

    caps = firefox_super_kwargs['desired_capabilities']
    assert caps['test'] == 'capability'
def test_no_proxy_v4(self, firefox_super_kwargs):
    """exclude_hosts must be mapped onto the options proxy's noProxy (v4 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', True):
        Firefox(seleniumwire_options={'exclude_hosts': 'test_host'})

    configured_proxy = firefox_super_kwargs['options'].proxy
    assert configured_proxy.noProxy == 'test_host'
def test_no_auto_config(self, firefox_super_kwargs):
    """With auto_config off, no proxy capability may be injected (v3 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', False):
        Firefox(
            seleniumwire_options={'auto_config': False},
            capabilities={'test': 'capability'},
        )

    caps = firefox_super_kwargs['capabilities']
    assert 'proxy' not in caps
def test_no_proxy(self, firefox_super_kwargs):
    """exclude_hosts must end up in the proxy capability's noProxy (v3 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', False):
        Firefox(seleniumwire_options={'exclude_hosts': 'test_host'})

    caps = firefox_super_kwargs['capabilities']
    assert caps['proxy']['noProxy'] == 'test_host'
def get_html(wd: Firefox, article: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
    """Load an article page, inline its images, and store its HTML.

    Navigates to ``article['link']``, scrolls to the bottom so lazy images
    load, runs ``convert_images.js`` (replaces image sources with base64
    data) and then ``cleanify.js`` inside the page, and finally stores the
    resulting page source under ``article['html']``.

    The *article* dict is mutated in place and also returned, so the
    function now honors its return annotation (the original fell off the
    end and returned ``None``).  A dead ``article_div.get_property(
    'outerHTML')`` capture — whose value was immediately overwritten by
    ``wd.page_source`` — has been removed.
    """
    wd.get(article['link'])
    print(article['title'])
    sleep(3)  # let the initial render settle before scrolling

    # Scroll to the bottom to load images
    scrollBottom(wd)

    # For each image, replace with base64
    with open('convert_images.js', 'r') as script_file:
        script_src = script_file.read()
    wd.execute_script(script_src)
    sleep(5)  # allow the conversion script time to fetch and encode images

    # Strip unwanted chrome from the page before capturing the source.
    with open('cleanify.js') as script_file:
        script_src = script_file.read()
    wd.execute_script(script_src)
    sleep(3)

    html: str = wd.page_source
    # Widen Medium's fixed article column so the saved copy uses full width.
    html = html.replace('max-width:680px', 'max-width:90%')
    article['html'] = html
    return article
def test_set_proxy_config_v4(self, firefox_super_kwargs):
    """Default backend proxy settings must be written to options.proxy (v4 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', True):
        Firefox()

    configured_proxy = firefox_super_kwargs['options'].proxy
    assert configured_proxy.proxyType == ProxyType.MANUAL
    assert configured_proxy.httpProxy == '127.0.0.1:12345'
    assert configured_proxy.sslProxy == '127.0.0.1:12345'
    assert configured_proxy.noProxy == ''
def test_set_proxy_config(self, firefox_super_kwargs):
    """Backend proxy must land in the capabilities dict, not the options (v3 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', False):
        Firefox()

    caps = firefox_super_kwargs['capabilities']
    proxy_caps = caps['proxy']
    assert proxy_caps['proxyType'] == 'manual'
    assert proxy_caps['httpProxy'] == '127.0.0.1:12345'
    assert proxy_caps['sslProxy'] == '127.0.0.1:12345'
    assert 'noProxy' not in proxy_caps
    assert caps['acceptInsecureCerts'] is True
    assert firefox_super_kwargs['options'].proxy is None
def get_links(wd: Firefox, p_type: str, tag: Union[str, None] = None,
              limit: int = 10) -> List[Dict[str, Union[int, str]]]:
    """Scrape article metadata from Towards Data Science index pages.

    Chooses the /latest, /tagged/<tag> or /trending index page, scrolls
    until at least *limit* article cards are loaded, then extracts title,
    date, clap count, comment count, and link for each card.  Cards below
    CLAP_THRESHOLD claps are skipped; if filtering leaves fewer than
    *limit* entries, more cards are scrolled in and parsed until the
    quota is met.

    Returns at most *limit* dicts with keys
    'title', 'date', 'claps', 'comments', 'link'.
    """
    links_url = 'https://towardsdatascience.com/latest'
    if tag:
        # tag_map (module-level) translates the CLI tag name to Medium's slug.
        links_url = "https://towardsdatascience.com/tagged/%s" % tag_map[tag]
    print("Parsing Index Page: %s" % links_url)
    # NOTE(review): 'trending' overrides any tag selection — confirm intended.
    if p_type == 'trending':
        links_url = 'https://towardsdatascience.com/trending'
    wd.get(links_url)
    sleep(3)  # let the index page render
    links = []
    num_articles = 0
    articles_parsed = 0
    # Phase 1: keep scrolling until at least `limit` cards are in the DOM.
    while num_articles < limit:
        articles = wd.find_elements_by_css_selector('.postArticle')
        num_articles = len(articles)
        wd.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(3)
    # Phase 2: parse cards, loading more when the clap filter thins them out.
    while True:
        for article in articles:
            title = article.find_element_by_css_selector(
                '.graf--title').text.strip()
            # print(title)
            date = article.find_element_by_css_selector('time').get_attribute(
                'datetime')
            try:
                claps = int(
                    article.find_element_by_css_selector(
                        '.js-multirecommendCountButton').text)
            except:
                # Missing or non-numeric clap badge counts as zero claps.
                claps = 0
            try:
                comments = article.find_element_by_css_selector(
                    '.buttonSet.u-floatRight > a[href]').text
                # Badge text looks like "<count> responses"; keep the number.
                comments = int(comments.split(' ')[0])
            except NoSuchElementException:
                comments = 0
            try:
                link = article.find_element_by_css_selector(
                    '.postArticle-content > a').get_attribute('href')
            except NoSuchElementException:
                # Fallback for cards with an alternative DOM layout.
                link = article.find_element_by_css_selector(
                    '.postArticle > div:nth-child(2) > a').get_attribute(
                    'href')
            if claps < CLAP_THRESHOLD:
                continue
            links.append({
                'title': title,
                'date': date,
                'claps': claps,
                'comments': comments,
                'link': link
            })
        if len(links) >= limit:
            break
        else:
            print("Getting More articles to match threshold...")
            # Skip cards already parsed; only the newly loaded tail is scanned.
            articles_parsed += len(articles)
            wd.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(5)
            articles = wd.find_elements_by_css_selector(
                '.postArticle')[articles_parsed:]
    return links[:limit]
def scrollBottom(wd: Firefox):
    """Scroll the page down in 500px steps so lazily-loaded content appears."""
    page_height = int(wd.execute_script("return document.body.scrollHeight;"))
    offset = 0
    while offset < page_height:
        wd.execute_script("window.scrollTo(0,%i)" % offset)
        sleep(0.5)
        offset += 500
# Load local configuration (paths to the Firefox binary and geckodriver).
config_filename = "config.local.json"
with open(config_filename) as config_file:
    config = json.load(config_file)
ff_path = config['FIREFOX_PATH']
gd_path = config['GECKODRIVER_PATH']

# Start each run with a clean scratch directory.
if os.path.exists('tmp'):
    shutil.rmtree('tmp')
os.mkdir('tmp')

fo = Options()
fo.headless = True
wd = Firefox(executable_path=gd_path, firefox_binary=ff_path, options=fo)
# seleniumwire header override: send a fixed Referer with every request.
wd.header_overrides = {
    'Referer': 'https://twitter.com/freedom',
}
links = get_links(wd, p_type, args.tag)
# TODO Filter out links already parsed and sent
for link in links:
    get_html(wd, link)
    # Clear all client-side state between articles so each page is fetched
    # with a fresh browser identity.
    wd.delete_all_cookies()
    wd.execute_script('window.localStorage.clear()')
    wd.execute_script('window.sessionStorage.clear()')
# TODO Create TOC
def test_create_backend(self, mock_backend):
    """Constructing the driver must start the backend proxy with defaults."""
    driver = Firefox()

    assert driver.backend
    mock_backend.create.assert_called_once_with(
        addr='127.0.0.1',
        port=0,
        options={},
    )
    mock_backend.create.return_value.address.assert_called_once_with()
class CustomSeleniumMiddleware(SeleniumMiddleware):
    """Scrapy downloader middleware backed by seleniumwire's Firefox.

    Extends scrapy-selenium's SeleniumMiddleware to (a) create the driver
    lazily when the spider opens, (b) scope seleniumwire capture to the
    requested host, (c) randomize the User-Agent per request, and
    (d) return an HtmlResponse carrying the real response headers.
    """

    def __init__(self, driver_name, driver_executable_path, driver_arguments,
                 browser_executable_path):
        # Configuration is stored as-is; the driver itself is built lazily
        # in spider_opened via __init_driver.
        self.driver_name = driver_name
        self.driver_executable_path = driver_executable_path
        self.driver_arguments = driver_arguments
        self.browser_executable_path = browser_executable_path
        self.initialized = False
        self.driver = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook driver creation to spider_opened."""
        middleware = super().from_crawler(crawler)
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        # Create the driver once, no matter how many times the signal fires.
        if self.initialized:
            return
        self.__init_driver()

    def __init_driver(self):
        """Create the seleniumwire Firefox driver from the stored settings."""
        if self.driver_name.lower() != "firefox":
            raise NotConfigured("SELENIUM_DRIVER_NAME must be set to 'firefox'")
        options = FirefoxOptions()
        if self.browser_executable_path:
            options.binary_location = self.browser_executable_path
        for arg in self.driver_arguments:
            options.add_argument(arg)
        self.options = options
        self.driver = Firefox(executable_path=self.driver_executable_path,
                              firefox_options=options)
        self.initialized = True

    def process_request(self, request, spider):
        """Fetch a SeleniumRequest via the driver; pass other requests through."""
        if not isinstance(request, SeleniumRequest):
            return None
        # remove old seleniumwire responses
        del self.driver.requests
        # setup seleniumwire to only save responses to the host we're hitting
        url = urlparse(request.url)
        self.driver.scopes = ['{0}://.*{1}'.format(url.scheme, url.netloc)]
        # setup fake User-Agent
        self.driver.header_overrides = {
            "User-Agent": UserAgent().random,
        }
        # call selenium for the request
        response = super().process_request(request, spider)
        # call seleniumwire for the response
        http_request = self.driver.wait_for_request(response.url)
        headers = http_request.response.headers
        # the remote webserver might send us compressed data, but selenium
        # seems to only return text, so just drop the Content-Encoding header
        if headers.get("Content-Encoding", "").lower() == "gzip":
            del headers["Content-Encoding"]
        # and finally return a response with headers suitable for caching
        return HtmlResponse(
            url=response.url,
            body=response.body,
            encoding="utf-8",
            request=request,
            headers=headers
        )
from seleniumwire.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import os

# Configure Firefox to save binary downloads straight into the working
# directory without prompting.
opts = Options()
opts.set_preference("browser.download.dir", os.getcwd())
# folderList=2 -> use the custom directory above instead of ~/Downloads.
opts.set_preference("browser.download.folderList", 2)
opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                    "application/octet-stream")

driver = Firefox(options=opts)
driver.get("https://euroleague.tv/video/164898")

# seleniumwire records all traffic; list every URL that received a response.
for request in driver.requests:
    if request.response:
        print(request.url, )
def test_no_auto_config_v4(self, firefox_super_kwargs):
    """With auto_config off, no proxy may be set on the options (v4 path)."""
    with patch('seleniumwire.webdriver.SELENIUM_V4', True):
        Firefox(
            seleniumwire_options={'auto_config': False},
            capabilities={'test': 'capability'},
        )

    assert firefox_super_kwargs['options'].proxy is None
def test_accept_insecure_certs(self, firefox_super_kwargs):
    """The driver must enable accept_insecure_certs for the MITM proxy."""
    Firefox()

    options = firefox_super_kwargs['options']
    assert options.accept_insecure_certs is True
def test_allow_hijacking_localhost(self, firefox_super_kwargs):
    """Firefox must be allowed to route localhost traffic through the proxy."""
    Firefox()

    prefs = firefox_super_kwargs['options'].preferences
    assert prefs['network.proxy.allow_hijacking_localhost'] is True