Пример #1
0
def scrape_edmunds() -> None:
    """Walk Edmunds used/CPO inventory result pages in a headless Firefox.

    For every page number from 29 up to 9999 the search-results URL is
    loaded, then the element labelled ``Pagination left`` is clicked and
    the resulting URL printed. The driver is wired to the module-level
    ``interceptor`` / ``resp_interceptor`` callbacks, and a fresh
    ``Session`` is stored in the global ``SESSION`` for the duration of
    the run (presumably for use by those callbacks -- confirm).

    The browser and the session are always released, even when the crawl
    is interrupted or a page load raises.
    """
    global SESSION
    SESSION = Session()
    try:
        opts = FirefoxOptions()
        opts.add_argument("--headless")

        drv = Firefox(options=opts)
        try:
            drv.request_interceptor = interceptor
            drv.response_interceptor = resp_interceptor

            next_xpath = ".//a[@aria-label='Pagination left']"
            for px in range(29, 10000):
                drv.get(
                    "https://www.edmunds.com/inventory/srp.html"
                    f"?inventorytype=used,cpo&pagenumber={px}"
                    "&sort=mileage:asc&radius=500"
                )
                # Retry once per second until the pagination control can be
                # clicked; the page renders it asynchronously.
                # NOTE(review): this never gives up, so a page without the
                # control (e.g. the last page) blocks here indefinitely.
                while True:
                    try:
                        btn = drv.find_element(By.XPATH, next_xpath)
                        btn.click()
                        print(drv.current_url)
                        break
                    # TODO detect last page properly
                    except Exception:
                        time.sleep(1)
        finally:
            # quit() ends the whole browser session; close() only closes
            # the current window.
            drv.quit()
    finally:
        SESSION.close()
Пример #2
0
    def __init_driver(self):
        """Create the Firefox driver from the middleware's stored settings.

        Raises NotConfigured unless ``driver_name`` is "firefox"
        (case-insensitive). On success the built options are kept on
        ``self.options``, the driver on ``self.driver``, and
        ``self.initialized`` is set to True.
        """
        wants_firefox = self.driver_name.lower() == "firefox"
        if not wants_firefox:
            raise NotConfigured("SELENIUM_DRIVER_NAME must be set to 'firefox'")

        ff_options = FirefoxOptions()
        # Only override the browser binary when a path was configured.
        if self.browser_executable_path:
            ff_options.binary_location = self.browser_executable_path
        for cli_arg in self.driver_arguments:
            ff_options.add_argument(cli_arg)
        self.options = ff_options

        self.driver = Firefox(
            executable_path=self.driver_executable_path,
            firefox_options=ff_options,
        )
        self.initialized = True
Пример #3
0
    def test_existing_capability_v4(self, firefox_super_kwargs):
        """A caller-supplied desired capability is still present in the
        kwargs exposed by ``firefox_super_kwargs`` when SELENIUM_V4 is on."""
        requested = {'test': 'capability'}
        with patch('seleniumwire.webdriver.SELENIUM_V4', True):
            Firefox(desired_capabilities=requested)

        forwarded = firefox_super_kwargs['desired_capabilities']
        assert forwarded['test'] == 'capability'
Пример #4
0
    def test_no_proxy_v4(self, firefox_super_kwargs):
        """With SELENIUM_V4 on, ``exclude_hosts`` ends up as the ``noProxy``
        value of the proxy attached to the options object."""
        sw_options = {'exclude_hosts': 'test_host'}
        with patch('seleniumwire.webdriver.SELENIUM_V4', True):
            Firefox(seleniumwire_options=sw_options)

        browser_options = firefox_super_kwargs['options']
        assert browser_options.proxy.noProxy == 'test_host'
Пример #5
0
    def test_no_auto_config(self, firefox_super_kwargs):
        """With SELENIUM_V4 off and ``auto_config`` disabled, no ``proxy``
        entry is added to the capabilities."""
        sw_options = {'auto_config': False}
        caller_caps = {'test': 'capability'}
        with patch('seleniumwire.webdriver.SELENIUM_V4', False):
            Firefox(seleniumwire_options=sw_options, capabilities=caller_caps)

            # Checked while the patch is still active, as in the original.
            assert 'proxy' not in firefox_super_kwargs['capabilities']
Пример #6
0
    def test_no_proxy(self, firefox_super_kwargs):
        """With SELENIUM_V4 off, ``exclude_hosts`` ends up as the ``noProxy``
        entry of the proxy capability."""
        sw_options = {'exclude_hosts': 'test_host'}
        with patch('seleniumwire.webdriver.SELENIUM_V4', False):
            Firefox(seleniumwire_options=sw_options)

        proxy_caps = firefox_super_kwargs['capabilities']['proxy']
        assert proxy_caps['noProxy'] == 'test_host'
Пример #7
0
def get_html(
        wd: Firefox,
        article: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
    """Load an article in the browser and store its processed HTML on it.

    Opens ``article['link']``, scrolls through the page, runs the local
    ``convert_images.js`` and ``cleanify.js`` scripts in the page, and
    saves the resulting page source (with the ``max-width:680px`` rule
    widened to ``90%``) under ``article['html']``.

    Args:
        wd: Driver used to load and script the page.
        article: Mapping with at least ``'link'`` and ``'title'`` keys;
            it is updated in place.

    Returns:
        The same ``article`` mapping, now carrying the ``'html'`` key.
        (Previously the function was annotated as returning a dict but
        returned ``None``; callers that ignore the result are unaffected.)

    Raises:
        Whatever the driver raises when the page has no ``article``
        element, and ``OSError`` if a script file cannot be read.
    """
    wd.get(article['link'])
    print(article['title'])
    sleep(3)

    # Scroll to the bottom to load images
    scrollBottom(wd)

    # For each image, replace with base64
    with open('convert_images.js', 'r') as script_file:
        script_src = script_file.read()

    wd.execute_script(script_src)

    sleep(5)

    # Kept as a sanity check: this lookup raises when the page has no
    # <article> element. Its outerHTML used to be fetched here too, but the
    # value was always overwritten by the full page source below.
    wd.find_element_by_css_selector('article')

    with open('cleanify.js') as script_file:
        script_src = script_file.read()
    wd.execute_script(script_src)
    sleep(3)

    # Get html of the page
    html = wd.page_source
    html = html.replace('max-width:680px', 'max-width:90%')

    article['html'] = html
    return article
Пример #8
0
    def test_set_proxy_config_v4(self, firefox_super_kwargs):
        """With SELENIUM_V4 on, a default Firefox() gets a manual proxy on
        its options pointing at 127.0.0.1:12345 with an empty noProxy."""
        with patch('seleniumwire.webdriver.SELENIUM_V4', True):
            Firefox()

        expected_addr = '127.0.0.1:12345'
        configured = firefox_super_kwargs['options'].proxy

        assert configured.proxyType == ProxyType.MANUAL
        assert configured.httpProxy == expected_addr
        assert configured.sslProxy == expected_addr
        assert configured.noProxy == ''
Пример #9
0
    def test_set_proxy_config(self, firefox_super_kwargs):
        """With SELENIUM_V4 off, a default Firefox() carries the proxy in
        the capabilities (not on the options) and accepts insecure certs."""
        with patch('seleniumwire.webdriver.SELENIUM_V4', False):
            Firefox()

        expected_addr = '127.0.0.1:12345'
        capabilities = firefox_super_kwargs['capabilities']
        proxy_caps = capabilities['proxy']

        assert proxy_caps['proxyType'] == 'manual'
        assert proxy_caps['httpProxy'] == expected_addr
        assert proxy_caps['sslProxy'] == expected_addr
        assert 'noProxy' not in proxy_caps
        assert capabilities['acceptInsecureCerts'] is True
        assert firefox_super_kwargs['options'].proxy is None
Пример #10
0
def _parse_article(article) -> Dict[str, Union[int, str]]:
    """Read title, date, clap count, comment count and link from one
    ``.postArticle`` element."""
    title = article.find_element_by_css_selector('.graf--title').text.strip()
    date = article.find_element_by_css_selector('time').get_attribute(
        'datetime')

    # A missing counter or a non-numeric label counts as zero.
    try:
        claps = int(
            article.find_element_by_css_selector(
                '.js-multirecommendCountButton').text)
    except (NoSuchElementException, ValueError):
        claps = 0
    try:
        comments_text = article.find_element_by_css_selector(
            '.buttonSet.u-floatRight > a[href]').text
        comments = int(comments_text.split(' ')[0])
    except (NoSuchElementException, ValueError):
        comments = 0

    # Two layouts are tried for the article link; if neither matches, the
    # second lookup's exception propagates.
    try:
        link = article.find_element_by_css_selector(
            '.postArticle-content > a').get_attribute('href')
    except NoSuchElementException:
        link = article.find_element_by_css_selector(
            '.postArticle > div:nth-child(2) > a').get_attribute('href')

    return {
        'title': title,
        'date': date,
        'claps': claps,
        'comments': comments,
        'link': link
    }


def get_links(wd: Firefox,
              p_type: str,
              tag: str = None,
              limit: int = 10) -> List[Dict[str, Union[int, str]]]:
    """Collect up to ``limit`` article entries from a Towards Data Science
    index page.

    The index page is ``/latest`` by default, the tagged page for
    ``tag_map[tag]`` when ``tag`` is given, and ``/trending`` whenever
    ``p_type == 'trending'`` (this takes precedence over ``tag``). The page
    is scrolled to load more articles; only articles with at least
    ``CLAP_THRESHOLD`` claps are kept.

    Args:
        wd: Driver used to load and scroll the index page.
        p_type: Page type; only the value ``'trending'`` is special-cased.
        tag: Optional key into ``tag_map``.
        limit: Maximum number of entries to return.

    Returns:
        A list of dicts with ``title``, ``date``, ``claps``, ``comments``
        and ``link`` keys, at most ``limit`` long. Fewer are returned when
        the page stops producing new articles.
    """
    scroll_to_bottom = 'window.scrollTo(0,document.body.scrollHeight)'
    # Give up after this many consecutive scrolls that load nothing new, so
    # a short or exhausted page cannot hang the crawl.
    max_stalls = 3

    links_url = 'https://towardsdatascience.com/latest'
    if tag:
        links_url = "https://towardsdatascience.com/tagged/%s" % tag_map[tag]
    if p_type == 'trending':
        links_url = 'https://towardsdatascience.com/trending'

    # Logged after all overrides so the message shows the URL actually loaded.
    print("Parsing Index Page: %s" % links_url)

    wd.get(links_url)
    sleep(3)

    # Phase 1: scroll until at least `limit` articles are on the page.
    articles = []
    stalls = 0
    while len(articles) < limit and stalls < max_stalls:
        seen_before = len(articles)
        articles = wd.find_elements_by_css_selector('.postArticle')
        stalls = stalls + 1 if len(articles) <= seen_before else 0
        wd.execute_script(scroll_to_bottom)
        sleep(3)

    # Phase 2: parse the loaded articles, fetching more while too few pass
    # the clap threshold.
    links = []
    articles_parsed = 0
    stalls = 0
    while True:
        for article in articles:
            entry = _parse_article(article)
            if entry['claps'] < CLAP_THRESHOLD:
                continue
            links.append(entry)

        if len(links) >= limit:
            break

        print("Getting More articles to match threshold...")
        articles_parsed += len(articles)
        wd.execute_script(scroll_to_bottom)
        sleep(5)
        # Only the articles that appeared since the last pass.
        articles = wd.find_elements_by_css_selector(
            '.postArticle')[articles_parsed:]
        if articles:
            stalls = 0
        else:
            stalls += 1
            if stalls >= max_stalls:
                break

    return links[:limit]
Пример #11
0
def scrollBottom(wd: Firefox):
    """Scroll the page from the top towards the bottom in 500-pixel steps.

    The page height is read once up front via ``document.body.scrollHeight``;
    each step is followed by a half-second pause.
    """
    page_height = int(wd.execute_script("return document.body.scrollHeight;"))
    offset = 0
    while offset < page_height:
        wd.execute_script("window.scrollTo(0,%i)" % offset)
        sleep(0.5)
        offset += 500
Пример #12
0
        # NOTE(review): the enclosing function and the branch this
        # assignment belongs to start above the visible fragment.
        config_filename = "config.local.json"

    # Load the JSON config; it must provide FIREFOX_PATH and GECKODRIVER_PATH.
    with open(config_filename) as config_file:
        config = json.load(config_file)

    ff_path = config['FIREFOX_PATH']
    gd_path = config['GECKODRIVER_PATH']

    # Start from an empty 'tmp' directory on every run.
    if os.path.exists('tmp'):
        shutil.rmtree('tmp')

    os.mkdir('tmp')

    # Headless Firefox using the configured browser binary and geckodriver.
    fo = Options()
    fo.headless = True
    wd = Firefox(executable_path=gd_path, firefox_binary=ff_path, options=fo)
    # Set a fixed Referer via header_overrides (presumably applied to every
    # request the driver makes -- confirm against the selenium-wire version).
    wd.header_overrides = {
        'Referer': 'https://twitter.com/freedom',
    }
    links = get_links(wd, p_type, args.tag)

    # TODO Filter out links already parsed and sent

    # Fetch each article, then clear cookies and web storage before the next.
    for link in links:
        get_html(wd, link)
        wd.delete_all_cookies()
        wd.execute_script('window.localStorage.clear()')
        wd.execute_script('window.sessionStorage.clear()')

    # TODO Create TOC
Пример #13
0
    def test_create_backend(self, mock_backend):
        """Constructing Firefox() creates the backend exactly once on
        127.0.0.1 port 0 with empty options, and queries its address once."""
        driver = Firefox()
        create_call = mock_backend.create

        assert driver.backend
        create_call.assert_called_once_with(addr='127.0.0.1', port=0, options={})
        create_call.return_value.address.assert_called_once_with()
Пример #14
0
class CustomSeleniumMiddleware(SeleniumMiddleware):
    """Selenium middleware that starts Firefox lazily and returns responses
    carrying the HTTP headers captured by selenium-wire.

    The driver is created when the ``spider_opened`` signal fires rather
    than in the constructor. For each ``SeleniumRequest`` the parent
    middleware renders the page, and the headers of the matching captured
    request are attached to the returned ``HtmlResponse``.
    """

    def __init__(self, driver_name, driver_executable_path, driver_arguments,
                 browser_executable_path):
        """Store the driver settings without starting a browser.

        NOTE(review): the parent initializer is not called -- presumably
        because it would create the driver immediately; confirm.
        """
        self.driver_name = driver_name
        self.driver_executable_path = driver_executable_path
        self.driver_arguments = driver_arguments
        self.browser_executable_path = browser_executable_path
        self.initialized = False
        self.driver = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware via the parent factory and hook
        ``spider_opened`` so the driver is created when a spider starts."""
        middleware = super().from_crawler(crawler)
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        """Create the driver on the first call; later calls are no-ops."""
        if self.initialized:
            return
        self.__init_driver()

    def __init_driver(self):
        """Create the Firefox driver from the stored settings.

        Raises NotConfigured unless ``driver_name`` is "firefox"
        (case-insensitive).
        """
        if self.driver_name.lower() != "firefox":
            raise NotConfigured("SELENIUM_DRIVER_NAME must be set to 'firefox'")
        options = FirefoxOptions()
        if self.browser_executable_path:
            options.binary_location = self.browser_executable_path
        for arg in self.driver_arguments:
            options.add_argument(arg)

        self.options = options

        self.driver = Firefox(executable_path=self.driver_executable_path,
                              firefox_options=options)
        self.initialized = True

    def process_request(self, request, spider):
        """Render a ``SeleniumRequest`` and return it with captured headers.

        Returns None for any other request type, leaving it to the rest of
        the downloader chain. Otherwise returns an ``HtmlResponse`` built
        from the parent's rendered body and the headers selenium-wire
        recorded for the response URL.
        """
        if not isinstance(request, SeleniumRequest):
            return None

        # remove old seleniumwire responses
        del self.driver.requests

        # setup seleniumwire to only save responses to the host we're hitting
        url = urlparse(request.url)
        self.driver.scopes = ['{0}://.*{1}'.format(url.scheme, url.netloc)]

        # setup fake User-Agent
        self.driver.header_overrides = {
            "User-Agent": UserAgent().random,
        }

        # call selenium for the request
        response = super().process_request(request, spider)

        # call seleniumwire for the response
        http_request = self.driver.wait_for_request(response.url)
        headers = http_request.response.headers

        # The remote webserver might send us compressed data, but selenium
        # seems to only return decoded text. Drop Content-Encoding whatever
        # its value (gzip, br, deflate, ...), so that nothing downstream
        # tries to decompress the already-decoded body.
        if headers.get("Content-Encoding"):
            del headers["Content-Encoding"]

        # and finally return a response with headers suitable for caching
        return HtmlResponse(
            url=response.url,
            body=response.body,
            encoding="utf-8",
            request=request,
            headers=headers
        )
Пример #15
0
from seleniumwire.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import os

# Firefox download preferences: save into the current working directory
# (folderList=2 selects the custom directory given in browser.download.dir)
# and never prompt for application/octet-stream downloads.
opts = Options()
opts.set_preference("browser.download.dir", os.getcwd())
opts.set_preference("browser.download.folderList", 2)
opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                    "application/octet-stream")

driver = Firefox(options=opts)
try:
    driver.get("https://euroleague.tv/video/164898")

    # List the URL of every captured request that received a response.
    for request in driver.requests:
        if request.response:
            print(request.url)
finally:
    # Always end the browser session so no Firefox process is left running.
    driver.quit()
Пример #16
0
    def test_no_auto_config_v4(self, firefox_super_kwargs):
        """With SELENIUM_V4 on and ``auto_config`` disabled, no proxy is set
        on the options object."""
        sw_options = {'auto_config': False}
        caller_caps = {'test': 'capability'}
        with patch('seleniumwire.webdriver.SELENIUM_V4', True):
            Firefox(seleniumwire_options=sw_options, capabilities=caller_caps)

            # Checked while the patch is still active, as in the original.
            browser_options = firefox_super_kwargs['options']
            assert browser_options.proxy is None
Пример #17
0
    def test_accept_insecure_certs(self, firefox_super_kwargs):
        """A default Firefox() has ``accept_insecure_certs`` enabled on its
        options."""
        Firefox()

        browser_options = firefox_super_kwargs['options']
        insecure_allowed = browser_options.accept_insecure_certs
        assert insecure_allowed is True
Пример #18
0
    def test_allow_hijacking_localhost(self, firefox_super_kwargs):
        """A default Firefox() sets the
        ``network.proxy.allow_hijacking_localhost`` preference to True."""
        Firefox()

        pref_name = 'network.proxy.allow_hijacking_localhost'
        prefs = firefox_super_kwargs['options'].preferences
        assert prefs[pref_name] is True