Пример #1
0
    def initiate_webdriven_session(self):
        """Open a Firefox WebDriver session on ``self.url`` and keep its cookies.

        Environment variables read:

        * ``SELENIUM_VISIBLE`` -- any non-empty value turns headless mode off.
        * ``WEBDRIVER_CONN`` -- ``"local"`` (the default) starts a local
          Firefox; any other value is handed to ``webdriver.Remote`` as the
          connection string.

        Sets ``self.webdriver``, ``self.wait`` and ``self.cookies``.

        :raises Exception: if ``self.url`` is empty or unset.
        """
        if not self.url:
            raise Exception("self.url not set")

        firefox_options = webdriver.FirefoxOptions()
        # Headless unless the caller explicitly asked to see the browser.
        firefox_options.headless = not os.environ.get("SELENIUM_VISIBLE", False)
        firefox_options.accept_insecure_certs = True

        connection = os.environ.get("WEBDRIVER_CONN", "local")
        if connection != "local":
            browser = webdriver.Remote(
                connection,
                desired_capabilities=firefox_options.to_capabilities(),
                keep_alive=True,
            )
        else:
            # See README instruction for installing geckodriver
            browser = webdriver.Firefox(options=firefox_options)
        self.webdriver = browser

        browser.implicitly_wait(30)
        browser.set_window_size(5000, 10000)
        self.wait = WebDriverWait(browser, 20)
        browser.get(self.url)
        self.cookies = normalize_cookies(browser.get_cookies())
Пример #2
0
 def initiate_webdriven_session(self):
     """Open a PhantomJS WebDriver session on ``self.url`` and keep its cookies.

     Sets ``self.webdriver``, ``self.wait`` and ``self.cookies``.

     :raises Exception: if ``self.url`` is empty or unset.
     """
     if not self.url:
         raise Exception("self.url not set")

     ssl_flags = ["--ignore-ssl-errors=true", "--ssl-protocol=any"]
     browser = webdriver.PhantomJS(
         executable_path=phantomjs_executable_path,
         service_args=ssl_flags,
         # The driver log is discarded; drop this argument to see it.
         service_log_path=os.path.devnull,
     )
     self.webdriver = browser

     browser.implicitly_wait(30)
     browser.set_window_size(5000, 3000)
     self.wait = WebDriverWait(browser, 10)
     browser.get(self.url)
     self.cookies = normalize_cookies(browser.get_cookies())
Пример #3
0
 def _download(self, request_dict={}):
     """Log in to the Alabama site if needed, then run the parent download.

     Outside 'LOCAL' mode a login POST is made first; its cookies are
     normalized, stored on ``self.cookies`` and passed to the parent
     ``_download``. The ``request_dict`` argument itself is not forwarded.
     """
     if self.method == 'LOCAL':
         # Tests run against local files, so no cookies are needed.
         outgoing = {}
     else:
         login_response = requests.post(
             'http://2.alalinc.net/session/login/',
             data={'uid': 'juriscraper', 'pwd': 'freelaw'},
             headers={'User-Agent': 'Juriscraper'}
         )
         self.cookies = normalize_cookies(login_response.cookies)
         outgoing = {'cookies': self.cookies}
     return super(Site, self)._download(request_dict=outgoing)
Пример #4
0
 def _download(self, request_dict={}):
     """Download the page, logging in to the Alabama site first when live.

     In 'LOCAL' mode the parent ``_download`` is called with an empty
     request dict. Otherwise a login POST is issued, the resulting cookies
     are normalized into ``self.cookies``, and the parent ``_download`` is
     called with those cookies. The ``request_dict`` argument is not used.
     """
     # Guard clause: local test runs need no login and no cookies.
     if self.method == 'LOCAL':
         return super(Site, self)._download(request_dict={})

     credentials = {'uid': 'juriscraper', 'pwd': 'freelaw'}
     login = requests.post('http://2.alalinc.net/session/login/',
                           data=credentials,
                           headers={'User-Agent': 'Juriscraper'})
     self.cookies = normalize_cookies(login.cookies)
     cookie_request = {'cookies': self.cookies}
     return super(Site, self)._download(request_dict=cookie_request)
Пример #5
0
    def set_cookies(self):
        """Hit the main URL, and get the cookies so we can use them elsewhere.

        This gets around some of their throttling mechanisms.

        A PhantomJS browser loads ``self.url``, waits up to 30 seconds for the
        element named ``dtEndDate`` to be present, and stores the normalized
        browser cookies on ``self.cookies``.

        Any exception raised while loading or waiting (e.g. Selenium's
        timeout error) propagates to the caller after the browser has been
        shut down.
        """
        logger.info("Running Selenium browser PhantomJS to get the cookies...")
        add_delay(20, 5)
        driver = webdriver.PhantomJS(
            executable_path=phantomjs_executable_path,
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.NAME, "dtEndDate")))
            self.cookies = normalize_cookies(driver.get_cookies())
        finally:
            # quit() ends the whole session and stops the driver process;
            # close() only closes the current window. Running it in
            # ``finally`` keeps a failed page load from leaking a browser.
            driver.quit()
Пример #6
0
    def set_cookies(self):
        """Hit the main URL, and get the cookies so we can use them elsewhere.

        This gets around some of their throttling mechanisms.

        A PhantomJS browser loads ``self.url``, waits up to 30 seconds for the
        element named ``dtEndDate`` to be present, and stores the normalized
        browser cookies on ``self.cookies``.

        Any exception raised while loading or waiting (e.g. Selenium's
        timeout error) propagates to the caller after the browser has been
        shut down.
        """
        logger.info("Running Selenium browser PhantomJS to get the cookies...")
        add_delay(20, 5)
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.NAME, "dtEndDate"))
            )
            self.cookies = normalize_cookies(driver.get_cookies())
        finally:
            # quit() ends the whole session and stops the driver process;
            # close() only closes the current window. Running it in
            # ``finally`` keeps a failed page load from leaking a browser.
            driver.quit()
Пример #7
0
    def initiate_webdriven_session(self):
        """Open a WebDriver session on ``self.url`` and keep its cookies.

        ``WEBDRIVER_CONN`` selects the browser: ``"local"`` (the default)
        starts a headless local Firefox; any other value is used as the
        connection string for a remote Chrome, which runs headless unless
        ``SELENIUM_VISIBLE`` is set to a non-empty value.

        Sets ``self.webdriver``, ``self.wait`` and ``self.cookies``.

        :raises Exception: if ``self.url`` is empty or unset.
        """
        if not self.url:
            raise Exception("self.url not set")

        connection = os.environ.get("WEBDRIVER_CONN", "local")
        if connection != "local":
            # Anything else is a connection string to a remote driver.
            chrome_options = webdriver.ChromeOptions()
            show_browser = os.environ.get("SELENIUM_VISIBLE", False)
            if not show_browser:
                chrome_options.add_argument("headless")
            chrome_options.add_argument("silent")
            chrome_options.add_experimental_option("w3c", False)

            # Workaround for
            # https://bugs.chromium.org/p/chromium/issues/detail?id=1033941
            chrome_options.add_argument(
                "--disable-features=AvoidFlashBetweenNavigation,PaintHolding"
            )

            browser = webdriver.Remote(
                connection,
                desired_capabilities=chrome_options.to_capabilities(),
                keep_alive=True,
            )
        else:
            # See README instruction for installing geckodriver
            firefox_options = webdriver.FirefoxOptions()
            # Set headless to False to watch the browser interaction window.
            firefox_options.headless = True
            firefox_options.accept_insecure_certs = True
            browser = webdriver.Firefox(options=firefox_options)
        self.webdriver = browser

        browser.implicitly_wait(30)
        browser.set_window_size(5000, 10000)
        self.wait = WebDriverWait(browser, 20)
        browser.get(self.url)
        self.cookies = normalize_cookies(browser.get_cookies())
Пример #8
0
    def _download(self, request_dict=None):
        """Return the search-result pages as a list of parsed lxml trees.

        In 'LOCAL' mode the parent ``_download`` result is wrapped in a
        one-element list and ``self.records_nr`` is the number of
        ``rgRow``/``rgAltRow`` table rows in it.

        Otherwise a PhantomJS browser loads ``self.url``, selects the court
        and document types, fills in the date range ending at
        ``self.case_date`` and starting ``self.backwards_days`` earlier,
        submits the form and collects every results page. ``self.cookies``,
        ``self.status`` and ``self.records_nr`` are set along the way.

        :param request_dict: stored on ``self.request_dict`` and forwarded to
            the parent ``_download`` in LOCAL mode. ``None`` means a new
            empty dict.
        :return: list of HTML trees, one per results page.
        """
        # A fresh dict per call: a literal ``{}`` default is a single shared
        # object that would be stored on every instance.
        if request_dict is None:
            request_dict = {}
        self.request_dict = request_dict
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list

        def parse_page(page_source):
            # Parse one page, make links absolute and drop '#fragment' parts.
            tree = html.fromstring(page_source)
            tree.make_links_absolute(self.url)
            tree.rewrite_links(lambda url: url.split('#')[0])
            return tree

        logger.info("Running Selenium browser PhantomJS...")
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        # try/finally so the PhantomJS process is shut down even when a
        # lookup or wait below raises.
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
            )
            if self.court_name != 'sc':
                # Supreme Court is checked by default: untick it, then tick
                # the court we actually want. For 'sc' nothing needs doing.
                search_supreme_court = driver.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
                )
                if search_supreme_court.is_selected():
                    ActionChains(driver).click(search_supreme_court).perform()

                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                ActionChains(driver).click(search_court_type).perform()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.backwards_days)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
            )
            self.status = 200

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                page_total = nr_of_pages.text
                if page_total == '1':
                    html_pages.append(parse_page(driver.page_source))
                else:
                    logger.info("Paginating through %s pages of results." %
                                page_total)
                    logger.info("  Getting page 1")
                    html_pages.append(parse_page(driver.page_source))

                    for i in range(int(page_total) - 1):
                        logger.info("  Getting page %s" % (i + 2))
                        driver.find_element_by_class_name('rgPageNext').click()
                        # NOTE(review): implicitly_wait() only sets the
                        # element-lookup timeout, it does not pause here --
                        # confirm the next page has loaded before this read.
                        driver.implicitly_wait(5)
                        html_pages.append(parse_page(driver.page_source))
            return html_pages
        finally:
            driver.quit()
Пример #9
0
    def _download(self, request_dict=None):
        """Return the search-result pages as a list of parsed lxml trees.

        In 'LOCAL' mode the parent ``_download`` result is wrapped in a
        one-element list and ``self.records_nr`` is the number of
        ``rgRow``/``rgAltRow`` table rows in it.

        Otherwise a PhantomJS browser loads ``self.url``, selects the court
        and document types, searches the five days up to ``self.case_date``
        and collects every results page. ``self.cookies`` and
        ``self.records_nr`` are set along the way.

        A single results page is returned as a one-element list, the same
        shape as the LOCAL and multi-page paths (it used to be a bare tree).

        :param request_dict: forwarded to the parent ``_download`` in LOCAL
            mode. ``None`` means a new empty dict.
        :return: list of HTML trees, one per results page.
        """
        # A fresh dict per call instead of one shared ``{}`` default object.
        if request_dict is None:
            request_dict = {}
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list

        def parse_page(page_source):
            # Parse one page, make links absolute and drop '#fragment' parts.
            tree = html.fromstring(page_source)
            tree.make_links_absolute(self.url)
            tree.rewrite_links(lambda url: url.split('#')[0])
            return tree

        logger.info("Running Selenium browser PhantomJS...")
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        # try/finally so the PhantomJS process is shut down even when a
        # lookup below raises.
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)

            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            # Element lookups below retry for up to 10 seconds.
            driver.implicitly_wait(10)
            if self.court_name != 'sc':
                # Supreme Court is checked by default, so only other courts
                # need their checkbox ticked.
                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                search_court_type.click()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()
            # Lookups of the results header retry for up to 20 seconds.
            driver.implicitly_wait(20)

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                page_total = nr_of_pages.text
                html_pages.append(parse_page(driver.page_source))
                if page_total != '1':
                    # range() instead of the Python 2-only xrange().
                    for _ in range(int(page_total) - 1):
                        driver.find_element_by_class_name('rgPageNext').click()
                        # NOTE(review): implicitly_wait() only sets the
                        # element-lookup timeout, it does not pause here --
                        # confirm the next page has loaded before this read.
                        driver.implicitly_wait(5)
                        html_pages.append(parse_page(driver.page_source))
            return html_pages
        finally:
            # quit() stops the driver process; close() only closed the window.
            driver.quit()
Пример #10
0
    def _download(self, request_dict=None):
        """Return the search-result pages as a list of parsed lxml trees.

        In 'LOCAL' mode the parent ``_download`` result is wrapped in a
        one-element list and ``self.records_nr`` is the number of
        ``rgRow``/``rgAltRow`` table rows in it.

        Otherwise a PhantomJS browser loads ``self.url``, selects the court
        and document types, searches the five days up to ``self.case_date``
        and collects every results page. ``self.cookies`` and
        ``self.records_nr`` are set along the way.

        A single results page is returned as a one-element list, the same
        shape as the LOCAL and multi-page paths (it used to be a bare tree).

        :param request_dict: forwarded to the parent ``_download`` in LOCAL
            mode. ``None`` means a new empty dict.
        :return: list of HTML trees, one per results page.
        """
        # A fresh dict per call instead of one shared ``{}`` default object.
        if request_dict is None:
            request_dict = {}
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)
            ]
            self.records_nr = len(html_tree_list[0].xpath(
                "//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list

        def parse_page(page_source):
            # Parse one page, make links absolute and drop '#fragment' parts.
            tree = html.fromstring(page_source)
            tree.make_links_absolute(self.url)
            tree.rewrite_links(lambda url: url.split('#')[0])
            return tree

        logger.info("Running Selenium browser PhantomJS...")
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        # try/finally so the PhantomJS process is shut down even when a
        # lookup below raises.
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)

            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            # Element lookups below retry for up to 10 seconds.
            driver.implicitly_wait(10)
            if self.court_name != 'sc':
                # Supreme Court is checked by default, so only other courts
                # need their checkbox ticked.
                search_court_type = driver.find_element_by_id(
                    "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".
                    format(court_nr=self.courts[self.court_name]))
                search_court_type.click()

            search_opinions = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys(
                (self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

            submit = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()
            # Lookups of the results header retry for up to 20 seconds.
            driver.implicitly_wait(20)

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]'
            )
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]'
            )
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                page_total = nr_of_pages.text
                html_pages.append(parse_page(driver.page_source))
                if page_total != '1':
                    # range() instead of the Python 2-only xrange().
                    for _ in range(int(page_total) - 1):
                        driver.find_element_by_class_name(
                            'rgPageNext').click()
                        # NOTE(review): implicitly_wait() only sets the
                        # element-lookup timeout, it does not pause here --
                        # confirm the next page has loaded before this read.
                        driver.implicitly_wait(5)
                        html_pages.append(parse_page(driver.page_source))
            return html_pages
        finally:
            # quit() stops the driver process; close() only closed the window.
            driver.quit()
Пример #11
0
    def _download(self, request_dict=None):
        """Return the search-result pages as a list of parsed lxml trees.

        In 'LOCAL' mode the parent ``_download`` result is wrapped in a
        one-element list and ``self.records_nr`` is the number of
        ``rgRow``/``rgAltRow`` table rows in it.

        Otherwise a PhantomJS browser loads ``self.url``, selects the court
        and document types, fills in the date range ending at
        ``self.case_date`` and starting ``self.backwards_days`` earlier,
        submits the form and collects every results page. ``self.cookies``,
        ``self.status`` and ``self.records_nr`` are set along the way.

        :param request_dict: stored on ``self.request_dict`` and forwarded to
            the parent ``_download`` in LOCAL mode. ``None`` means a new
            empty dict.
        :return: list of HTML trees, one per results page.
        """
        # A fresh dict per call: a literal ``{}`` default is a single shared
        # object that would be stored on every instance.
        if request_dict is None:
            request_dict = {}
        self.request_dict = request_dict
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list

        def parse_page(page_source):
            # Parse one page, make links absolute and drop '#fragment' parts.
            tree = html.fromstring(page_source)
            tree.make_links_absolute(self.url)
            tree.rewrite_links(lambda url: url.split('#')[0])
            return tree

        logger.info("Running Selenium browser PhantomJS...")
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        # try/finally so the PhantomJS process is shut down even when a
        # lookup or wait below raises.
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
            )
            if self.court_name != 'sc':
                # Supreme Court is checked by default: untick it, then tick
                # the court we actually want. For 'sc' nothing needs doing.
                search_supreme_court = driver.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
                )
                if search_supreme_court.is_selected():
                    ActionChains(driver).click(search_supreme_court).perform()

                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                ActionChains(driver).click(search_court_type).perform()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.backwards_days)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
            )
            self.status = 200

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                page_total = nr_of_pages.text
                if page_total == '1':
                    html_pages.append(parse_page(driver.page_source))
                else:
                    logger.info("Paginating through %s pages of results." %
                                page_total)
                    logger.info("  Getting page 1")
                    html_pages.append(parse_page(driver.page_source))

                    for i in range(int(page_total) - 1):
                        logger.info("  Getting page %s" % (i + 2))
                        driver.find_element_by_class_name('rgPageNext').click()
                        # NOTE(review): implicitly_wait() only sets the
                        # element-lookup timeout, it does not pause here --
                        # confirm the next page has loaded before this read.
                        driver.implicitly_wait(5)
                        html_pages.append(parse_page(driver.page_source))
            return html_pages
        finally:
            driver.quit()
Пример #12
0
    def _download(self, request_dict=None):
        """Return the opinion-search result pages as a list of lxml trees.

        In 'LOCAL' mode the parent ``_download`` result is wrapped in a
        one-element list and ``self.records_nr`` is the number of table rows
        whose id contains the opinions-grid prefix.

        Otherwise a PhantomJS browser loads ``self.url``, fills in the date
        range ending at ``self.case_date`` and starting ``self.interval``
        days earlier, submits the search and collects every results page.
        ``self.cookies``, ``self.status`` and ``self.records_nr`` are set
        along the way.

        :param request_dict: forwarded to the parent ``_download`` in LOCAL
            mode. ``None`` means a new empty dict.
        :return: list of HTML trees, one per results page.
        """
        # A fresh dict per call instead of one shared ``{}`` default object.
        if request_dict is None:
            request_dict = {}
        # Rows of the opinions grid; used when the pager summary is missing.
        grid_rows_xpath = "//tr[contains(concat('', @id, ''), 'ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00')]"
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath(grid_rows_xpath))
            return html_tree_list

        def parse_page(page_source):
            # Parse one page, make links absolute and drop '#fragment' parts.
            tree = html.fromstring(page_source)
            tree.make_links_absolute(self.url)
            tree.rewrite_links(lambda url: url.split('#')[0])
            return tree

        logger.info("Running Selenium browser PhantomJS...")
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        # try/finally so the PhantomJS process is shut down even when a
        # lookup or wait below raises.
        try:
            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(
                    (By.ID, "ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
                )
            )

            start_date = driver.find_element_by_id("ctl00_Body_C010_ctl00_ctl00_startDate_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.interval)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

            submit = driver.find_element_by_id("Body_C010_ctl00_ctl00_btnSearch")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00"))
            )
            self.status = 200

            try:
                pages_elem = driver.find_element_by_xpath(
                    '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[2]')
                records_elem = driver.find_element_by_xpath(
                    '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[1]')
                self.records_nr = int(records_elem.text)
                nr_of_pages = int(pages_elem.text)
            except NoSuchElementException:
                # No pager summary on the page: count the grid rows and treat
                # the result as one page. find_elements_* returns an empty
                # list rather than raising, so no further handler is needed.
                self.records_nr = len(
                    driver.find_elements_by_xpath(grid_rows_xpath))
                nr_of_pages = 1

            logger.info("records: {}, pages: {}".format(self.records_nr, nr_of_pages))
            if nr_of_pages != 1:
                logger.info("Paginating through %s pages of results." %
                            nr_of_pages)
                logger.info("  Getting page 1")
            html_pages = [parse_page(driver.page_source)]

            # range() instead of the Python 2-only xrange(); it is empty
            # when there is a single page.
            for i in range(nr_of_pages - 1):
                logger.info("  Getting page %s" % (i + 2))
                driver.find_element_by_class_name('rgPageNext').click()
                # NOTE(review): implicitly_wait() only sets the element-lookup
                # timeout, it does not pause here -- confirm the next page has
                # loaded before this read.
                driver.implicitly_wait(5)
                html_pages.append(parse_page(driver.page_source))
            return html_pages
        finally:
            driver.quit()