Example #1
    def initiate_webdriven_session(self):
        if not self.url:
            raise Exception("self.url not set")

        options = webdriver.FirefoxOptions()
        if os.environ.get("SELENIUM_VISIBLE", False):
            options.headless = False
        else:
            options.headless = True
        options.accept_insecure_certs = True
        webdriver_conn = os.environ.get("WEBDRIVER_CONN", "local")
        if webdriver_conn == "local":
            # See the README instructions for installing geckodriver
            self.webdriver = webdriver.Firefox(options=options)
        else:
            capabilities = options.to_capabilities()
            self.webdriver = webdriver.Remote(
                webdriver_conn,
                desired_capabilities=capabilities,
                keep_alive=True,
            )

        self.webdriver.implicitly_wait(30)
        self.webdriver.set_window_size(5000, 10000)
        self.wait = WebDriverWait(self.webdriver, 20)
        self.webdriver.get(self.url)
        self.cookies = normalize_cookies(self.webdriver.get_cookies())
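
All of the examples on this page funnel their cookies through normalize_cookies, which is not shown here. A minimal sketch of such a helper, assuming it only needs to flatten Selenium's list of cookie dicts (driver.get_cookies()) and a requests CookieJar into a plain name-to-value dict; the real juriscraper helper may differ:

from requests.utils import dict_from_cookiejar

def normalize_cookies(cookies):
    # Hypothetical sketch of the helper every example calls.
    if isinstance(cookies, list):
        # Selenium: driver.get_cookies() returns a list of dicts.
        return {cookie["name"]: cookie["value"] for cookie in cookies}
    # Otherwise assume a requests CookieJar, as in the ala.py examples below.
    return dict_from_cookiejar(cookies)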
Example #2
 def initiate_webdriven_session(self):
     if not self.url:
         raise Exception("self.url not set")
     self.webdriver = webdriver.PhantomJS(
         executable_path=phantomjs_executable_path,
         service_args=["--ignore-ssl-errors=true", "--ssl-protocol=any"],
         # comment out the line below to see the webdriver log (ghostdriver.log)
         service_log_path=os.path.devnull,
     )
     self.webdriver.implicitly_wait(30)
     self.webdriver.set_window_size(5000, 3000)
     self.wait = WebDriverWait(self.webdriver, 10)
     self.webdriver.get(self.url)
     self.cookies = normalize_cookies(self.webdriver.get_cookies())
Example #3
File: ala.py Project: Allan198/juriscraper
 def _download(self, request_dict={}):
     """Alabama requires a login in order to work. Here, we login, set the
     cookies and then run the usual download method.
     """
     if self.method == 'LOCAL':
         # No need for cookies when testing.
         return super(Site, self)._download(request_dict={})
     else:
         r = requests.post(
             'http://2.alalinc.net/session/login/',
             data={'uid': 'juriscraper', 'pwd': 'freelaw'},
             headers={'User-Agent': 'Juriscraper'}
         )
         self.cookies = normalize_cookies(r.cookies)
         return super(Site, self)._download(request_dict={'cookies': self.cookies})
Example #4
File: ala.py Project: brianwc/juriscraper
 def _download(self, request_dict={}):
     """Alabama requires a login in order to work. Here, we login, set the
     cookies and then run the usual download method.
     """
     if self.method == 'LOCAL':
         # No need for cookies when testing.
         return super(Site, self)._download(request_dict={})
     else:
         r = requests.post('http://2.alalinc.net/session/login/',
                           data={
                               'uid': 'juriscraper',
                               'pwd': 'freelaw'
                           },
                           headers={'User-Agent': 'Juriscraper'})
         self.cookies = normalize_cookies(r.cookies)
         return super(
             Site, self)._download(request_dict={'cookies': self.cookies})
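
For context on where those cookies end up, the request_dict={'cookies': self.cookies} handed to the parent _download presumably flows into a requests call. A simplified, hypothetical illustration of that hand-off (the real base Site class does more than this):

import requests

def fetch_with_session_cookies(url, request_dict):
    # Illustrative stand-in for the base _download: the cookies captured
    # at login are forwarded to the follow-up GET request.
    response = requests.get(url, **request_dict)
    response.raise_for_status()
    return response.text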
Example #5
    def set_cookies(self):
        """Hit the main URL, and get the cookies so we can use them elsewhere.

        This gets around some of their throttling mechanisms.
        """
        logger.info("Running Selenium browser PhantomJS to get the cookies...")
        add_delay(20, 5)
        driver = webdriver.PhantomJS(
            executable_path=phantomjs_executable_path,
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )

        driver.set_window_size(1920, 1080)
        driver.get(self.url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.NAME, "dtEndDate")))
        self.cookies = normalize_cookies(driver.get_cookies())
        driver.close()
Example #6
    def set_cookies(self):
        """Hit the main URL, and get the cookies so we can use them elsewhere.

        This gets around some of their throttling mechanisms.
        """
        logger.info("Running Selenium browser PhantomJS to get the cookies...")
        add_delay(20, 5)
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )

        driver.set_window_size(1920, 1080)
        driver.get(self.url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.NAME, "dtEndDate"))
        )
        self.cookies = normalize_cookies(driver.get_cookies())
        driver.close()
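
webdriver.PhantomJS was deprecated in Selenium 3.8 and removed in Selenium 4, so on current versions the same cookie grab can be written against headless Firefox instead, mirroring Examples #1 and #7. A sketch under that assumption (geckodriver on the PATH; self.url and normalize_cookies as above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def set_cookies(self):
    # Same flow as above, but with headless Firefox instead of PhantomJS.
    options = webdriver.FirefoxOptions()
    options.add_argument("-headless")  # newer equivalent of options.headless = True
    driver = webdriver.Firefox(options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.NAME, "dtEndDate"))
        )
        self.cookies = normalize_cookies(driver.get_cookies())
    finally:
        driver.quit()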
Example #7
    def initiate_webdriven_session(self):
        if not self.url:
            raise Exception("self.url not set")

        webdriver_conn = os.environ.get("WEBDRIVER_CONN", "local")
        if webdriver_conn == "local":
            # See the README instructions for installing geckodriver
            options = webdriver.FirefoxOptions()
            # comment line below to see browser interaction window
            options.headless = True
            options.accept_insecure_certs = True
            self.webdriver = webdriver.Firefox(options=options)

        else:
            # It's a connection string to a remote driver
            options = webdriver.ChromeOptions()
            if not os.environ.get("SELENIUM_VISIBLE", False):
                options.add_argument("headless")
            options.add_argument("silent")
            options.add_experimental_option("w3c", False)

            # Workaround for
            # https://bugs.chromium.org/p/chromium/issues/detail?id=1033941
            arg = "--disable-features=AvoidFlashBetweenNavigation,PaintHolding"
            options.add_argument(arg)

            capabilities = options.to_capabilities()
            self.webdriver = webdriver.Remote(
                webdriver_conn,
                desired_capabilities=capabilities,
                keep_alive=True,
            )

        self.webdriver.implicitly_wait(30)
        self.webdriver.set_window_size(5000, 10000)
        self.wait = WebDriverWait(self.webdriver, 20)
        self.webdriver.get(self.url)
        self.cookies = normalize_cookies(self.webdriver.get_cookies())
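
Examples #1 and #7 pick between a local geckodriver and a remote driver based on the WEBDRIVER_CONN environment variable; a non-empty SELENIUM_VISIBLE disables headless mode in Example #1 and in the remote branch of Example #7. How those variables might be set before running the scraper (the remote URL is purely illustrative):

import os

# Any value other than "local" is passed to webdriver.Remote as the connection
# string, e.g. a Selenium standalone/Grid endpoint (hypothetical URL).
os.environ["WEBDRIVER_CONN"] = "http://localhost:4444/wd/hub"

# Any non-empty value keeps the browser visible; leave it unset for headless runs.
os.environ["SELENIUM_VISIBLE"] = "yes"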
Example #8
File: tex.py Project: janderse/juriscraper
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list
        else:
            logger.info("Running Selenium browser PhantomJS...")
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )

            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
            )
            if self.court_name == 'sc':
                # Supreme Court is checked by default, so we don't want to
                # check it again.
                pass
            else:
                search_supreme_court = driver.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
                )
                if search_supreme_court.is_selected():
                    ActionChains(driver).click(search_supreme_court).perform()

                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                ActionChains(driver).click(search_court_type).perform()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.backwards_days)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))
            # driver.save_screenshot('%s.png' % self.case_date)

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
            )
            self.status = 200
            # driver.save_screenshot('out3.png')

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                if nr_of_pages.text == '1':
                    text = driver.page_source
                    driver.quit()

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)
                else:
                    logger.info("Paginating through %s pages of results." %
                                nr_of_pages.text)
                    logger.info("  Getting page 1")
                    text = driver.page_source

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)

                    for i in range(int(nr_of_pages.text) - 1):
                        logger.info("  Getting page %s" % (i + 2))
                        next_page = driver.find_element_by_class_name('rgPageNext')
                        next_page.click()
                        driver.implicitly_wait(5)

                        text = driver.page_source

                        html_tree = html.fromstring(text)
                        html_tree.make_links_absolute(self.url)

                        remove_anchors = lambda url: url.split('#')[0]
                        html_tree.rewrite_links(remove_anchors)
                        html_pages.append(html_tree)
                    driver.quit()
            return html_pages
Example #9
File: tex.py Project: Allan198/juriscraper
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list
        else:
            logger.info("Running Selenium browser PhantomJS...")
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )
            driver.set_window_size(1920, 1080)
            driver.get(self.url)

            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            driver.implicitly_wait(10)
            if self.court_name == 'sc':
                # Supreme Court is checked by default, so we don't want to
                # check it again.
                pass
            else:
                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                search_court_type.click()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))
            # driver.save_screenshot('out2.png')

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()
            driver.implicitly_wait(20)
            # driver.save_screenshot('out3.png')

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                if nr_of_pages.text == '1':
                    text = driver.page_source
                    driver.close()

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    return html_tree
                else:
                    html_pages = []
                    text = driver.page_source

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)

                    for i in xrange(int(nr_of_pages.text) - 1):
                        next_page = driver.find_element_by_class_name('rgPageNext')
                        next_page.click()
                        driver.implicitly_wait(5)

                        text = driver.page_source

                        html_tree = html.fromstring(text)
                        html_tree.make_links_absolute(self.url)

                        remove_anchors = lambda url: url.split('#')[0]
                        html_tree.rewrite_links(remove_anchors)
                        html_pages.append(html_tree)
                    driver.close()
                    return html_pages
Example #10
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)
            ]
            self.records_nr = len(html_tree_list[0].xpath(
                "//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list
        else:
            logger.info("Running Selenium browser PhantomJS...")
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )
            driver.set_window_size(1920, 1080)
            driver.get(self.url)

            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            driver.implicitly_wait(10)
            if self.court_name == 'sc':
                # Supreme Court is checked by default, so we don't want to
                # check it again.
                pass
            else:
                search_court_type = driver.find_element_by_id(
                    "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".
                    format(court_nr=self.courts[self.court_name]))
                search_court_type.click()

            search_opinions = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys(
                (self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))
            # driver.save_screenshot('out2.png')

            submit = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()
            driver.implicitly_wait(20)
            # driver.save_screenshot('out3.png')

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]'
            )
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]'
            )
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                if nr_of_pages.text == '1':
                    text = driver.page_source
                    driver.close()

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    return html_tree
                else:
                    html_pages = []
                    text = driver.page_source

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)

                    for i in xrange(int(nr_of_pages.text) - 1):
                        next_page = driver.find_element_by_class_name(
                            'rgPageNext')
                        next_page.click()
                        driver.implicitly_wait(5)

                        text = driver.page_source

                        html_tree = html.fromstring(text)
                        html_tree.make_links_absolute(self.url)

                        remove_anchors = lambda url: url.split('#')[0]
                        html_tree.rewrite_links(remove_anchors)
                        html_pages.append(html_tree)
                    driver.close()
                    return html_pages
Example #11
File: tex.py Project: Ro5s/juriscraper
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath("//tr[@class='rgRow' or @class='rgAltRow']"))
            return html_tree_list
        else:
            logger.info("Running Selenium browser PhantomJS...")
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )

            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
            )
            if self.court_name == 'sc':
                # Supreme Court is checked by default, so we don't want to
                # check it again.
                pass
            else:
                search_supreme_court = driver.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
                )
                if search_supreme_court.is_selected():
                    ActionChains(driver).click(search_supreme_court).perform()

                search_court_type = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
                )
                ActionChains(driver).click(search_court_type).perform()

            search_opinions = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_0")
            search_opinions.click()

            search_orders = driver.find_element_by_id("ctl00_ContentPlaceHolder1_chkListDocTypes_1")
            search_orders.click()

            start_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.backwards_days)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))
            # driver.save_screenshot('%s.png' % self.case_date)

            submit = driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchText")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
            )
            self.status = 200
            # driver.save_screenshot('out3.png')

            nr_of_pages = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
            html_pages = []
            if records_nr:
                self.records_nr = int(records_nr.text)
            if nr_of_pages:
                if nr_of_pages.text == '1':
                    text = driver.page_source
                    driver.quit()

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)
                else:
                    logger.info("Paginating through %s pages of results." %
                                nr_of_pages.text)
                    logger.info("  Getting page 1")
                    text = driver.page_source

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)

                    for i in range(int(nr_of_pages.text) - 1):
                        logger.info("  Getting page %s" % (i + 2))
                        next_page = driver.find_element_by_class_name('rgPageNext')
                        next_page.click()
                        driver.implicitly_wait(5)

                        text = driver.page_source

                        html_tree = html.fromstring(text)
                        html_tree.make_links_absolute(self.url)

                        remove_anchors = lambda url: url.split('#')[0]
                        html_tree.rewrite_links(remove_anchors)
                        html_pages.append(html_tree)
                    driver.quit()
            return html_pages
Example #12
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            html_tree_list = [
                super(Site, self)._download(request_dict=request_dict)]
            self.records_nr = len(html_tree_list[0].xpath(
                "//tr[contains(concat('', @id, ''), 'ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00')]")
            )
            return html_tree_list
        else:
            logger.info("Running Selenium browser PhantomJS...")
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )

            driver.set_window_size(1920, 1080)
            driver.get(self.url)
            # Get a screenshot in testing
            # driver.save_screenshot('out.png')

            # Set the cookie
            self.cookies = normalize_cookies(driver.get_cookies())
            # driver.save_screenshot('screenie.png')
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(
                    (By.ID, "ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
                )
            )

            start_date = driver.find_element_by_id("ctl00_Body_C010_ctl00_ctl00_startDate_dateInput")
            start_date.send_keys((self.case_date - timedelta(days=self.interval)).strftime("%m/%d/%Y"))

            end_date = driver.find_element_by_id("ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
            end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))
            # driver.save_screenshot('%s.png' % self.case_date)

            submit = driver.find_element_by_id("Body_C010_ctl00_ctl00_btnSearch")
            submit.click()

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00"))
            )
            self.status = 200
            # driver.save_screenshot('%s.png' % self.case_date)

            try:
                nr_of_pages = driver.find_element_by_xpath(
                    '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[2]')
                records_nr = driver.find_element_by_xpath(
                    '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[1]')
                self.records_nr = int(records_nr.text)
                nr_of_pages = int(nr_of_pages.text)
            except NoSuchElementException:
                try:
                    self.records_nr = len(driver.find_elements_by_xpath(
                        "//tr[contains(concat('', @id, ''), 'ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00')]")
                    )
                    nr_of_pages = 1
                except NoSuchElementException:
                    driver.quit()
                    return []
            html_pages = []
            logger.info("records: {}, pages: {}".format(self.records_nr, nr_of_pages))
            if nr_of_pages == 1:
                text = driver.page_source
                driver.quit()

                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)

                remove_anchors = lambda url: url.split('#')[0]
                html_tree.rewrite_links(remove_anchors)
                html_pages.append(html_tree)
            else:
                logger.info("Paginating through %s pages of results." %
                            nr_of_pages)
                logger.info("  Getting page 1")
                text = driver.page_source

                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)

                remove_anchors = lambda url: url.split('#')[0]
                html_tree.rewrite_links(remove_anchors)
                html_pages.append(html_tree)

                for i in xrange(nr_of_pages - 1):
                    logger.info("  Getting page %s" % (i + 2))
                    next_page = driver.find_element_by_class_name('rgPageNext')
                    next_page.click()
                    driver.implicitly_wait(5)

                    text = driver.page_source

                    html_tree = html.fromstring(text)
                    html_tree.make_links_absolute(self.url)

                    remove_anchors = lambda url: url.split('#')[0]
                    html_tree.rewrite_links(remove_anchors)
                    html_pages.append(html_tree)
                driver.quit()
            return html_pages
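
Examples #8 through #12 repeat the same four steps for every results page: grab page_source, parse it with lxml, make links absolute, and strip anchors. That repeated pattern could be factored into a small helper such as the following (the function name is illustrative, not part of juriscraper):

from lxml import html

def tree_from_driver(driver, base_url):
    # Build an lxml tree from the driver's current page with absolute,
    # anchor-free links, exactly as the pagination loops above do inline.
    tree = html.fromstring(driver.page_source)
    tree.make_links_absolute(base_url)
    tree.rewrite_links(lambda url: url.split('#')[0])
    return tree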