def initiate_webdriven_session(self):
    """Open a Firefox Selenium session, load ``self.url`` and keep cookies.

    The browser runs headless unless the ``SELENIUM_VISIBLE`` environment
    variable holds a non-empty value. ``WEBDRIVER_CONN`` selects the
    driver: the default ``"local"`` starts a local Firefox, any other
    value is handed to ``webdriver.Remote`` as the connection target.

    On success the driver is stored on ``self.webdriver``, a 20 second
    ``WebDriverWait`` on ``self.wait`` and the normalized cookies on
    ``self.cookies``.

    Raises:
        Exception: if ``self.url`` is empty or unset.
    """
    if not self.url:
        raise Exception("self.url not set")

    firefox_options = webdriver.FirefoxOptions()
    # Headless is simply the negation of "SELENIUM_VISIBLE is set".
    firefox_options.headless = not os.environ.get("SELENIUM_VISIBLE", False)
    firefox_options.accept_insecure_certs = True

    connection = os.environ.get("WEBDRIVER_CONN", "local")
    if connection != "local":
        self.webdriver = webdriver.Remote(
            connection,
            desired_capabilities=firefox_options.to_capabilities(),
            keep_alive=True,
        )
    else:
        # See README instruction for installing geckodriver
        self.webdriver = webdriver.Firefox(options=firefox_options)

    browser = self.webdriver
    browser.implicitly_wait(30)
    browser.set_window_size(5000, 10000)
    self.wait = WebDriverWait(browser, 20)
    browser.get(self.url)
    self.cookies = normalize_cookies(browser.get_cookies())
def initiate_webdriven_session(self):
    """Open a PhantomJS Selenium session, load ``self.url`` and keep cookies.

    On success the driver is stored on ``self.webdriver``, a 10 second
    ``WebDriverWait`` on ``self.wait`` and the normalized cookies on
    ``self.cookies``.

    Raises:
        Exception: if ``self.url`` is empty or unset.
    """
    if not self.url:
        raise Exception("self.url not set")

    phantom_args = ["--ignore-ssl-errors=true", "--ssl-protocol=any"]
    self.webdriver = webdriver.PhantomJS(
        executable_path=phantomjs_executable_path,
        service_args=phantom_args,
        # The webdriver log is sent to os.devnull; change this path to
        # inspect it.
        service_log_path=os.path.devnull,
    )

    browser = self.webdriver
    browser.implicitly_wait(30)
    browser.set_window_size(5000, 3000)
    self.wait = WebDriverWait(browser, 10)
    browser.get(self.url)
    self.cookies = normalize_cookies(browser.get_cookies())
def _download(self, request_dict=None):
    """Log in to the Alabama site, then run the usual download.

    Alabama requires a login in order to work. Here, we log in, store the
    session cookies on ``self.cookies`` and hand them to the parent
    ``_download``.

    Args:
        request_dict: optional extra keyword arguments for the parent
            download. They are forwarded unchanged, except that the
            ``cookies`` entry is replaced with the login cookies when
            not running in ``LOCAL`` mode. The caller's dict is never
            mutated.

    Returns:
        Whatever the parent class's ``_download`` returns.
    """
    # Copy so neither a shared default nor the caller's dict is mutated.
    request_dict = dict(request_dict or {})

    if self.method == 'LOCAL':
        # No need for cookies when testing.
        return super(Site, self)._download(request_dict=request_dict)

    # NOTE(review): credentials are posted over plain HTTP; confirm
    # whether the endpoint is also served over HTTPS.
    r = requests.post(
        'http://2.alalinc.net/session/login/',
        data={'uid': 'juriscraper', 'pwd': 'freelaw'},
        headers={'User-Agent': 'Juriscraper'},
        # Without a timeout, requests can block forever on a dead server.
        timeout=60,
    )
    self.cookies = normalize_cookies(r.cookies)
    request_dict['cookies'] = self.cookies
    return super(Site, self)._download(request_dict=request_dict)
def _download(self, request_dict=None):
    """Log in to the Alabama site, then run the usual download.

    Alabama requires a login in order to work. Here, we log in, store the
    session cookies on ``self.cookies`` and hand them to the parent
    ``_download``.

    Args:
        request_dict: optional extra keyword arguments for the parent
            download. They are forwarded unchanged, except that the
            ``cookies`` entry is replaced with the login cookies when
            not running in ``LOCAL`` mode. The caller's dict is never
            mutated.

    Returns:
        Whatever the parent class's ``_download`` returns.
    """
    # Work on a private copy: the previous ``{}`` default was shared
    # between calls and the caller's values were silently dropped.
    forwarded = dict(request_dict or {})

    if self.method == 'LOCAL':
        # No need for cookies when testing.
        return super(Site, self)._download(request_dict=forwarded)

    # NOTE(review): credentials are posted over plain HTTP; confirm
    # whether the endpoint is also served over HTTPS.
    r = requests.post(
        'http://2.alalinc.net/session/login/',
        data={'uid': 'juriscraper', 'pwd': 'freelaw'},
        headers={'User-Agent': 'Juriscraper'},
        # Without a timeout, requests can block forever on a dead server.
        timeout=60,
    )
    self.cookies = normalize_cookies(r.cookies)
    forwarded['cookies'] = self.cookies
    return super(Site, self)._download(request_dict=forwarded)
def set_cookies(self):
    """Hit the main URL and store its cookies on ``self.cookies``.

    The cookies are used elsewhere; this gets around some of the site's
    throttling mechanisms. The page is considered loaded once an element
    named ``dtEndDate`` is present (waits up to 30 seconds).

    The PhantomJS driver is always shut down with ``quit()``, even when
    loading the page or waiting for the element fails, so no browser
    process is left behind.
    """
    logger.info("Running Selenium browser PhantomJS to get the cookies...")
    # Helper defined elsewhere; presumably a randomized pause - verify.
    add_delay(20, 5)
    driver = webdriver.PhantomJS(
        executable_path=phantomjs_executable_path,
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.NAME, "dtEndDate"))
        )
        self.cookies = normalize_cookies(driver.get_cookies())
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        driver.quit()
def set_cookies(self):
    """Hit the main URL and store its cookies on ``self.cookies``.

    The cookies are used elsewhere; this gets around some of the site's
    throttling mechanisms. The page is considered loaded once an element
    named ``dtEndDate`` is present (waits up to 30 seconds).

    The PhantomJS driver is always shut down with ``quit()``, even when
    loading the page or waiting for the element fails, so no browser
    process is left behind.
    """
    logger.info("Running Selenium browser PhantomJS to get the cookies...")
    # Helper defined elsewhere; presumably a randomized pause - verify.
    add_delay(20, 5)
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.NAME, "dtEndDate"))
        )
        self.cookies = normalize_cookies(driver.get_cookies())
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        driver.quit()
def initiate_webdriven_session(self):
    """Open a Selenium session, load ``self.url`` and keep its cookies.

    ``WEBDRIVER_CONN`` selects the driver. The default ``"local"`` starts
    a headless local Firefox. Any other value is treated as a connection
    string for a remote driver, configured with Chrome options; that
    remote browser is headless unless ``SELENIUM_VISIBLE`` holds a
    non-empty value.

    On success the driver is stored on ``self.webdriver``, a 20 second
    ``WebDriverWait`` on ``self.wait`` and the normalized cookies on
    ``self.cookies``.

    Raises:
        Exception: if ``self.url`` is empty or unset.
    """
    if not self.url:
        raise Exception("self.url not set")

    connection = os.environ.get("WEBDRIVER_CONN", "local")
    if connection != "local":
        # It's a connection string to a remote driver
        chrome_options = webdriver.ChromeOptions()
        show_browser = os.environ.get("SELENIUM_VISIBLE", False)
        if not show_browser:
            chrome_options.add_argument("headless")
        chrome_options.add_argument("silent")
        chrome_options.add_experimental_option("w3c", False)
        # Workaround for
        # https://bugs.chromium.org/p/chromium/issues/detail?id=1033941
        chrome_options.add_argument(
            "--disable-features=AvoidFlashBetweenNavigation,PaintHolding"
        )
        self.webdriver = webdriver.Remote(
            connection,
            desired_capabilities=chrome_options.to_capabilities(),
            keep_alive=True,
        )
    else:
        # See README instruction for installing geckodriver
        firefox_options = webdriver.FirefoxOptions()
        # Set headless to False to see the browser interaction window.
        firefox_options.headless = True
        firefox_options.accept_insecure_certs = True
        self.webdriver = webdriver.Firefox(options=firefox_options)

    browser = self.webdriver
    browser.implicitly_wait(30)
    browser.set_window_size(5000, 10000)
    self.wait = WebDriverWait(browser, 20)
    browser.get(self.url)
    self.cookies = normalize_cookies(browser.get_cookies())
def _download(self, request_dict=None):
    """Run the court's search form in PhantomJS and return result pages.

    In ``LOCAL`` mode the parent ``_download`` is used and its single
    result is wrapped in a list. Otherwise the search form at
    ``self.url`` is filled in (court, document types, and a date range
    ending at ``self.case_date`` and starting ``self.backwards_days``
    earlier), submitted, and every results page is collected.

    Side effects: sets ``self.request_dict``, ``self.records_nr`` and,
    outside ``LOCAL`` mode, ``self.cookies`` and ``self.status``.

    Args:
        request_dict: optional keyword arguments forwarded to the parent
            ``_download`` in ``LOCAL`` mode.

    Returns:
        A list of HTML trees, one per results page, with links made
        absolute and URL fragments stripped.
    """
    if request_dict is None:
        # Fresh dict per call; a shared ``{}`` default would end up
        # stored on every instance via self.request_dict.
        request_dict = {}
    self.request_dict = request_dict

    if self.method == 'LOCAL':
        html_tree_list = [
            super(Site, self)._download(request_dict=request_dict)]
        self.records_nr = len(html_tree_list[0].xpath(
            "//tr[@class='rgRow' or @class='rgAltRow']"))
        return html_tree_list

    def parse_page(page_source):
        """Parse page source into a tree with absolute, anchor-free links."""
        tree = html.fromstring(page_source)
        tree.make_links_absolute(self.url)
        tree.rewrite_links(lambda url: url.split('#')[0])
        return tree

    logger.info("Running Selenium browser PhantomJS...")
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)

        # Set the cookie
        self.cookies = normalize_cookies(driver.get_cookies())

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(
                (By.XPATH,
                 "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
        )

        # Supreme Court is checked by default, so for 'sc' there is
        # nothing to change; for any other court, untick it first.
        if self.court_name != 'sc':
            search_supreme_court = driver.find_element_by_xpath(
                "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
            )
            if search_supreme_court.is_selected():
                ActionChains(driver).click(search_supreme_court).perform()

            search_court_type = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
            )
            ActionChains(driver).click(search_court_type).perform()

        search_opinions = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
        search_opinions.click()

        search_orders = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
        search_orders.click()

        start_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
        start_date.send_keys(
            (self.case_date - timedelta(days=self.backwards_days)
             ).strftime("%m/%d/%Y"))

        end_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
        end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

        submit = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_btnSearchText")
        submit.click()

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
        )
        self.status = 200

        # find_element_* raises when nothing matches and the returned
        # elements are always truthy, so no further None checks are
        # needed below.
        nr_of_pages = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
        records_nr = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
        self.records_nr = int(records_nr.text)

        page_count = nr_of_pages.text
        multiple_pages = page_count != '1'
        if multiple_pages:
            logger.info("Paginating through %s pages of results." % page_count)
            logger.info(" Getting page 1")

        html_pages = [parse_page(driver.page_source)]
        if multiple_pages:
            for i in range(int(page_count) - 1):
                logger.info(" Getting page %s" % (i + 2))
                next_page = driver.find_element_by_class_name('rgPageNext')
                next_page.click()
                # NOTE(review): implicitly_wait only sets the element
                # lookup timeout; it does not pause until the next page
                # has rendered. Consider an explicit wait here.
                driver.implicitly_wait(5)
                html_pages.append(parse_page(driver.page_source))
        return html_pages
    finally:
        # Always end the PhantomJS session, including on errors.
        driver.quit()
def _download(self, request_dict=None):
    """Run the court's search form in PhantomJS and return the results.

    In ``LOCAL`` mode the parent ``_download`` is used and its single
    result is wrapped in a list. Otherwise the search form at
    ``self.url`` is filled in (court, document types, and the five days
    up to ``self.case_date``), submitted, and the results are collected.

    Side effects: sets ``self.records_nr`` and, outside ``LOCAL`` mode,
    ``self.cookies``.

    Args:
        request_dict: optional keyword arguments forwarded to the parent
            ``_download`` in ``LOCAL`` mode.

    Returns:
        In ``LOCAL`` mode, a one-element list. Otherwise a single HTML
        tree when the site reports exactly one page of results, or a
        list of HTML trees (one per page) when it reports several. Links
        are made absolute and URL fragments stripped.
    """
    if request_dict is None:
        # Avoid sharing one mutable default dict between calls.
        request_dict = {}

    if self.method == 'LOCAL':
        html_tree_list = [
            super(Site, self)._download(request_dict=request_dict)]
        self.records_nr = len(html_tree_list[0].xpath(
            "//tr[@class='rgRow' or @class='rgAltRow']"))
        return html_tree_list

    def parse_page(page_source):
        """Parse page source into a tree with absolute, anchor-free links."""
        tree = html.fromstring(page_source)
        tree.make_links_absolute(self.url)
        tree.rewrite_links(lambda url: url.split('#')[0])
        return tree

    logger.info("Running Selenium browser PhantomJS...")
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)

        # Set the cookie
        self.cookies = normalize_cookies(driver.get_cookies())
        # Element lookups below may wait up to 10 seconds.
        driver.implicitly_wait(10)

        # Supreme Court is checked by default, so we don't want to
        # check it again.
        if self.court_name != 'sc':
            search_court_type = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
            )
            search_court_type.click()

        search_opinions = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
        search_opinions.click()

        search_orders = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
        search_orders.click()

        start_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
        start_date.send_keys(
            (self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

        end_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
        end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

        submit = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_btnSearchText")
        submit.click()
        # Give the pager lookups below up to 20 seconds to succeed.
        driver.implicitly_wait(20)

        # find_element_* raises when nothing matches and the returned
        # elements are always truthy, so no None checks are needed.
        nr_of_pages = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
        records_nr = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
        self.records_nr = int(records_nr.text)

        first_page = parse_page(driver.page_source)
        if nr_of_pages.text == '1':
            # Single page: callers receive the bare tree, not a list.
            return first_page

        html_pages = [first_page]
        # range() works on both Python 2 and 3; xrange is Python 2 only.
        for _ in range(int(nr_of_pages.text) - 1):
            next_page = driver.find_element_by_class_name('rgPageNext')
            next_page.click()
            # NOTE(review): implicitly_wait only sets the element lookup
            # timeout; it does not pause until the next page has
            # rendered. Consider an explicit wait here.
            driver.implicitly_wait(5)
            html_pages.append(parse_page(driver.page_source))
        return html_pages
    finally:
        # quit() ends the PhantomJS session (close() only closes the
        # window) and runs on errors too.
        driver.quit()
def _download(self, request_dict=None):
    """Run the court's search form in PhantomJS and return the results.

    In ``LOCAL`` mode the parent ``_download`` is used and its single
    result is wrapped in a list. Otherwise the search form at
    ``self.url`` is filled in (court, document types, and the five days
    up to ``self.case_date``), submitted, and the results are collected.

    Side effects: sets ``self.records_nr`` and, outside ``LOCAL`` mode,
    ``self.cookies``.

    Args:
        request_dict: optional keyword arguments forwarded to the parent
            ``_download`` in ``LOCAL`` mode.

    Returns:
        In ``LOCAL`` mode, a one-element list. Otherwise a single HTML
        tree when the site reports exactly one page of results, or a
        list of HTML trees (one per page) when it reports several. Links
        are made absolute and URL fragments stripped.
    """
    if request_dict is None:
        # Avoid sharing one mutable default dict between calls.
        request_dict = {}

    if self.method == 'LOCAL':
        html_tree_list = [
            super(Site, self)._download(request_dict=request_dict)
        ]
        self.records_nr = len(html_tree_list[0].xpath(
            "//tr[@class='rgRow' or @class='rgAltRow']"))
        return html_tree_list

    def parse_page(page_source):
        """Parse page source into a tree with absolute, anchor-free links."""
        tree = html.fromstring(page_source)
        tree.make_links_absolute(self.url)
        tree.rewrite_links(lambda url: url.split('#')[0])
        return tree

    logger.info("Running Selenium browser PhantomJS...")
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)

        # Set the cookie
        self.cookies = normalize_cookies(driver.get_cookies())
        # Element lookups below may wait up to 10 seconds.
        driver.implicitly_wait(10)

        # Supreme Court is checked by default, so we don't want to
        # check it again.
        if self.court_name != 'sc':
            search_court_type = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".
                format(court_nr=self.courts[self.court_name]))
            search_court_type.click()

        search_opinions = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
        search_opinions.click()

        search_orders = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
        search_orders.click()

        start_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
        start_date.send_keys(
            (self.case_date - timedelta(days=5)).strftime("%m/%d/%Y"))

        end_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
        end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

        submit = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_btnSearchText")
        submit.click()
        # Give the pager lookups below up to 20 seconds to succeed.
        driver.implicitly_wait(20)

        # find_element_* raises when nothing matches and the returned
        # elements are always truthy, so no None checks are needed.
        nr_of_pages = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]'
        )
        records_nr = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]'
        )
        self.records_nr = int(records_nr.text)

        first_page = parse_page(driver.page_source)
        if nr_of_pages.text == '1':
            # Single page: callers receive the bare tree, not a list.
            return first_page

        html_pages = [first_page]
        # range() works on both Python 2 and 3; xrange is Python 2 only.
        for _ in range(int(nr_of_pages.text) - 1):
            next_page = driver.find_element_by_class_name(
                'rgPageNext')
            next_page.click()
            # NOTE(review): implicitly_wait only sets the element lookup
            # timeout; it does not pause until the next page has
            # rendered. Consider an explicit wait here.
            driver.implicitly_wait(5)
            html_pages.append(parse_page(driver.page_source))
        return html_pages
    finally:
        # quit() ends the PhantomJS session (close() only closes the
        # window) and runs on errors too.
        driver.quit()
def _download(self, request_dict=None):
    """Run the court's search form in PhantomJS and return result pages.

    In ``LOCAL`` mode the parent ``_download`` is used and its single
    result is wrapped in a list. Otherwise the search form at
    ``self.url`` is filled in (court, document types, and a date range
    ending at ``self.case_date`` and starting ``self.backwards_days``
    earlier), submitted, and every results page is collected.

    Side effects: sets ``self.request_dict``, ``self.records_nr`` and,
    outside ``LOCAL`` mode, ``self.cookies`` and ``self.status``.

    Args:
        request_dict: optional keyword arguments forwarded to the parent
            ``_download`` in ``LOCAL`` mode.

    Returns:
        A list of HTML trees, one per results page, with links made
        absolute and URL fragments stripped.
    """
    if request_dict is None:
        # Fresh dict per call; a shared ``{}`` default would end up
        # stored on every instance via self.request_dict.
        request_dict = {}
    self.request_dict = request_dict

    if self.method == 'LOCAL':
        html_tree_list = [
            super(Site, self)._download(request_dict=request_dict)]
        self.records_nr = len(html_tree_list[0].xpath(
            "//tr[@class='rgRow' or @class='rgAltRow']"))
        return html_tree_list

    def parse_page(page_source):
        """Parse page source into a tree with absolute, anchor-free links."""
        tree = html.fromstring(page_source)
        tree.make_links_absolute(self.url)
        tree.rewrite_links(lambda url: url.split('#')[0])
        return tree

    logger.info("Running Selenium browser PhantomJS...")
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)

        # Set the cookie
        self.cookies = normalize_cookies(driver.get_cookies())

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(
                (By.XPATH,
                 "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_15']"))
        )

        # Supreme Court is checked by default, so for 'sc' there is
        # nothing to change; for any other court, untick it first.
        if self.court_name != 'sc':
            search_supreme_court = driver.find_element_by_xpath(
                "//*[@id='ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}']".format(
                    court_nr=self.courts['sc'])
            )
            if search_supreme_court.is_selected():
                ActionChains(driver).click(search_supreme_court).perform()

            search_court_type = driver.find_element_by_id(
                "ctl00_ContentPlaceHolder1_chkListCourts_{court_nr}".format(
                    court_nr=self.courts[self.court_name])
            )
            ActionChains(driver).click(search_court_type).perform()

        search_opinions = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_0")
        search_opinions.click()

        search_orders = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_chkListDocTypes_1")
        search_orders.click()

        start_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentFrom_dateInput")
        start_date.send_keys(
            (self.case_date - timedelta(days=self.backwards_days)
             ).strftime("%m/%d/%Y"))

        end_date = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_dtDocumentTo_dateInput")
        end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

        submit = driver.find_element_by_id(
            "ctl00_ContentPlaceHolder1_btnSearchText")
        submit.click()

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.ID, "ctl00_ContentPlaceHolder1_grdDocuments"))
        )
        self.status = 200

        # find_element_* raises when nothing matches and the returned
        # elements are always truthy, so no further None checks are
        # needed below.
        nr_of_pages = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[2]')
        records_nr = driver.find_element_by_xpath(
            '//thead//*[contains(concat(" ", normalize-space(@class), " "), " rgInfoPart ")]/strong[1]')
        self.records_nr = int(records_nr.text)

        page_count = nr_of_pages.text
        multiple_pages = page_count != '1'
        if multiple_pages:
            logger.info("Paginating through %s pages of results." % page_count)
            logger.info(" Getting page 1")

        html_pages = [parse_page(driver.page_source)]
        if multiple_pages:
            for i in range(int(page_count) - 1):
                logger.info(" Getting page %s" % (i + 2))
                next_page = driver.find_element_by_class_name('rgPageNext')
                next_page.click()
                # NOTE(review): implicitly_wait only sets the element
                # lookup timeout; it does not pause until the next page
                # has rendered. Consider an explicit wait here.
                driver.implicitly_wait(5)
                html_pages.append(parse_page(driver.page_source))
        return html_pages
    finally:
        # Always end the PhantomJS session, including on errors.
        driver.quit()
def _download(self, request_dict=None):
    """Run the opinion date-range search in PhantomJS and return pages.

    In ``LOCAL`` mode the parent ``_download`` is used and its single
    result is wrapped in a list. Otherwise the search form at
    ``self.url`` is filled in with a date range ending at
    ``self.case_date`` and starting ``self.interval`` days earlier,
    submitted, and every results page is collected.

    Side effects: sets ``self.records_nr`` and, outside ``LOCAL`` mode,
    ``self.cookies`` and ``self.status``.

    Args:
        request_dict: optional keyword arguments forwarded to the parent
            ``_download`` in ``LOCAL`` mode.

    Returns:
        A list of HTML trees, one per results page, with links made
        absolute and URL fragments stripped.
    """
    if request_dict is None:
        # Avoid sharing one mutable default dict between calls.
        request_dict = {}

    rows_xpath = (
        "//tr[contains(concat('', @id, ''), "
        "'ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00')]"
    )

    if self.method == 'LOCAL':
        html_tree_list = [
            super(Site, self)._download(request_dict=request_dict)]
        self.records_nr = len(html_tree_list[0].xpath(rows_xpath))
        return html_tree_list

    def parse_page(page_source):
        """Parse page source into a tree with absolute, anchor-free links."""
        tree = html.fromstring(page_source)
        tree.make_links_absolute(self.url)
        tree.rewrite_links(lambda url: url.split('#')[0])
        return tree

    logger.info("Running Selenium browser PhantomJS...")
    driver = webdriver.PhantomJS(
        executable_path='/usr/local/phantomjs/phantomjs',
        service_log_path=os.path.devnull,  # Disable ghostdriver.log
    )
    try:
        driver.set_window_size(1920, 1080)
        driver.get(self.url)

        # Set the cookie
        self.cookies = normalize_cookies(driver.get_cookies())

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(
                (By.ID, "ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
            )
        )

        start_date = driver.find_element_by_id(
            "ctl00_Body_C010_ctl00_ctl00_startDate_dateInput")
        start_date.send_keys(
            (self.case_date - timedelta(days=self.interval)
             ).strftime("%m/%d/%Y"))

        end_date = driver.find_element_by_id(
            "ctl00_Body_C010_ctl00_ctl00_endDate_dateInput")
        end_date.send_keys(self.case_date.strftime("%m/%d/%Y"))

        submit = driver.find_element_by_id("Body_C010_ctl00_ctl00_btnSearch")
        submit.click()

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.ID, "ctl00_Body_C010_ctl00_ctl00_radGridOpinions_ctl00"))
        )
        self.status = 200

        try:
            nr_of_pages = driver.find_element_by_xpath(
                '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[2]')
            records_nr = driver.find_element_by_xpath(
                '//div[contains(concat(" ", @class, " "), "rgInfoPart")]/strong[1]')
            self.records_nr = int(records_nr.text)
            nr_of_pages = int(nr_of_pages.text)
        except NoSuchElementException:
            # No pager info on the page: count the grid rows directly and
            # treat the result as a single page. find_elements_* returns
            # an empty list when nothing matches, so it needs no handler.
            self.records_nr = len(driver.find_elements_by_xpath(rows_xpath))
            nr_of_pages = 1

        logger.info("records: {}, pages: {}".format(self.records_nr, nr_of_pages))
        if nr_of_pages != 1:
            logger.info("Paginating through %s pages of results." % nr_of_pages)
            logger.info(" Getting page 1")

        html_pages = [parse_page(driver.page_source)]
        # range() works on both Python 2 and 3; xrange is Python 2 only.
        for i in range(nr_of_pages - 1):
            logger.info(" Getting page %s" % (i + 2))
            next_page = driver.find_element_by_class_name('rgPageNext')
            next_page.click()
            # NOTE(review): implicitly_wait only sets the element lookup
            # timeout; it does not pause until the next page has
            # rendered. Consider an explicit wait here.
            driver.implicitly_wait(5)
            html_pages.append(parse_page(driver.page_source))
        return html_pages
    finally:
        # Always end the PhantomJS session, including on errors.
        driver.quit()