def _get_download_urls(self): """Links from the root page go to a second page where the real links are posted. """ def fetcher(seed_url): if self.method == 'LOCAL': return "No case names fetched during tests." else: """Goes to second page, grabs the link and returns it.""" r = requests.get( seed_url, allow_redirects=False, headers={'User-Agent': 'Juriscraper'}, verify=certifi.where(), ) r.raise_for_status() html_tree = html.fromstring(r.text) html_tree.make_links_absolute(self.url) path_to_audio_file = "//*[@class='padboxauto_MediaContent']//a/@href" try: url = html_tree.xpath(path_to_audio_file)[0] except IndexError: # The URL wasn't found, so something is wrong and we'll have to # fix it in the _post_parse() method. url = '' return url path = "//tr[@class='dg_tr']/td[6]//@href" seed_urls = self.html.xpath(path) return DeferringList(seed=seed_urls, fetcher=fetcher)
def _get_download_urls(self): """We use a fetcher and a DeferringList object and a HEAD request to test whether the wpd exists for a case""" def fetcher(html_link): if self.test_mode_enabled(): return html_link # Can't fetch remote during tests case_number = re.search(r"(\d+)", html_link).group(0) wpd_link = "http://www.ndcourts.gov/wp/%s.wpd" % case_number r = requests.head( wpd_link, allow_redirects=False, headers={"User-Agent": "Juriscraper"}, ) if r.status_code == 200: return wpd_link else: return html_link if self.crawl_date >= date(1998, 10, 1): path = '//a/@href[contains(., "/court/opinions/")]' seed = list(self.html.xpath(path)) else: path = "//ul//a[text()]/@href" seed = list(self.html.xpath(path)) return DeferringList(seed=seed, fetcher=fetcher)
def _get_summaries(self):
    def fetcher(url):
        r = requests.get(
            url,
            allow_redirects=False,
            headers={'User-Agent': 'Juriscraper'},
        )
        # Throw an error if a bad status code is returned.
        r.raise_for_status()
        html_tree = html.fromstring(r.text)
        html_tree.make_links_absolute(self.url)
        path = ('//p[contains(@style, "justify")]'
                '/span[@style="font-weight: bold"]/..'
                '/following-sibling::p[not(contains(@style, "justify"))]'
                '[position()=2]/following-sibling::p')
        summary_string = ""
        for e in html_tree.xpath(path):
            s = html.tostring(e, method='html', encoding='unicode')
            summary_string += s
        return get_clean_body_content(summary_string,
                                      remove_extra_tags=['span'])

    path = "//td[@class='center']/table[3]//tr/td[6]/div/a/@href"
    seed_urls = self.html.xpath(path)
    if seed_urls:
        return DeferringList(seed=seed_urls, fetcher=fetcher)
    else:
        return []
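# The long XPath above walks from a bold heading to the paragraphs that
# follow it. A self-contained demo of the same following-sibling axis on
# a toy document (the markup is invented for illustration):
from lxml import html

doc = html.fromstring(
    "<div>"
    "<p style='text-align: justify'>"
    "<span style='font-weight: bold'>Heading</span></p>"
    "<p>metadata</p><p>metadata</p>"
    "<p>First summary paragraph.</p>"
    "<p>Second summary paragraph.</p>"
    "</div>"
)
# `..` steps up to the <p> holding the bold span; following-sibling::p
# then selects later <p> elements at the same level, filtered by position.
for p in doc.xpath("//span[@style='font-weight: bold']/.."
                   "/following-sibling::p[position() > 2]"):
    print(html.tostring(p, method='html', encoding='unicode'))
# Prints the two summary paragraphs, skipping the metadata rows.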
def _get_case_names(self): """The case names on the main page only show the first half of long case names. As a result, we browse to the pages they link to and compile those pages using Selenium and PhantomJS. Normally we wouldn't do the compilation step, but, alas, these pages put all their data into JavaScript functions, where are then executed to create the page. A couple other notes: 1. When developing, if you stop this after dirver.get(), you can get the content of the page by doing this: https://stackoverflow.com/questions/22739514 """ def fetcher(html_link): if self.method == 'LOCAL': return "No case names fetched during tests." else: full_url = 'http://2.alalinc.net/library/view/file/?lib=SUPREME&file={seed}'.format( seed=html_link) driver = webdriver.PhantomJS( executable_path='/usr/local/phantomjs/phantomjs', service_log_path=os.path. devnull, # Disable ghostdriver.log ) r = requests.get( full_url, headers={'User-Agent': 'Juriscraper'}, cookies=self._cookies, ) r.raise_for_status() # Create a fake HTML page from r.text that can be requested by # selenium. See: https://stackoverflow.com/questions/24834838/ driver.get('data:text/html,' + r.text) case_name = driver.find_element_by_xpath( "//table[contains(descendant::text(), 'Description')]//tr[2]" ).text case_name = ' '.join(case_name.split()) case_name = case_name.split('(')[0] case_name = case_name.split('PETITION')[0] return case_name seed = list( self.html.xpath( "//value[2]/text()[not(contains(../../value[7]/text(), 'list of decisions'))]" )) logger.info( "Getting {count} pages and rendering them using Selenium browser PhantomJS..." .format(count=len(seed))) return DeferringList(seed=seed, fetcher=fetcher)
def _get_case_names(self):
    def fetcher(e):
        """This reaches out to a secondary system and scrapes the
        correct info.
        """
        if self.method == 'LOCAL':
            return "No case names fetched during tests."
        else:
            url = 'http://162.114.92.78/dockets/SearchCaseDetail.asp'
            anchor_text = html.tostring(e, method='text', encoding='unicode')
            m = self.docket_number_regex.search(anchor_text)
            r = requests.post(
                url,
                headers={'User-Agent': 'Juriscraper'},
                data={
                    'txtyear': m.group('year'),
                    'txtcasenumber': m.group('docket_num').strip('0'),
                    # Spelling presumably matches the remote form's
                    # field name.
                    'cmdnamesearh': 'Search',
                },
            )
            # Throw an error if a bad status code is returned.
            r.raise_for_status()

            # If the encoding is iso-8859-1, switch it to cp1252 (a superset).
            if r.encoding == 'ISO-8859-1':
                r.encoding = 'cp1252'

            # Grab the content.
            text = self._clean_text(r.text)
            html_tree = html.fromstring(text)

            # And finally, we parse out the good stuff.
            parties_path = "//tr[descendant::text()[contains(., 'Appell')]]//td[3]//text()"
            case_name_parts = []
            for s in html_tree.xpath(parties_path):
                if s.strip():
                    case_name_parts.append(titlecase(s.strip().lower()))
                if len(case_name_parts) == 2:
                    break
            return ' v. '.join(case_name_parts)

    # Get the docket numbers to use for queries.
    path = "//a[@href[contains(., '{m}')]]".format(m=self.hrefs_contain)
    elements = filter(self._has_valid_docket_number, self.html.xpath(path))
    return DeferringList(seed=elements, fetcher=fetcher)
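# docket_number_regex above must expose 'year' and 'docket_num' named
# groups. A hypothetical pattern consistent with that usage (the real
# pattern lives on the scraper class and may differ):
import re

docket_number_regex = re.compile(r"(?P<year>\d{4})-(?P<docket_num>\d+)")

m = docket_number_regex.search("2015-000123-MR")
if m:
    print(m.group('year'))                   # '2015'
    print(m.group('docket_num').strip('0'))  # '123' -- strip('0') removes
                                             # zeros at both ends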
def _get_case_names(self):
    def fetcher(url):
        if self.method == 'LOCAL':
            return "No case names fetched during tests."
        else:
            r = requests.get(
                url,
                allow_redirects=True,
                headers={'User-Agent': 'Juriscraper'},
                verify=certifi.where(),
            )
            r.raise_for_status()
            html_tree = html.fromstring(r.text)
            html_tree.make_links_absolute(self.url)
            plaintiff = ''
            defendant = ''
            try:
                plaintiff = html_tree.xpath(
                    "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']"
                    "/following-sibling::div/text()"
                )[0]
                defendant = html_tree.xpath(
                    "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']"
                    "/following-sibling::div/text()"
                )[0]
            except IndexError:
                logger.warn("No title or defendant found for {}".format(url))
            if defendant.strip():
                # If there's a defendant
                return titlecase('%s v. %s' % (plaintiff, defendant))
            else:
                return titlecase(plaintiff)

    seed_urls = []
    for html_tree in self.html:
        page_records_count = self._get_opinion_count(html_tree)
        for record in range(page_records_count):
            path = ("id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')"
                    "/td[5]//@href").format(n=record)
            seed_urls.append(html_tree.xpath(path)[0])
    if seed_urls:
        return DeferringList(seed=seed_urls, fetcher=fetcher)
    else:
        return []
def _get_case_names(self):
    def fetcher(url):
        if self.method == 'LOCAL':
            return "No case names fetched during tests."
        else:
            r = requests.get(
                url,
                allow_redirects=False,
                headers={'User-Agent': 'Juriscraper'},
            )
            r.raise_for_status()
            html_tree = html.fromstring(r.text)
            html_tree.make_links_absolute(self.url)
            plaintiff = html_tree.xpath(
                "//text()[contains(., 'Style')]/ancestor::tr[1]/td[2]/text()"
            )[0]
            defendant = html_tree.xpath(
                "//text()[contains(., 'v.:')]/ancestor::tr[1]/td[2]/text()"
            )[0]
            if defendant.strip():
                # If there's a defendant
                return titlecase('%s v. %s' % (plaintiff, defendant))
            else:
                return titlecase(plaintiff)

    seed_urls = []
    if isinstance(self.html, list):
        for html_tree in self.html:
            page_records_count = self._get_opinion_count(html_tree)
            # NOTE: xrange and the bare map() below are Python 2 idioms.
            # Under Python 3, use range() and wrap map() in list() so the
            # `if seed_urls` truthiness check still works.
            for record in xrange(page_records_count):
                path = ("id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')"
                        "/td[5]//@href").format(n=record)
                seed_urls.append(html_tree.xpath(path)[0])
    else:
        seed_urls = map(self._return_seed_url, range(self.records_nr))
    if seed_urls:
        return DeferringList(seed=seed_urls, fetcher=fetcher)
    else:
        return []
def _get_download_urls(self): """Links from the root page go to a second page where the real links are posted. """ def fetcher(seed_url): if self.method == 'LOCAL': return "No case names fetched during tests." else: # Goes to second page, grabs the link and returns it. html_tree = self._get_html_tree_by_url(seed_url) path_to_audio_file = "//*[@class='padboxauto_MediaContent']//a/@href" try: url = html_tree.xpath(path_to_audio_file)[0] except IndexError: # The URL wasn't found, so something is wrong and we'll have to # fix it in the _post_parse() method. url = '' return url path = "//tr[@class='dg_tr']/td[6]//@href" seed_urls = self.html.xpath(path) return DeferringList(seed=seed_urls, fetcher=fetcher)
def _get_case_names(self):
    def fetcher(url):
        if self.method == 'LOCAL':
            return "No case names fetched during tests."
        else:
            html_tree = self._get_html_tree_by_url(url, self.request_dict)
            plaintiff = ''
            defendant = ''
            try:
                plaintiff = html_tree.xpath(
                    "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']"
                    "/following-sibling::div/text()"
                )[0]
                defendant = html_tree.xpath(
                    "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']"
                    "/following-sibling::div/text()"
                )[0]
            except IndexError:
                logger.warn("No title or defendant found for {}".format(url))
            if defendant.strip():
                # If there's a defendant
                return titlecase('%s v. %s' % (plaintiff, defendant))
            else:
                return titlecase(plaintiff)

    seed_urls = []
    for html_tree in self.html:
        page_records_count = self._get_opinion_count(html_tree)
        for record in range(page_records_count):
            path = ("id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')"
                    "/td[5]//@href").format(n=record)
            seed_urls.append(html_tree.xpath(path)[0])
    if seed_urls:
        return DeferringList(seed=seed_urls, fetcher=fetcher)
    else:
        return []
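# _get_opinion_count isn't shown in this section. A hypothetical
# implementation consistent with how it's used above -- counting the
# grid rows whose ids the seed-URL loop then addresses one by one:
def _get_opinion_count(self, html_tree):
    """Hypothetical: count data rows in the RadGrid by their id prefix."""
    rows = html_tree.xpath(
        "//tr[starts-with(@id, "
        "'ctl00_ContentPlaceHolder1_grdDocuments_ctl00__')]"
    )
    return len(rows)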
def _get_docket_numbers(self):
    path = "//a[@href[contains(., '{m}')]]".format(m=self.hrefs_contain)
    elements = filter(self._has_valid_docket_number, self.html.xpath(path))
    return map(self._return_docket_number_from_str, elements)

def _has_valid_docket_number(self, e):
    text = html.tostring(e, method='text', encoding='unicode')
    if self.docket_number_regex.search(text):
        return True
    else:
        return False

def _return_docket_number_from_str(self, e):
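# On Python 3, filter() and map() return one-shot iterators -- always
# truthy and without len() -- which breaks the truthiness checks and
# indexable seeds used throughout this section. A Python 3-safe sketch
# of _get_docket_numbers using list comprehensions instead:
def _get_docket_numbers(self):
    path = "//a[@href[contains(., '{m}')]]".format(m=self.hrefs_contain)
    # List comprehensions produce real lists on both Python 2 and 3.
    elements = [e for e in self.html.xpath(path)
                if self._has_valid_docket_number(e)]
    return [self._return_docket_number_from_str(e) for e in elements]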