def extract_date_summary_from_link(self, link):
    """Return the (date, summary) pair associated with an opinion link.

    The link should be within a <p> tag directly under
    <div id='maincontent'>, but occasionally the court forgets to wrap it
    in a <p>, in which case it sits directly under the
    <div id='maincontent'> container.

    :param link: lxml anchor element for the opinion
    :return: tuple of (converted date, summary string)
    :raises InsanityException: if the link placement is unrecognized or
        the bold date header above the link is empty
    """
    container_id = "maincontent"
    parent = link.getparent()
    parents_parent = parent.getparent()
    if "id" in parent.attrib and parent.attrib["id"] == container_id:
        search_root = link
    elif ("id" in parents_parent.attrib
          and parents_parent.attrib["id"] == container_id):
        search_root = parent
    else:
        raise InsanityException(
            'Unrecognized placement of Opinion url on page: "%s"'
            % link.text_content().strip())

    # Find date from bolded header element above link
    # (ex: "5-14-2014 - Opinions" or "5-21-2014 - Orders")
    element_date = search_root.xpath("./preceding-sibling::b")[-1]
    element_date_text = element_date.text_content().strip().lower()
    try:
        date_string = element_date_text.split()[0]
    except IndexError:
        # FIX: narrowed from a bare except; only an empty header string
        # can fail here, and a bare except would also swallow
        # KeyboardInterrupt/SystemExit.
        raise InsanityException('Unrecognized bold (date) element: "%s"'
                                % element_date_text)

    # Find summary from blockquote element below link
    element_blockquote = search_root.xpath(
        "./following-sibling::blockquote")[0]
    summary = element_blockquote.text_content().strip()

    return convert_date_string(date_string), summary
def _fetch_case_name(self, case_number):
    """Fetch case name for a given docket number + publication year pair.

    Some resources show 'Public Access Restricted' messages and do not
    provide parseable case name information. These will be skipped by
    our system by returning False below. The only other approach would
    be to parse the case name from the raw PDF text itself.
    """
    # Skip any docket number that isn't the expected 12 characters,
    # since we can't know how to fix the court's typo. They likely
    # forgot to '0' pad the beginning or the end of the 'number'
    # suffix, but we can't know for sure.
    if len(case_number) != 12:
        return False

    # Site has non-chained, bad certificate, need to ignore ssl
    # verification for now for scraper to work.
    self.request['verify'] = False

    url = 'https://appellate.kycourts.net/SC/SCDockets/CaseDetails.aspx?cn=%s' % case_number
    html = self._get_html_tree_by_url(url)

    # Halt if there is a (dismissible) error/warning on the page.
    if html.xpath('//div[contains(@class, "alert-dismissible")]'):
        raise InsanityException(
            'Invalid sub-resource url (%s). Is case number (%s) invalid?'
            % (url, case_number))

    # Ensure that only two party substrings are present.
    parties = html.xpath('//td[@class="party"]/text()')
    if len(parties) != 2:
        raise InsanityException(
            'Unexpected party elements. Expected two substrings, got: %s'
            % ', '.join(parties))

    return titlecase(' v. '.join(parties))
def parse_name_from_text(text_list):
    """Extract a case name from the first matching string in text_list.

    Tries a series of regexes that split the case name off from the
    docket-number portion of the string; when no docket substring is
    present, falls back on whatever precedes the first semi-colon.

    :param text_list: list of candidate strings
    :return: the case name substring
    :raises InsanityException: if no strategy can extract a name

    FIX: patterns are now raw strings; the originals relied on invalid
    escape sequences (\\s, \\d) that modern Python flags as warnings.
    """
    regexes = [
        # Expected format
        r"(.*?)(,?\sNos?\.)(.*?)",
        # Clerk typo, forgot "No."/"Nos." substring
        r"(.*?)(,?\s\d+-\d+(,|\s))(.*?)",
        # Same as above, and there's an unconventional docket number
        # like 'SU-14-324' instead of '14-324'. See ri_p_example_4.html
        r"(.*?)(,?\s(?:\w+-)?\d+-\d+(,|\s))(.*?)",
    ]

    # Try each pattern against every string before moving to the next
    # (order matters: the expected format wins over typo workarounds).
    for regex in regexes:
        for text in text_list:
            name_match = re.match(regex, text)
            if name_match:
                return name_match.group(1)

    # "No."/"Nos." and docket missing, fall back on whatever's before
    # first semi-colon
    for text in text_list:
        if ";" in text:
            return text.split(";")[0]

    raise InsanityException(
        'Could not parse name from string: "%s"' % text_list)
def _post_parse(self):
    """Unfortunately, some of the items do not have audio files despite
    appearing in the table and having a link to a supplementary audio
    page. For these items, we set the download_url to '' and this method
    finds the related information for those items and then removes it
    from all the other attributes for the Site object.
    """
    # Sanity-check first. If this fails, we know things were already
    # messed up before we began tinkering with them.
    self._check_sanity()

    # Step one: identify the indexes of the items that need purging.
    purge_indexes = [
        i for i, url in enumerate(self.download_urls) if not url
    ]

    # Quick check: We did find *some* urls, right?
    if len(purge_indexes) == len(self.download_urls):
        raise InsanityException("Didn't get any download URLs. Looks like "
                                "something is wrong in the _post_parse() "
                                "method.")

    # Step two: purge them from every populated attribute, highest
    # index first, so earlier deletions can't shift later positions.
    for index_to_purge in sorted(purge_indexes, reverse=True):
        for attr in self._all_attrs:
            values = getattr(self, attr)
            if values is not None:
                # Attribute holds scraped data; drop the bad item.
                del values[index_to_purge]
def _extract_name_from_text(cls, text): text = text.strip() try: match = re.match(cls.regex, text).group(12) except: raise InsanityException('Unable to parse case name from "%s"' % text) return match.strip().rstrip('.')
def _process_html(self):
    """Build self.cases from each <li> that links to a PDF opinion.

    Each item's text begins with a three-word date, followed by a
    hyphen-joined "[docket(s)] name" string; dockets are optional.

    :raises InsanityException: if the leading text is not a parseable date
    """
    for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
        text = clean_string(item.text_content())
        date_string = " ".join(text.split()[0:3])
        try:
            convert_date_string(date_string)
        except Exception:
            # FIX: narrowed from a bare except so KeyboardInterrupt /
            # SystemExit propagate.
            raise InsanityException('Unexpected text format: "%s"' % text)
        docket_name = text.replace(date_string, "").strip().lstrip("-")

        # sometimes the records include a docket number(s) as the
        # first words in the second half of the hyphenated string,
        # but some don't include a docket at all. So we test to see
        # if the first word is numeric (minus the slash characters
        # used to conjoin multiple docket numbers).
        docket, name = docket_name.split(None, 1)
        # FIX: the original checked docket[0] -- the first *character* --
        # which can never contain a slash to strip. The comment above and
        # the sibling implementation both intend the whole first word.
        first_word = docket.replace("/", "")
        if not first_word.isnumeric():
            docket = ""
            name = docket_name

        self.cases.append(
            {
                "date": date_string,
                "docket": docket,
                "name": name,
                "url": item.xpath(".//a/@href")[0],
            }
        )
def _process_html(self):
    """Populate self.cases from list items that link to PDF opinions.

    Each item's text is a hyphenated "date - [docket(s)] name" string;
    the docket portion is optional.
    """
    for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
        text = clean_string(item.text_content())
        text_parts = text.split('-', 1)
        if len(text_parts) != 2:
            raise InsanityException('Unexpected text format: "%s"' % text)
        date_part, remainder = text_parts

        # sometimes the records include a docket number(s) as the
        # first words in the second half of the hyphenated string,
        # but some don't include a docket at all. So we test to see
        # if the first word is numeric (minus the slash characters
        # used to conjoin multiple docket numbers).
        tokens = remainder.split(None, 1)
        if tokens[0].replace('/', '').isnumeric():
            docket = tokens[0]
            name = tokens[1]
        else:
            docket = ''
            name = remainder

        self.cases.append({
            'date': date_part,
            'docket': docket,
            'name': name,
            'url': item.xpath('.//a/@href')[0],
        })
def set_table_headers(self, html):
    """Collect the table's <th> strings into self.headers.

    Does nothing when the table is missing entirely; raises
    InsanityException when a required column header is absent.
    """
    # Do nothing if table is missing
    if not html.xpath(self.path_table):
        return
    header_cells = html.xpath('%s//th' % self.path_table)
    self.headers = [cell.text_content().strip() for cell in header_cells]
    # Ensure that expected/required headers are present
    missing = set(self.required_headers) - set(self.headers)
    if missing:
        raise InsanityException('Required table column missing')
def _extract_docket_from_text(cls, text): try: match = re.match(cls.regex, text).group(6) except: raise InsanityException('Unable to parse docket from "%s"' % text) dockets_raw = match.rstrip('.').replace('&', ' ').replace(',', ' ') dockets = dockets_raw.split() return ', '.join(dockets)
def _extract_docket_from_text(cls, text): text = text.strip() try: match = re.match(cls.regex, text).group(6) except: raise InsanityException('Unable to parse docket from "%s"' % text) dockets_raw = match.rstrip(".").replace("&", " ").replace(",", " ") dockets = dockets_raw.split() return ", ".join(dockets)
def return_opinion_path(self):
    """Return the first xpath known to locate opinion listings.

    :raises InsanityException: when no known path matches the page
    """
    candidates = (
        '//select/option[contains(@value, ".pdf")]',
        '//ul/li/a[contains(@href, ".pdf")]',
    )
    for candidate in candidates:
        if self.html.xpath(candidate):
            return candidate
    raise InsanityException("No recognized path to opinion listings")
def extract_year_from_h1(self, html):
    """Extract the four-digit year string from the page's <h1> tag.

    For testability with example files from previous years, we can't use
    the current year in the base_path search, and instead need to extract
    the year from the page's <h1> tag. This is also handy early in the
    calendar year if/when the court is publishing new opinions for the
    end of the previous year.

    :param html: lxml tree of the court's opinion page
    :return: four-character year string (ex: "2019")
    :raises InsanityException: if the extracted year looks invalid, or
        (live scrapes only) is not this year or last year
    """
    year_string = html.xpath('//h1')[0].text_content().split()[3]

    # Basic validation of year
    if len(year_string) != 4 or not year_string.startswith('20'):
        # FIX: removed the stray ")" that unbalanced the error message.
        raise InsanityException(
            'Extracted year "%s" appears to be invalid' % year_string)

    # If running live scrape, year should always be this year or last year
    if self.method != 'LOCAL':
        this_year = datetime.today().year
        if int(year_string) not in [this_year, this_year - 1]:
            raise InsanityException(
                'Year ("%s") too far in the future or past' % year_string)

    return year_string
def _get_anchor_docket_name_pairs(self):
    """The court has some ugly HTML practices that we need to handle.

    Most anchor links include single line strings with a single docket
    number and a single case name. However, there are two other formats
    we've seen and must work around.

    (CASE 1) The anchor has multiple lines broken with <br> tag(s), and
    each line contains "<docket> <name>". In this case we need to
    combine the docket numbers and name strings respectively.
    [EXAMPLE: February 18, 2016 in nh_example_2.html]

    (CASE 2) The anchor has multiple lines broken with <br> tag(s), and
    the second line is a continuation of a long case name started on the
    first line. So, the second line does not lead with a docket number,
    thus this line's string should be glued onto the <name> substring
    extracted from the previous line.
    [EXAMPLE: September 18, 2018 in nh_example_6.html]
    """
    pairs = []
    for anchor in self.html.xpath(self.link_path):
        dockets = []
        name_substrings = []
        anchor_text = anchor.text_content().replace("\n", "")
        for substring in anchor_text.split(";"):
            substring = clean_string(substring)
            match = self.link_text_regex.search(substring)
            if match:
                dockets.append(match.group(1))
                # collapse internal runs of whitespace in the name
                name_substrings.append(" ".join(match.group(2).split()))
            elif not name_substrings:
                # docket and name (root) should be contained in first
                # substring
                raise InsanityException(
                    "Invalid anchor root string format: %s" % substring)
            else:
                # no docket in the substring, its a trailing name substring
                # that they broke over multiple lines, so glue it to the
                # previous name substring
                name_substrings[-1] += " %s" % substring
        pairs.append({
            "docket": ", ".join(dockets),
            "name": " and ".join(name_substrings),
        })
    return pairs
def return_section_path(self):
    """Return the first xpath known to locate opinion sections.

    :raises InsanityException: when no known path matches the page
    """
    candidates = (
        '//div[contains(@class, "panel-default")]',
        '//td[contains(p/@class, "center")]',
        '//td[contains(p/@align, "center")]',
        '//td[contains(h2/@class, "center")]',
        '//div[contains(h3/@class, "center")]',
        '//div[contains(h3/@align, "center")]',
    )
    for candidate in candidates:
        if self.html.xpath(candidate):
            return candidate
    raise InsanityException("No recognized path to opinion sections")
def _download(self, request_dict={}):
    """Download the opinion listing page, driving the ASP.NET form.

    This is another of the cursed MS asp.net pages with damned POST
    parameters like __EVENTVALIDATION. These are near impossible to
    scrape without using Selenium.

    NOTE(review): the mutable default ``request_dict={}`` is shared
    across calls -- safe only as long as no caller mutates it.

    :param request_dict: forwarded to the parent _download in LOCAL mode
    :return: lxml html tree of the results page
    :raises InsanityException: if the requested month is not yet listed
        in the portal's dropdown
    """
    if self.method == 'LOCAL':
        # This is an arbitrary date that we need to set
        # for our compar.json test to pass
        self.case_date = convert_date_string('2017-08-13')
        return super(Site, self)._download(request_dict=request_dict)
    else:
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
            # Without these args, when you get self.url, you'll still be
            # at about:config because the SSL on this site is so terrible.
            service_args=[
                '--ignore-ssl-errors=true',
                '--ssl-protocol=tlsv1'
            ],
        )
        driver.implicitly_wait(30)
        logger.info("Now downloading case page at: %s" % self.url)
        driver.get(self.url)

        # Select the correct drop downs, then submit.
        path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
            type=self.opinion_type)
        driver.find_element_by_xpath(path_to_opinion_type).click()
        path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
            d=self.release_date)
        try:
            driver.find_element_by_xpath(path_to_date).click()
        except NoSuchElementException:
            # This is not uncommon early in the month (or if there are
            # no opinions published in the current month), so failures
            # resulting from this raise can probably be ignored.
            warning = 'Current month (%s) not yet available in portal--common occurrence early in the month.'
            raise InsanityException(warning % self.release_date)
        path_to_submit = "//input[@id='cmdSearch']"
        driver.find_element_by_xpath(path_to_submit).click()

        # Selenium doesn't give us the actual code, we have to hope.
        self.status = 200

        text = self._clean_text(driver.page_source)
        html_tree = html.fromstring(text)
        # Rewrite relative links so they resolve against the portal url.
        html_tree.rewrite_links(fix_links_in_lxml_tree,
                                base_href=self.request['url'])
        return html_tree
def parse_date_from_text(self, text_list):
    """Extract a parenthesized date like "(June 1, 2015)" from text_list.

    Falls back on the previously parsed case's date when no string
    contains a date.

    :param text_list: list of candidate strings
    :return: the date substring (ex: "June 1, 2015")
    :raises InsanityException: if no date is found and there is no
        previous date to fall back on

    FIX: pattern is now a raw string; the original relied on invalid
    escape sequences (\\(, \\w, \\d, \\,) that modern Python flags.
    """
    regex = r"(.*?)(\((\w+\s+\d+\,\s+\d+)\))(.*?)"
    for text in text_list:
        date_match = re.match(regex, text)
        if date_match:
            return date_match.group(3)

    # Fall back on previous case's date
    if self.previous_date:
        return self.previous_date

    raise InsanityException("Could not parse date from string, and no "
                            'previous date to fall back on: "%s"'
                            % text_list)
def return_year_sub_path(self):
    """Return the first sub-xpath that locates the year string inside
    the first opinion section element.

    :raises InsanityException: when no known sub-path matches
    """
    section = self.html.xpath(self.section_path)[0]
    candidates = (
        './div[contains(@class, "panel-heading")]/label',
        './p[contains(@class, "center")]/strong',
        './p[contains(@align, "center")]/font/b',
        './h2[contains(@class, "center")]',
        './h3[contains(@class, "center")]',
        './h3[contains(@align, "center")]',
    )
    for candidate in candidates:
        if section.xpath(candidate):
            return candidate
    raise InsanityException("No recognized path to year string")
def _get_case_dates(self): dates = [] for item in self.html.xpath('//span[@class="feed-item-date"]'): text = item.text_content().strip() words = text.split() if len(words) == 2: date = convert_date_string(words[1]) elif 'ago' in text: # The record was added today "X hours and Y min ago" date = self.today else: raise InsanityException( 'Unrecognized date element string: %s' % text) dates.append(date) return dates
def parse_title(txt):
    """Split an opinion title into its component parts.

    Expected format: "<case name>[, <neutral cite>] (<docket number>)".

    :param txt: raw title string
    :return: tuple of (case_name, neutral_cite, docket_number), where
        neutral_cite is "" when absent or malformed
    :raises InsanityException: if the title cannot be parsed (wraps the
        underlying traceback in the message)
    """
    try:
        name_and_citation = txt.rsplit("(", 1)[0].strip()
        docket_number = (re.search(r"(.*\d).*?",
                                   txt.rsplit("(", 1)[1]).group(0).strip())
        case_name = name_and_citation.rsplit(",", 1)[0].strip()
        try:
            neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
            # Neutral cites (ex: "2015 ME 11") have digits at both ends.
            if not re.search(r"^\d\d.*\d\d$", neutral_cite):
                neutral_cite = ""
        except IndexError:
            # Unable to find comma to split on. No neutral cite.
            neutral_cite = ""
    except Exception:
        # FIX: narrowed from a bare except so KeyboardInterrupt and
        # SystemExit propagate; everything else is wrapped with the
        # traceback for debugging. Regexes are now raw strings.
        raise InsanityException("Unable to parse: %s\n%s"
                                % (txt, traceback.format_exc()))
    return case_name, neutral_cite, docket_number
def _process_html(self):
    """Build self.cases from bolded date headers and their opinion lists.

    Each bolded date header paragraph is followed by a list whose items
    contain docket number(s) and a case name.

    :raises InsanityException: when an opinion item contains no docket
        numbers
    """
    paths = "//p/strong | //p/b | //p/font/strong | //p/font/b"
    for date_element in self.html.xpath(paths):
        string = date_element.xpath("./text()")
        try:
            string = string[0]
            # handle examples where time but no date (ga_example_3.html)
            if ":" in string and ("AM" in string or "PM" in string):
                continue
            # handle legacy example (ga_example.html)
            string = string.split("SUMMARIES")[0]
            date_string = re.sub(r"\W+", " ", string)
            # handle legacy example (ga_example.html)
            if len(date_string.split()) != 3:
                continue
        except IndexError:
            # FIX: narrowed from a bare except; only string[0] on an
            # empty text-node list can fail here.
            continue
        parent = date_element.xpath("./..")[0]
        # handle legacy example (ga_example.html): climb to enclosing <p>
        while parent.tag != "p":
            parent = parent.xpath("./..")[0]
        for item in parent.getnext().xpath("./li"):
            text = item.text_content()
            if text:
                # Extract Docket numbers
                dockets = re.findall(self.regex_docket, text)
                if not dockets:
                    # FIX: original format string was missing the "%"
                    # ("'s'" instead of "'%s'"), which made the %
                    # operator raise TypeError instead of producing the
                    # intended error message.
                    raise InsanityException(
                        "Could not find docket numbers in: '%s'" % text)
                # Extract name substring; I am sure this could
                # be done with a more slick regex, but its not
                # my forte...
                name = text
                for docket in dockets:
                    name = name.replace(docket, "")
                name = name.lstrip(" .,")
                self.cases.append({
                    "date": date_string,
                    "docket": ", ".join(dockets),
                    "name": titlecase(name.lstrip(" .,")),
                    "url": item.xpath(".//a[1]/@href")[0],
                })
def get_absolute_opinion_path(self, suffix, type):
    """Determine the absolute path given the file suffix in the json
    object and the opinion type.

    This is necessary because the court does not return standardized
    data objects.

    :param suffix: file path suffix from the json object
    :param type: opinion type string, either "<type>" or "<status>_<type>"
    :return: absolute url for the opinion file
    :raises InsanityException: if type has more than two parts
    """
    parts = type.lower().split("_")
    if len(parts) == 1:
        # Bare "<type>" with no status prefix.
        status = False
        type = parts[0].lower()
    elif len(parts) == 2:
        # "<status>_<type>" form.
        status = parts[0].lower()
        type = parts[1].lower()
    else:
        raise InsanityException(
            'Unrecognized type "%s", this should never ' "happen" % type)
    if not suffix.startswith(type):
        # Prepend the type (and status, when present) directories.
        if status:
            suffix = "%s/%s/%s" % (type, status, suffix)
        else:
            suffix = "%s/%s" % (type, suffix)
    return "%s/opinions/%s" % (self.base_url, suffix)
def return_url_path(self):
    """Return the url-attribute xpath derived from self.opinion_path.

    :raises InsanityException: when the opinion path type is unrecognized
    """
    if "/option" in self.opinion_path:
        attribute = "@value"
    elif "/li/a" in self.opinion_path:
        attribute = "@href"
    else:
        raise InsanityException("No recognized path to url")
    return "%s/%s" % (self.opinion_path, attribute)
def _check_sanity(self):
    """Check that the objects attributes make sense:
        1. Do all the attributes have the same length?
        1. Do we have any content at all?
        1. Is there a bare minimum of meta data?
        1. Are the dates datetime objects, not strings?
        1. Are any dates from the 22nd century? (01-01-2104)
        1. Are case_names more than just empty whitespace?
        1. Has the `cookies` attribute been normalized to a dict?
        1. ?

    The signature of this method is subject to change as additional
    checks become convenient.

    Inheriting classes should override this method calling super to give
    it the necessary parameters.

    If sanity is OK, no return value. If not, throw InsanityException or
    warnings, as appropriate.
    """
    # Every populated (non-None) attribute must have the same length.
    lengths = {}
    for attr in self._all_attrs:
        if self.__getattribute__(attr) is not None:
            lengths[attr] = len(self.__getattribute__(attr))
    values = list(lengths.values())
    if values.count(values[0]) != len(values):
        # Are all elements equal?
        raise InsanityException(
            "%s: Scraped meta data fields have differing"
            " lengths: %s" % (self.court_id, lengths)
        )
    if len(self.case_names) == 0:
        logger.warning("%s: Returned with zero items." % self.court_id)
    else:
        # When any cases were found, every required field must hold data.
        for field in self._req_attrs:
            if self.__getattribute__(field) is None:
                raise InsanityException(
                    "%s: Required fields do not contain any data: %s"
                    % (self.court_id, field)
                )
        # Case names must be non-blank; report the prior name to help
        # locate the offending row on the source page.
        i = 0
        prior_case_name = None
        for name in self.case_names:
            if not name.strip():
                raise InsanityException(
                    "Item with index %s has an empty case name. The prior "
                    "item had case name of: %s" % (i, prior_case_name)
                )
            prior_case_name = name
            i += 1
    for index, case_date in enumerate(self.case_dates):
        # Dates must be real date objects, not strings.
        if not isinstance(case_date, date):
            raise InsanityException(
                "%s: member of case_dates list not a valid date object. "
                "Instead it is: %s with value: %s"
                % (self.court_id, type(case_date), case_date)
            )
        # Sanitize case date, fix typo of current year if present
        fixed_date = fix_future_year_typo(case_date)
        if fixed_date != case_date:
            logger.info(
                "Date year typo detected. "
                "Converting %s to %s "
                "for case '%s' in %s"
                % (
                    case_date,
                    fixed_date,
                    self.case_names[index],
                    self.court_id,
                )
            )
            case_date = fixed_date
            self.case_dates[index] = fixed_date
        # NOTE(review): hard-coded cutoff year; the docstring's "22nd
        # century" example suggests this guards against absurd future
        # dates -- confirm 2025 is still the intended limit.
        if case_date.year > 2025:
            raise InsanityException(
                "%s: member of case_dates list is from way in the future, "
                "with value %s" % (self.court_id, case_date.year)
            )

    # Is cookies a dict?
    if type(self.cookies) != dict:
        raise InsanityException(
            "self.cookies not set to be a dict by " "scraper."
        )
    logger.info(
        "%s: Successfully found %s items."
        % (self.court_id, len(self.case_names))
    )
def return_url_path(self):
    """Return the url-attribute xpath derived from self.opinion_path.

    :raises InsanityException: when the opinion path type is unrecognized
    """
    path = self.opinion_path
    if '/option' in path:
        return '%s/@value' % path
    if '/li/a' in path:
        return '%s/@href' % path
    raise InsanityException('No recognized path to url')