Example #1
    def extract_date_summary_from_link(self, link):
        # Link should be within a <p> tag directly under <div id='maincontent'>, but
        # occasionally the court forgets to wrap it in a <p>, in which case it sits
        # directly under the <div id='maincontent'>
        container_id = "maincontent"
        parent = link.getparent()
        parents_parent = parent.getparent()
        if "id" in parent.attrib and parent.attrib["id"] == container_id:
            search_root = link
        elif ("id" in parents_parent.attrib
              and parents_parent.attrib["id"] == container_id):
            search_root = parent
        else:
            raise InsanityException(
                'Unrecognized placement of Opinion url on page: "%s"' %
                link.text_content().strip())

        # Find date from bolded header element above link (ex: "5-14-2014 - Opinions" or "5-21-2014 - Orders")
        element_date = search_root.xpath("./preceding-sibling::b")[-1]
        element_date_text = element_date.text_content().strip().lower()
        try:
            date_string = element_date_text.split()[0]
        except IndexError:
            raise InsanityException('Unrecognized bold (date) element: "%s"' %
                                    element_date_text)

        # Find summary from blockquote element below link
        element_blockquote = search_root.xpath(
            "./following-sibling::blockquote")[0]
        summary = element_blockquote.text_content().strip()

        return convert_date_string(date_string), summary
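A minimal sketch of the DOM shape the happy path expects, using lxml. The markup here is hypothetical, modeled on the comment above:

    from lxml import html

    # Common case: link wrapped in a <p> directly under <div id='maincontent'>.
    page = html.fromstring(
        "<div id='maincontent'>"
        "<b>5-14-2014 - Opinions</b>"
        "<p><a href='op.pdf'>Some v. Case</a></p>"
        "<blockquote>Summary of the opinion.</blockquote>"
        "</div>")
    link = page.xpath("//a")[0]
    parent = link.getparent()  # the <p>; its parent carries id='maincontent'
    assert parent.getparent().attrib["id"] == "maincontent"
    # With search_root = parent, the method's xpath calls resolve like so:
    assert parent.xpath("./preceding-sibling::b")[-1].text_content() == "5-14-2014 - Opinions"
    assert parent.xpath("./following-sibling::blockquote")[0].text_content() == "Summary of the opinion."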
Example #2
    def _fetch_case_name(self, case_number):
        """Fetch case name for a given docket number + publication year pair.

        Some resources show 'Public Access Restricted' messages and do not
        provide parseable case name information.  These will be skipped by
        our system by returning False below.  The only other approach would
        be to parse the case name from the raw PDF text itself.
        """

        # If case_number is not expected 12 characters, skip it, since
        # we can't know how to fix the court's typo. They likely forgot
        # to '0' pad the beginning or the end of the 'number' suffix,
        # but we can't know for sure.
        if len(case_number) != 12:
            return False

        # Site has a non-chained, bad certificate; we need to ignore
        # ssl verification for now for the scraper to work
        self.request['verify'] = False

        url = 'https://appellate.kycourts.net/SC/SCDockets/CaseDetails.aspx?cn=%s' % case_number
        html = self._get_html_tree_by_url(url)

        # Halt if there is a (dismissible) error/warning on the page
        path_error_warning = '//div[contains(@class, "alert-dismissible")]'
        if html.xpath(path_error_warning):
            raise InsanityException('Invalid sub-resource url (%s). Is case number (%s) invalid?' % (url, case_number))

        # Ensure that only two substrings are present
        path_party = '//td[@class="party"]/text()'
        parties = html.xpath(path_party)
        if len(parties) != 2:
            raise InsanityException('Unexpected party elements. Expected two substrings, got: %s' % ', '.join(parties))

        return titlecase(' v. '.join(parties))
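The 12-character gate and the final join, with hypothetical case numbers and party strings (the formats are illustrative, not confirmed against the court's site):

    # A well-formed number passes; one missing its '0' padding is skipped.
    assert len("2014-SC-0001") == 12
    assert len("2014-SC-001") != 12

    # The two party cells are joined into a single case name.
    parties = ["John Doe", "Commonwealth of Kentucky"]
    assert " v. ".join(parties) == "John Doe v. Commonwealth of Kentucky"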
Example #3
    def parse_name_from_text(text_list):
        regexes = [
            # Expected format
            r"(.*?)(,?\sNos?\.)(.*?)",
            # Clerk typo, forgot "No."/"Nos." substring
            r"(.*?)(,?\s\d+-\d+(,|\s))(.*?)",
            # Same as above, and there's an unconventional docket number
            # like 'SU-14-324' instead of '14-324'. See ri_p_example_4.html
            r"(.*?)(,?\s(?:\w+-)?\d+-\d+(,|\s))(.*?)",
        ]

        for regex in regexes:
            for text in text_list:
                name_match = re.match(regex, text)
                if name_match:
                    return name_match.group(1)

        # "No."/"Nos." and docket missing, fall back on whatever's before first
        # semi-colon
        for text in text_list:
            if ";" in text:
                return text.split(";")[0]

        raise InsanityException('Could not parse name from string: "%s"' %
                                text_list)
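Hypothetical strings exercising the first and third regex tiers (raw strings, per the fix above):

    import re

    # Expected "No." format
    assert re.match(r"(.*?)(,?\sNos?\.)(.*?)",
                    "State v. Doe, No. 14-324").group(1) == "State v. Doe"
    # Unconventional docket like 'SU-14-324'
    assert re.match(r"(.*?)(,?\s(?:\w+-)?\d+-\d+(,|\s))(.*?)",
                    "State v. Roe, SU-14-324 (2014)").group(1) == "State v. Roe"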
Example #4
    def _post_parse(self):
        """Unfortunately, some of the items do not have audio files despite
        appearing in the table and having a link to a supplementary audio page.

        For these items, we set the download_url to '' and this method finds
        the related information for those items and then removes it from all
        the other attributes for the Site object.
        """
        # Start by checking sanity. This will make sure we don't mess things
        # up. If this sanity check fails, we'll know things were messed up
        # before we began tinkering with them.
        self._check_sanity()

        # Items are purged in two steps. First, we identify the index of the
        # items that need purging.
        purge_indexes = []
        for i, url in enumerate(self.download_urls):
            if not url:
                purge_indexes.append(i)

        # Quick check: We did find *some* urls, right?
        if len(purge_indexes) == len(self.download_urls):
            raise InsanityException("Didn't get any download URLs. Looks like "
                                    "something is wrong in the _post_parse() "
                                    "method.")

        # Second, we purge them, beginning at the end and moving forwards. This
        # ensures that we don't delete the wrong items.
        for index_to_purge in sorted(purge_indexes, reverse=True):
            for attr in self._all_attrs:
                item = getattr(self, attr)
                if item is not None:
                    # If we've added stuff to it, then delete the key.
                    del item[index_to_purge]
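Why the purge runs from the end, as a standalone sketch of index-based deletion:

    # Deleting high indexes first keeps the remaining indexes valid.
    items = ["a", "b", "c", "d"]
    for index_to_purge in sorted([1, 3], reverse=True):
        del items[index_to_purge]
    assert items == ["a", "c"]
    # Forward order would delete index 1 first, shifting later items left,
    # so "del items[3]" would then raise IndexError (or hit the wrong item).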
Example #5
 def _extract_name_from_text(cls, text):
     text = text.strip()
     try:
         match = re.match(cls.regex, text).group(12)
     except (AttributeError, IndexError):
         raise InsanityException('Unable to parse case name from "%s"' % text)
     return match.strip().rstrip('.')
Example #6
    def _process_html(self):
        for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
            text = clean_string(item.text_content())
            date_string = " ".join(text.split()[0:3])
            try:
                convert_date_string(date_string)
            except ValueError:
                raise InsanityException('Unexpected text format: "%s"' % text)
            docket_name = text.replace(date_string, "").strip().lstrip("-")

            # sometimes the records include a docket number(s) as the
            # first words in the second half of the hyphenated string,
            # but some don't include a docket at all.  So we test to see
            # if the first word is numeric (minus the slash characters
            # used to conjoin multiple docket numbers).
            docket, name = docket_name.split(None, 1)
            first_word = docket.replace("/", "")
            if not first_word.isnumeric():
                docket = ""
                name = docket_name

            self.cases.append(
                {
                    "date": date_string,
                    "docket": docket,
                    "name": name,
                    "url": item.xpath(".//a/@href")[0],
                }
            )
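The first-word test above (shared with the next example) on two hypothetical record strings:

    # Conjoined docket numbers lead the second half of the record:
    docket, name = "12345/67890 State v. Doe".split(None, 1)
    assert docket.replace("/", "").isnumeric()

    # No docket at all: the first word is not numeric.
    first = "State v. Roe".split(None, 1)[0]
    assert not first.replace("/", "").isnumeric()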
Example #7
    def _process_html(self):
        for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
            text = clean_string(item.text_content())
            text_parts = text.split('-', 1)

            if len(text_parts) != 2:
                raise InsanityException('Unexpected text format: "%s"' % text)

            # sometimes the records include a docket number(s) as the
            # first words in the second half of the hyphenated string,
            # but some don't include a docket at all.  So we test to see
            # if the first word is numeric (minus the slash characters
            # used to conjoin multiple docket numbers).
            docket_name = text_parts[1].split(None, 1)
            first_word = docket_name[0].replace('/', '')
            if first_word.isnumeric():
                docket = docket_name[0]
                name = docket_name[1]
            else:
                docket = ''
                name = text_parts[1]

            self.cases.append({
                'date': text_parts[0],
                'docket': docket,
                'name': name,
                'url': item.xpath('.//a/@href')[0],
            })
Example #8
 def set_table_headers(self, html):
     # Do nothing if table is missing
     if html.xpath(self.path_table):
         path = '%s//th' % self.path_table
         self.headers = [cell.text_content().strip() for cell in html.xpath(path)]
         # Ensure that expected/required headers are present
         if not set(self.required_headers).issubset(self.headers):
             raise InsanityException('Required table column missing')
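The subset check tolerates extra columns and reordering; hypothetical headers:

    required_headers = ["Date", "Docket", "Case Name"]
    assert set(required_headers).issubset(["Docket", "Date", "Case Name", "Lower Court"])
    assert not set(required_headers).issubset(["Date", "Docket"])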
Example #9
 def _extract_docket_from_text(cls, text):
     try:
         match = re.match(cls.regex, text).group(6)
     except (AttributeError, IndexError):
         raise InsanityException('Unable to parse docket from "%s"' % text)
     dockets_raw = match.rstrip('.').replace('&', ' ').replace(',', ' ')
     dockets = dockets_raw.split()
     return ', '.join(dockets)
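The normalization chain on a hypothetical raw docket capture:

    match = "14-324, 14-325 & 14-326."
    dockets_raw = match.rstrip('.').replace('&', ' ').replace(',', ' ')
    assert ', '.join(dockets_raw.split()) == "14-324, 14-325, 14-326"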
Example #10
 def _extract_docket_from_text(cls, text):
     text = text.strip()
     try:
         match = re.match(cls.regex, text).group(6)
     except (AttributeError, IndexError):
         raise InsanityException('Unable to parse docket from "%s"' % text)
     dockets_raw = match.rstrip(".").replace("&", " ").replace(",", " ")
     dockets = dockets_raw.split()
     return ", ".join(dockets)
Example #11
 def return_opinion_path(self):
     paths = [
         '//select/option[contains(@value, ".pdf")]',
         '//ul/li/a[contains(@href, ".pdf")]',
     ]
     for path in paths:
         if self.html.xpath(path):
             return path
     raise InsanityException("No recognized path to opinion listings")
Example #12
 def extract_year_from_h1(self, html):
     """For testability with example files from previous years,
     we can't use the current year in the base_path search, and
     instead need to extract the year from the pages <h1> tag.
     This is also handy early in the calendar year if/when the
     court is publishing new opinions for the end of the previous
     year
     """
     year_string = html.xpath('//h1')[0].text_content().split()[3]
     # Basic validation of year
     if len(year_string) != 4 or not year_string.startswith('20'):
         raise InsanityException(
             'Extracted year "%s" appears to be invalid' % year_string)
     # If running live scrape, year should always be this year or last year
     if self.method != 'LOCAL':
         this_year = datetime.today().year
         if int(year_string) not in [this_year, this_year - 1]:
             raise InsanityException(
                 'Year ("%s") too far in the future or past' % year_string)
     return year_string
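A hypothetical <h1> in the shape split()[3] expects, with the year as the fourth word:

    from lxml import html

    page = html.fromstring("<div><h1>Opinions Filed in 2019</h1></div>")
    year_string = page.xpath('//h1')[0].text_content().split()[3]
    assert year_string == "2019"
    assert len(year_string) == 4 and year_string.startswith('20')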
Example #13
    def _get_anchor_docket_name_pairs(self):
        """The court has some ugly HTML practices that we need to handle.

        Most anchor links include single line strings with a single docket
        number and a single case name.  However, there are two other formats
        we've seen and must work around.

        (CASE 1)
        The anchor has multiple lines broken with <br> tag(s), and each
        line contains "<docket> <name>". In this case we need to combine
        the docket numbers and name strings respectively.
        [EXAMPLE: February 18, 2016 in nh_example_2.html]

        (CASE 2)
        The anchor has multiple lines broken with <br> tag(s), and the
        second line is a continuation of a long case name started on the first
        line.  So, the second line does not lead with a docket number, thus
        this line's string should be glued onto the <name> substring extracted
        from the previous line.
        [EXAMPLE: September 18, 2018 in nh_example_6.html]
        """
        pairs = []
        for anchor in self.html.xpath(self.link_path):
            i = 0
            dockets = []
            name_substrings = []
            text_anchor = anchor.text_content()
            text_clean = text_anchor.replace("\n", "")

            for text in text_clean.split(";"):
                text = clean_string(text)
                match = self.link_text_regex.search(text)
                try:
                    docket = match.group(1)
                    dockets.append(docket)
                    name = match.group(2)
                    name = " ".join(name.split())
                    name_substrings.append(name)
                    i += 1
                except AttributeError:
                    if i == 0:
                        # docket and name (root) should be contained in first substring
                        error = "Invalid anchor root string format: %s" % text
                        raise InsanityException(error)
                    # no docket in the substring, it's a trailing name substring
                    # that they broke over multiple lines, so glue it to the
                    # previous name substring
                    name_substrings[i - 1] += " %s" % text
            pairs.append({
                "docket": ", ".join(dockets),
                "name": " and ".join(name_substrings),
            })
        return pairs
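A hypothetical stand-in for self.link_text_regex, showing why a CASE 2 continuation line lands in the AttributeError branch:

    import re

    # Hypothetical pattern: leading docket number, then the case name.
    link_text_regex = re.compile(r"(\d{4}-\d{4})\s+(.*)")

    match = link_text_regex.search("2018-0123 Very Long Case Name That The")
    assert match.group(2) == "Very Long Case Name That The"
    # The continuation line has no docket: search() returns None, .group()
    # raises AttributeError, and the text is glued onto the previous name.
    assert link_text_regex.search("Court Broke Over Two Lines") is None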
Example #14
 def return_section_path(self):
     paths = [
         '//div[contains(@class, "panel-default")]',
         '//td[contains(p/@class, "center")]',
         '//td[contains(p/@align, "center")]',
         '//td[contains(h2/@class, "center")]',
         '//div[contains(h3/@class, "center")]',
         '//div[contains(h3/@align, "center")]',
     ]
     for path in paths:
         if self.html.xpath(path):
             return path
     raise InsanityException("No recognized path to opinion sections")
Example #15
    def _download(self, request_dict={}):
        """This is another of the cursed MS asp.net pages with damned POST
          parameters like __EVENTVALIDATION. These are near impossible to
          scrape without using Selenium.
        """
        if self.method == 'LOCAL':
            # This is an arbitrary date that we need to set
            # for our compare.json test to pass
            self.case_date = convert_date_string('2017-08-13')
            return super(Site, self)._download(request_dict=request_dict)
        else:
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
                # Without these args, when you get self.url, you'll still be at
                # about:config because the SSL on this site is so terrible.
                service_args=[
                    '--ignore-ssl-errors=true', '--ssl-protocol=tlsv1'
                ],
            )
            driver.implicitly_wait(30)
            logger.info("Now downloading case page at: %s" % self.url)
            driver.get(self.url)

            # Select the correct drop downs, then submit.
            path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
                type=self.opinion_type)
            driver.find_element_by_xpath(path_to_opinion_type).click()
            path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
                d=self.release_date)

            try:
                driver.find_element_by_xpath(path_to_date).click()
            except NoSuchElementException:
                # This is not uncommon early in the month (or if there are
                # no opinions published in the current month), so failures
                # resulting from this raise can probably be ignored.
                warning = 'Current month (%s) not yet available in portal--common occurrence early in the month.'
                raise InsanityException(warning % self.release_date)

            path_to_submit = "//input[@id='cmdSearch']"
            driver.find_element_by_xpath(path_to_submit).click()

            # Selenium doesn't give us the actual code, we have to hope.
            self.status = 200

            text = self._clean_text(driver.page_source)
            html_tree = html.fromstring(text)
            html_tree.rewrite_links(fix_links_in_lxml_tree,
                                    base_href=self.request['url'])
        return html_tree
Example #16
    def parse_date_from_text(self, text_list):
        regex = "(.*?)(\((\w+\s+\d+\,\s+\d+)\))(.*?)"
        for text in text_list:
            date_match = re.match(regex, text)
            if date_match:
                return date_match.group(3)

        # Fall back on previous case's date
        if self.previous_date:
            return self.previous_date

        raise InsanityException("Could not parse date from string, and no "
                                'previous date to fall back on: "%s"' %
                                text_list)
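Group 3 isolates the parenthesized date; a hypothetical input:

    import re

    match = re.match(r"(.*?)(\((\w+\s+\d+,\s+\d+)\))(.*?)",
                     "Doe v. Roe (May 21, 2014) Opinion")
    assert match.group(3) == "May 21, 2014"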
Example #17
 def return_year_sub_path(self):
     parent = self.html.xpath(self.section_path)[0]
     paths = [
         './div[contains(@class, "panel-heading")]/label',
         './p[contains(@class, "center")]/strong',
         './p[contains(@align, "center")]/font/b',
         './h2[contains(@class, "center")]',
         './h3[contains(@class, "center")]',
         './h3[contains(@align, "center")]',
     ]
     for path in paths:
         if parent.xpath(path):
             return path
     raise InsanityException("No recognized path to year string")
Example #18
 def _get_case_dates(self):
     dates = []
     for item in self.html.xpath('//span[@class="feed-item-date"]'):
         text = item.text_content().strip()
         words = text.split()
         if len(words) == 2:
             date = convert_date_string(words[1])
         elif 'ago' in text:
             # The record was added today "X hours and Y min ago"
             date = self.today
         else:
             raise InsanityException(
                 'Unrecognized date element string: %s' % text)
         dates.append(date)
     return dates
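The two feed-item shapes the branches above handle, as hypothetical strings:

    # Two words: words[1] is the date to convert.
    words = "Added 05-21-2014".split()
    assert len(words) == 2 and words[1] == "05-21-2014"

    # Same-day items carry a relative timestamp instead of a date.
    assert 'ago' in "3 hours and 12 min ago"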
Example #19
 def parse_title(txt):
     try:
         name_and_citation = txt.rsplit("(", 1)[0].strip()
         docket_number = (re.search(r"(.*\d).*?",
                                    txt.rsplit("(", 1)[1]).group(0).strip())
         case_name = name_and_citation.rsplit(",", 1)[0].strip()
         try:
             neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
             if not re.search(r"^\d\d.*\d\d$", neutral_cite):
                 neutral_cite = ""
         except IndexError:
             # Unable to find comma to split on. No neutral cite.
             neutral_cite = ""
     except Exception:
         raise InsanityException("Unable to parse: %s\n%s" %
                                 (txt, traceback.format_exc()))
     return case_name, neutral_cite, docket_number
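Walking the split/regex steps above on a hypothetical title string:

    import re

    txt = "State v. Doe, 2014 RI 12 (No. 14-324)"
    name_and_citation = txt.rsplit("(", 1)[0].strip()
    docket_number = re.search(r"(.*\d).*?", txt.rsplit("(", 1)[1]).group(0).strip()
    case_name = name_and_citation.rsplit(",", 1)[0].strip()
    neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
    assert (case_name, neutral_cite, docket_number) == ("State v. Doe", "2014 RI 12", "No. 14-324")
    assert re.search(r"^\d\d.*\d\d$", neutral_cite)  # plausible neutral cite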
Example #20
    def _process_html(self):
        paths = "//p/strong | //p/b | //p/font/strong | //p/font/b"
        for date_element in self.html.xpath(paths):
            string = date_element.xpath("./text()")
            try:
                string = string[0]
                # handle examples where time but no date (ga_example_3.html)
                if ":" in string and ("AM" in string or "PM" in string):
                    continue
                # handle legacy example (ga_example.html)
                string = string.split("SUMMARIES")[0]
                date_string = re.sub(r"\W+", " ", string)
                # handle legacy example (ga_example.html)
                if len(date_string.split()) != 3:
                    continue
            except IndexError:
                continue
            parent = date_element.xpath("./..")[0]
            # handle legacy example (ga_example.html)
            while parent.tag != "p":
                parent = parent.xpath("./..")[0]
            for item in parent.getnext().xpath("./li"):
                text = item.text_content()
                if text:
                    # Extract Docket numbers
                    dockets = re.findall(self.regex_docket, text)
                    if not dockets:
                        raise InsanityException(
                            "Could not find docket numbers in: '%s'" % text)

                    # Extract name substring; I am sure this could
                    # be done with a slicker regex, but it's not
                    # my forte...
                    name = text
                    for docket in dockets:
                        name = name.replace(docket, "")
                    name = name.lstrip(" .,")

                    self.cases.append({
                        "date": date_string,
                        "docket": ", ".join(dockets),
                        "name": titlecase(name.lstrip(" .,")),
                        "url": item.xpath(".//a[1]/@href")[0],
                    })
Example #21
 def get_absolute_opinion_path(self, suffix, type):
     """Determine the absolute path given the file suffix in the
     json object and the opinion type.  This is necessary because
     the court does not return standardized data objects.
     """
     type_parts = type.lower().split("_")
     type_parts_length = len(type_parts)
     if type_parts_length == 1:
         status = False
         type = type_parts[0].lower()
     elif type_parts_length == 2:
         status = type_parts[0].lower()
         type = type_parts[1].lower()
     else:
         raise InsanityException(
             'Unrecognized type "%s", this should never '
             "happen" % type)
     if not suffix.startswith(type):
         if status:
             suffix = "%s/%s/%s" % (type, status, suffix)
         else:
             suffix = "%s/%s" % (type, suffix)
     return "%s/opinions/%s" % (self.base_url, suffix)
Example #22
 def return_url_path(self):
     if "/option" in self.opinion_path:
         return "%s/@value" % self.opinion_path
     elif "/li/a" in self.opinion_path:
         return "%s/@href" % self.opinion_path
     raise InsanityException("No recognized path to url")
Example #23
    def _check_sanity(self):
        """Check that the objects attributes make sense:
            1. Do all the attributes have the same length?
            2. Do we have any content at all?
            3. Is there a bare minimum of meta data?
            4. Are the dates datetime objects, not strings?
            5. Are any dates from the 22nd century? (01-01-2104)
            6. Are case_names more than just empty whitespace?
            7. Has the `cookies` attribute been normalized to a dict?
            8. ?

        The signature of this method is subject to change as additional checks
        become convenient.

        Inheriting classes should override this method, calling super() to
        pass the necessary parameters.

        If sanity is OK, no return value. If not, throw InsanityException or
        warnings, as appropriate.
        """
        lengths = {}
        for attr in self._all_attrs:
            if self.__getattribute__(attr) is not None:
                lengths[attr] = len(self.__getattribute__(attr))
        values = list(lengths.values())
        # Are all elements equal?
        if values.count(values[0]) != len(values):
            raise InsanityException(
                "%s: Scraped meta data fields have differing"
                " lengths: %s" % (self.court_id, lengths)
            )
        if len(self.case_names) == 0:
            logger.warning("%s: Returned with zero items." % self.court_id)
        else:
            for field in self._req_attrs:
                if self.__getattribute__(field) is None:
                    raise InsanityException(
                        "%s: Required fields do not contain any data: %s"
                        % (self.court_id, field)
                    )
            i = 0
            prior_case_name = None
            for name in self.case_names:
                if not name.strip():
                    raise InsanityException(
                        "Item with index %s has an empty case name. The prior "
                        "item had case name of: %s" % (i, prior_case_name)
                    )
                prior_case_name = name
                i += 1

        for index, case_date in enumerate(self.case_dates):
            if not isinstance(case_date, date):
                raise InsanityException(
                    "%s: member of case_dates list not a valid date object. "
                    "Instead it is: %s with value: %s"
                    % (self.court_id, type(case_date), case_date)
                )
            # Sanitize case date, fix typo of current year if present
            fixed_date = fix_future_year_typo(case_date)
            if fixed_date != case_date:
                logger.info(
                    "Date year typo detected. Converting %s to %s "
                    "for case '%s' in %s"
                    % (
                        case_date,
                        fixed_date,
                        self.case_names[index],
                        self.court_id,
                    )
                )
                case_date = fixed_date
                self.case_dates[index] = fixed_date
            if case_date.year > 2025:
                raise InsanityException(
                    "%s: member of case_dates list is from way in the future, "
                    "with value %s" % (self.court_id, case_date.year)
                )

        # Is cookies a dict?
        if not isinstance(self.cookies, dict):
            raise InsanityException(
                "self.cookies not set to be a dict by scraper."
            )
        logger.info(
            "%s: Successfully found %s items."
            % (self.court_id, len(self.case_names))
        )
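The differing-lengths check in isolation, with hypothetical attribute lengths:

    lengths = {"case_names": 3, "case_dates": 3, "download_urls": 2}
    values = list(lengths.values())
    assert values.count(values[0]) != len(values)  # one attribute differs: insane

    lengths["download_urls"] = 3
    values = list(lengths.values())
    assert values.count(values[0]) == len(values)  # all equal: sane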
Example #24
 def return_url_path(self):
     if '/option' in self.opinion_path:
         return '%s/@value' % self.opinion_path
     elif '/li/a' in self.opinion_path:
         return '%s/@href' % self.opinion_path
     raise InsanityException('No recognized path to url')