Exemplo n.º 1
0
    def _process_html(self):
        """Parse the scraped HTML and populate ``self.cases``.

        Finds every anchor linking to a PDF, then extracts the case name,
        date, precedential status, docket number and summary from the text
        surrounding each link. Entries are keyed by docket number because
        the page sometimes lists the same case more than once.

        :raises ParsingException: if the date, docket number or summary
            cannot be extracted from a row's text.
        """
        path = '//a[contains(./@href, ".pdf")]'
        for link in self.html.xpath(path):
            if len(link.text.strip()) == 0:
                # There seems to be some cases that are pdf-linked more
                # than once, with the second link blank.
                continue
            # Text of case info apart from name.
            utext = link.getparent().text_content()[len(link.text):]
            # Keep this a str — a bytes object would make the message
            # concatenations below raise TypeError. Non-ASCII chars
            # become "?".
            text = utext.encode("ascii", "replace").decode("ascii")
            # Date, e.g. "(January 1, 2020)" or "(2020)".
            # Raw strings so the regex escapes are not mangled.
            date = ""
            date_match = re.search(
                r"(\([a-zA-Z]+[^\w]{0,3}\d\d?. \d{4}\))|(\(\d{4}\))", utext)
            if date_match is not None:
                date = date_match.group(0).strip("()")
            else:
                raise ParsingException("Date regexp failed on text" + text)
            # Precedential status comes from the preceding section header.
            header = link.getparent().getparent().getprevious().text
            precedential = 'Unpublished'
            if 'Precedential' in header:
                precedential = 'Published'
            # Docket numbers.
            docket_match = re.search(
                r"(\d{2,4}-\d{4,6})|(\d{3},[0-9|A-Z]{3})|(\dx{2},x{3})",
                utext)
            docket = ""
            paper_match = re.search(r"Paper \d+", utext)
            if docket_match is not None:
                docket = docket_match.group(0)
                if paper_match is not None:
                    docket = "{} ({})".format(docket, paper_match.group(0))
            else:
                raise ParsingException("Docket No. Regexp failed on text" +
                                       text)
            # Summary.
            # Use unicode, there are a lot of section codes.
            summary_match = re.search(r"\[.*\]", utext)
            summary = ""
            if summary_match is not None:
                summary = summary_match.group(0).strip("[]")
            else:
                # Previously this exception was instantiated but never
                # raised, silently discarding the error.
                raise ParsingException(
                    "Summary regexp failed on text {}".format(text))

            # We sometimes have repeats on this page, so dict
            self.cases[docket] = {
                "name": link.text,
                "url": link.get('href'),
                "date": convert_date_string(date),
                "precedential": precedential,
                "docket": docket,
                "summary": summary
            }
Exemplo n.º 2
0
def fetch_docket_by_pacer_case_id(
    session, court_id, pacer_case_id, fq,
):
    """Pull a docket from PACER by its pacer_case_id and merge it into CL.

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with information about the docket and the new data
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    report.query(pacer_case_id, **get_fq_docket_kwargs(fq))

    data = report.data
    if not data:
        raise ParsingException("No data found in docket report.")

    # Prefer the docket the fetch queue already points at; otherwise
    # look one up (taking the oldest when several candidates match).
    if not fq.docket_id:
        docket, match_count = find_docket_object(
            court_id, pacer_case_id, data["docket_number"]
        )
        if match_count > 1:
            docket = docket.earliest("date_created")
    else:
        docket = Docket.objects.get(pk=fq.docket_id)

    new_rds, updated = merge_pacer_docket_into_cl_docket(
        docket, pacer_case_id, data, report, appellate=False,
    )
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(new_rds or updated),
    }
Exemplo n.º 3
0
 def get_int_from_details(self, node):
     """Read the text content of *node* from the case details as an int.

     :param node: XPath node name to read from ``self.case_details``.
     :raises ParsingException: when the node's text is not an integer.
     """
     raw_value = self.case_details.xpath('%s/text()' % node)[0].strip()
     try:
         parsed = int(raw_value)
     except ValueError:
         # Text wasn't numeric; log it and surface a parse error.
         logger.debug("Couldn't get int for node %s" % node)
         raise ParsingException("Cannot extract int for node %s" % node)
     return parsed
Exemplo n.º 4
0
 def get_court(self):
     """Map the court element of the case XML to a Court object.

     :raises ParsingException: if no Court matches the extracted string.
     """
     court_code = self.case_details.xpath('court/text()')[0].strip()
     try:
         return Court.objects.get(pk=map_pacer_to_cl_id(court_code))
     except Court.DoesNotExist:
         raise ParsingException("Unable to identify court: %s" % court_code)