Пример #1
0
 def parse(responses):
     """Turn written opinion report responses into FreeOpinionRow objects.

     Each response is checked for an HTTP error, cleaned, parsed, and has
     its links rewritten against the response URL. Responses that report
     zero opinions are skipped.

     :param responses: An iterable of response objects exposing
     ``raise_for_status()``, ``url`` and ``text``.
     :return: A list with one FreeOpinionRow per table row, accumulated
     across all of the responses.
     """
     court_id = "Court not yet set."
     parsed_rows = []
     for resp in responses:
         resp.raise_for_status()
         court_id = get_court_id_from_url(resp.url)
         set_response_encoding(resp)
         tree = get_html_parsed_text(clean_html(resp.text))
         tree.rewrite_links(fix_links_in_lxml_tree, base_href=resp.url)

         # The count is the text that trails the bold label on the page.
         count_label = tree.xpath(
             '//b[contains(text(), "Total number of opinions reported")]'
         )[0]
         if int(count_label.tail) == 0:
             continue

         # position() > 1 skips the first row of the first table.
         for tr in tree.xpath('(//table)[1]//tr[position() > 1]'):
             # Every row after the very first is handed the row parsed just
             # before it; this carries over from one response to the next.
             previous = parsed_rows[-1] if parsed_rows else {}
             parsed_rows.append(FreeOpinionRow(tr, previous, court_id))

     logger.info("Parsed %s results from written opinions report at %s" %
                 (len(parsed_rows), court_id))
     return parsed_rows
Пример #2
0
    def download_pdf(self, pacer_case_id, pacer_document_number):
        """Download a PDF from PACER.

        Note that this doesn't support attachments yet.

        :param pacer_case_id: Sent as the ``caseid`` query parameter.
        :param pacer_document_number: Passed to make_doc1_url along with
        self.court_id to build the URL that is requested.
        :return: The response holding the PDF when it is served directly;
        otherwise the response for the iframe's src (returned even when it
        does not look like a PDF); or None when the page is not a PDF and
        has no iframe.
        :raises: Whatever ``raise_for_status()`` raises for the first
        response. The iframe request is not status-checked.
        """
        timeout = (60, 300)
        url = make_doc1_url(self.court_id, pacer_document_number, True)
        data = {
            'caseid': pacer_case_id,
            'got_receipt': '1',
        }

        logger.info("GETting PDF at URL: %s with params: %s", url, data)
        r = self.session.get(url, params=data, timeout=timeout)

        # The request above sometimes generates an HTML page with an iframe
        # containing the PDF, and other times returns the PDF. Our task is thus
        # to either get the src of the iframe and download the PDF or just
        # return the pdf.
        r.raise_for_status()
        if is_pdf(r):
            # Log the case ID and the URL so that the values match the
            # "for case ... at: ..." wording of the message.
            logger.info('Got PDF binary data for case %s at: %s',
                        pacer_case_id, url)
            return r

        text = clean_html(r.text)
        tree = get_html_parsed_text(text)
        tree.rewrite_links(fix_links_in_lxml_tree, base_href=r.url)
        try:
            iframe_src = tree.xpath('//iframe/@src')[0]
        except IndexError:
            if 'pdf:Producer' in text:
                logger.error("Unable to download PDF. PDF content was placed "
                             "directly in HTML. URL: %s, caseid: %s",
                             url, pacer_case_id)
            else:
                logger.error(
                    "Unable to download PDF. PDF not served as binary "
                    "data and unable to find iframe src attribute. "
                    "URL: %s, caseid: %s", url, pacer_case_id)
            return None

        r = self.session.get(iframe_src, timeout=timeout)
        if is_pdf(r):
            logger.info('Got iframed PDF data for case %s at: %s',
                        pacer_case_id, iframe_src)

        # Returned whether or not it looks like a PDF; callers must check.
        return r
Пример #3
0
 def get_page(self) -> WebElement:
     """Return the webdriver's current page source as a parsed tree.

     The source is cleaned and parsed, then its links are rewritten with
     fix_links_but_keep_anchors using self.url as the base href.
     """
     cleaned_source = clean_html(self.webdriver.page_source)
     tree = get_html_parsed_text(cleaned_source)
     tree.rewrite_links(fix_links_but_keep_anchors, base_href=self.url)
     return tree
Пример #4
0
 def _clean_text(self, text):
     """Clean raw text before it is used; subclasses may override this.

     The default implementation delegates to clean_html.
     """
     cleaned = clean_html(text)
     return cleaned
Пример #5
0
 def _clean_text(self, text):
     """Overridable hook that returns a cleaned copy of ``text``.

     By default the text is passed through clean_html unchanged otherwise.
     """
     cleaned_text = clean_html(text)
     return cleaned_text