Python get_html_parsed_text示例

编程语言: Python

命名空间/包名称: juriscraper.lib.html_utils

方法/功能: get_html_parsed_text

hotexamples.com的示例: 8

Python get_html_parsed_text - 已找到8个示例。这些是从开源项目中提取的、最受好评的juriscraper.lib.html_utils.get_html_parsed_text真实Python示例。您可以对示例进行评价，以帮助我们提高示例质量。

示例#1

显示文件

文件： free_documents.py 项目： cgruppioni/juriscraper

 def parse(responses):
     """Collect the opinion rows found in a batch of report responses.

     Every response is checked for HTTP errors, has its encoding set, and is
     then cleaned, parsed and link-rewritten. Responses whose "Total number
     of opinions reported" value is zero are skipped. For the rest, every row
     after the first of the first table becomes a FreeOpinionRow.

     :param responses: An iterable of response objects exposing
     raise_for_status(), url and text.
     :return: A list of FreeOpinionRow objects accumulated across all
     responses.
     """
     parsed_rows = []
     court_id = "Court not yet set."
     count_path = ('//b[contains(text(), "Total number of '
                   'opinions reported")]')
     for response in responses:
         response.raise_for_status()
         court_id = get_court_id_from_url(response.url)
         set_response_encoding(response)
         tree = get_html_parsed_text(clean_html(response.text))
         tree.rewrite_links(fix_links_in_lxml_tree, base_href=response.url)
         # The count is the text that trails the bold label.
         count_label = tree.xpath(count_path)[0]
         if int(count_label.tail) == 0:
             continue
         for tr in tree.xpath('(//table)[1]//tr[position() > 1]'):
             # Each row is handed the previously collected row, or an empty
             # dict when nothing has been collected yet.
             previous = parsed_rows[-1] if parsed_rows else {}
             parsed_rows.append(FreeOpinionRow(tr, previous, court_id))
     # court_id here is the one from the last response iterated over.
     logger.info("Parsed %s results from written opinions report at %s" %
                 (len(parsed_rows), court_id))
     return parsed_rows

示例#2

显示文件

文件： test_pacer.py 项目： janderse/juriscraper

    def _count_rows(html):
        """Return how many data rows the docket report contains.

        :param html: The HTML of the docket report.
        :return: The number of matched table rows, less one for the header.
        """
        docket_tree = get_html_parsed_text(html)
        # Only tables with a direct row holding a third cell are considered.
        matched_rows = docket_tree.xpath('//table[./tr/td[3]]/tr')
        data_row_count = len(matched_rows) - 1  # Header row is not counted
        return data_row_count

示例#3

显示文件

    def _count_rows(html):
        """Return how many data rows the docket report contains.

        :param html: The HTML of the docket report.
        :return: The number of matched table rows, less one for the header.
        """
        docket_tree = get_html_parsed_text(html)
        # Only tables with a direct row holding a third cell are considered.
        matched_rows = docket_tree.xpath('//table[./tr/td[3]]/tr')
        data_row_count = len(matched_rows) - 1  # Header row is not counted
        return data_row_count

示例#4

显示文件

    def _make_html_tree(self, text):
        """Turn page text into an HTML tree; a hook for custom parsers.

        The default implementation delegates to get_html_parsed_text. This
        method can be overridden to use a different parser (for example,
        returning get_html5_parsed_text(text), or a BeautifulSoup-based
        tree) or to run custom parsing logic.
        """
        tree = get_html_parsed_text(text)
        return tree

示例#5

显示文件

文件： AbstractSite.py 项目： freelawproject/juriscraper

    def _make_html_tree(self, text):
        """Turn page text into an HTML tree; a hook for custom parsers.

        The default implementation delegates to get_html_parsed_text. This
        method can be overridden to use a different parser (for example,
        returning get_html5_parsed_text(text), or a BeautifulSoup-based
        tree) or to run custom parsing logic.
        """
        tree = get_html_parsed_text(text)
        return tree

示例#6

显示文件

文件： free_documents.py 项目： cgruppioni/juriscraper

    def download_pdf(self, pacer_case_id, pacer_document_number):
        """Download a PDF from PACER.

        Note that this doesn't support attachments yet.

        :param pacer_case_id: The case ID, sent as the 'caseid' parameter.
        :param pacer_document_number: The document number used to build the
        doc1 URL.
        :return: The first response when it is itself a PDF. Otherwise the
        response obtained from the iframe's src URL (returned whether or not
        it turns out to be a PDF), or None when the page has no iframe src.
        :raises: Whatever raise_for_status() raises for the first request.
        """
        timeout = (60, 300)
        url = make_doc1_url(self.court_id, pacer_document_number, True)
        data = {
            'caseid': pacer_case_id,
            'got_receipt': '1',
        }

        logger.info("GETting PDF at URL: %s with params: %s" % (url, data))
        r = self.session.get(url, params=data, timeout=timeout)

        # The request above sometimes generates an HTML page with an iframe
        # containing the PDF, and other times returns the PDF. Our task is thus
        # to either get the src of the iframe and download the PDF or just
        # return the pdf.
        r.raise_for_status()
        if is_pdf(r):
            # Log the case ID and URL in the slots the message names; the
            # previous arguments (url, data) did not match the placeholders.
            logger.info('Got PDF binary data for case %s at: %s' %
                        (pacer_case_id, url))
            return r

        text = clean_html(r.text)
        tree = get_html_parsed_text(text)
        tree.rewrite_links(fix_links_in_lxml_tree, base_href=r.url)
        try:
            iframe_src = tree.xpath('//iframe/@src')[0]
        except IndexError:
            # No iframe at all: distinguish PDF content inlined in the HTML
            # from a page that simply has nothing to download.
            if 'pdf:Producer' in text:
                logger.error("Unable to download PDF. PDF content was placed "
                             "directly in HTML. URL: %s, caseid: %s" %
                             (url, pacer_case_id))
            else:
                logger.error(
                    "Unable to download PDF. PDF not served as binary "
                    "data and unable to find iframe src attribute. "
                    "URL: %s, caseid: %s" % (url, pacer_case_id))
            return None

        r = self.session.get(iframe_src, timeout=timeout)
        if is_pdf(r):
            # Same placeholder fix as above: case ID first, then the URL
            # that was actually fetched.
            logger.info('Got iframed PDF data for case %s at: %s' %
                        (pacer_case_id, iframe_src))

        return r

示例#7

显示文件

 def _get_subpage_html_by_page(self, page):
     """Parse the HTML held as text inside the page's PostContent textarea.

     :param page: A tree supporting xpath() that contains a textarea with
     id 'PostContent'.
     :return: The result of get_html_parsed_text on the textarea's text.
     """
     textareas = page.xpath(".//textarea[@id='PostContent']")
     # Only the first matching textarea is used.
     embedded_markup = textareas[0].text_content()
     return get_html_parsed_text(embedded_markup)

示例#8

显示文件

 def get_page(self) -> WebElement:
     """Return the webdriver's current page source as a parsed HTML tree.

     The source is cleaned and parsed, then its links are rewritten with
     fix_links_but_keep_anchors using self.url as the base href.
     """
     cleaned_source = clean_html(self.webdriver.page_source)
     tree = get_html_parsed_text(cleaned_source)
     tree.rewrite_links(fix_links_but_keep_anchors, base_href=self.url)
     return tree