Python Blockifier示例，extractnet.Blockifier Python示例

示例#1

0

显示文件

 def test_small_doc(self):
     """Blockifying then featurizing a tiny document must raise ``ValueError``.

     Two inputs are exercised: an empty ``<html>`` shell and a document
     holding only a one-word ``<p>`` and a one-word ``<div>``.
     """
     extractor = KohlschuetterFeatures()
     tiny_documents = (
         "<html></html>",
         "<html> <p>a</p> <div>b</div> </html>",
     )
     for markup in tiny_documents:
         with pytest.raises(ValueError):
             extractor.transform(Blockifier.blockify(markup))

示例#2

0

显示文件

    def test_lxml_error(self):
        """Check that unparseable input surfaces as ``BlockifyError``.

        Two situations are covered: input for which parsing raises, and
        input for which lxml hands back ``None`` instead of a tree.
        """
        # empty input: parsing raises an error
        with pytest.raises(BlockifyError):
            Blockifier.blockify("")

        # unterminated comment: lxml's recovering HTML parser returns None
        unterminated_comment = "<!--"
        lenient_parser = etree.HTMLParser(recover=True)
        parsed_tree = etree.fromstring(unterminated_comment, lenient_parser)
        assert parsed_tree is None
        with pytest.raises(BlockifyError):
            Blockifier.blockify("<!--")

示例#3

0

显示文件

 def test_big_html(self, html):
     """Blockify the ``html`` argument and verify all eleven expected blocks.

     For every block the test checks the text tokens, the tokens that sit
     inside links, and the ``class`` / ``id`` css tokens.
     """
     blocks = Blockifier.blockify(html)

     # one phrase per expected block; whitespace-splitting yields its tokens
     block_phrases = [
         'Inside the h1 tag',
         'First line of the content in bold',
         'A paragraph with a link and some additional words.',
         'Second paragraph Insert a block quote here',
         'Some more text after the image',
         'An h2 tag just for kicks',
         'Finally more text at the end of the content',
         'This is a comment',
         'with two paragraphs and some comment spam',
         'Second comment',
         'Footer text',
     ]
     block_output_tokens(blocks, [phrase.split() for phrase in block_phrases])

     # only the third and ninth blocks are expected to carry link tokens
     expected_links = [[] for _ in block_phrases]
     expected_links[2] = ['a', 'link']
     expected_links[8] = ['and', 'some', 'comment', 'spam']
     link_output_tokens(blocks, expected_links)

     # a block without the attribute is expected to report a single '' token
     expected_classes = (
         [[''], ['title'], ['link']]
         + [[''] for _ in range(7)]
         + [['footer']]
     )
     css_output_tokens(blocks, 'class', expected_classes)

     expected_ids = [[''], ['content'], ['para']] + [[''] for _ in range(8)]
     css_output_tokens(blocks, 'id', expected_ids)

示例#4

0

显示文件

 def test_empty_blocks(self):
     """Only two blocks are expected from markup whose ``<p>`` holds just ``! _``."""
     markup = """<div> .! </div>
             some text
            <h1> in an h1 </h1>
            <p> ! _ </p>
         """
     expected_tokens = [
         ['.!', 'some', 'text'],
         ['in', 'an', 'h1'],
     ]
     block_output_tokens(Blockifier.blockify(markup), expected_tokens)

示例#5

0

显示文件

 def test_very_simple(self):
     """A single ``<div>`` yields one block and the ``<script>`` text is left out."""
     markup = """<div>some text
                 <script> skip this </script>
                 more text here
            </div>"""
     expected_block = 'some text more text here'.split()
     block_output_tokens(Blockifier.blockify(markup), [expected_block])

示例#6

0

显示文件

 def test_callback(self):
     """Check that ``blockify`` accepts a ``parse_callback`` and that it takes effect.

     ``self.count_divs`` is handed to ``blockify`` as ``parse_callback``;
     afterwards ``div_count`` is expected to equal 2.
     NOTE(review): ``count_divs`` and ``div_count`` are defined outside
     this method -- presumably the callback stores the number of ``<div>``
     elements it sees in ``div_count``; confirm against the class body.
     """
     s = """<div>some text <i>in italic</i> and something else
                 <pre> <div>skip this</div> </pre>
                 <b>bold stuff</b> after the script
            </div>"""
     # Only the callback's side effect matters here, so the returned blocks
     # are deliberately not kept.
     Blockifier.blockify(s, parse_callback=self.count_divs)
     assert self.div_count == 2

示例#7

0

显示文件

 def test_very_simple2(self):
     """Inline ``<i>``/``<b>`` text joins one block; ``<script>`` content is left out."""
     markup = """<div>some text <i>in italic</i> and something else
                 <script> <div>skip this</div> </script>
                 <b>bold stuff</b> after the script
            </div>"""
     expected_block = (
         'some text in italic and something else '
         'bold stuff after the script'
     ).split()
     block_output_tokens(Blockifier.blockify(markup), [expected_block])

示例#8

0

显示文件

 def test_simple_two_blocks(self):
     """An ``<h1>`` plus trailing text and a following ``<div>`` give two blocks."""
     markup = """<h1>A title <i>with italics</i> and other words</h1>
            some text outside the h1
            <div>a div <span class="test"> with a span </span> more </div>"""
     # the text after the closing </h1> is expected to land in the first block
     heading_block = (
         'A title with italics and other words '
         'some text outside the h1'
     ).split()
     div_block = 'a div with a span more'.split()
     block_output_tokens(Blockifier.blockify(markup), [heading_block, div_block])

示例#9

0

显示文件

 def test_class_id_unicode(self):
     """Bytes input with a non-ASCII ``class`` value keeps that value as a css token.

     The expected css values are lowercase (``HEADER`` -> ``header``) and
     the ``CLASS`` attribute is written in uppercase in the markup.
     """
     raw_markup = b"""<div CLASS=' class1 \xc2\xae'>text in div
             <h1 id="HEADER">header</h1>
             </div>"""
     parsed = Blockifier.blockify(raw_markup, encoding='utf-8')

     expected_tokens = [['text', 'in', 'div'], ['header']]
     block_output_tokens(parsed, expected_tokens)

     expected_ids = [[''], ['header']]
     css_output_tokens(parsed, 'id', expected_ids)

     # b'\xc2\xae' is the UTF-8 byte sequence used in the class attribute above
     expected_classes = [['class1', str_cast(b'\xc2\xae')], ['']]
     css_output_tokens(parsed, 'class', expected_classes)

示例#10

0

显示文件

 def test_all_non_english(self):
     """Three nested ``<div>`` blocks of non-ASCII text tokenize one block each.

     The middle block's text sits inside an ``<a>`` and is therefore also
     expected as that block's link tokens.
     """
     markup = u"""<div> <div> \u03b4\u03bf\u03b3 </div> <div> <a href="summer">\xe9t\xe9</a> </div>
      <div> \u62a5\u9053\u4e00\u51fa </div> </div>"""
     first_word = u'\u03b4\u03bf\u03b3'
     linked_word = u'\xe9t\xe9'
     last_word = u'\u62a5\u9053\u4e00\u51fa'

     parsed = Blockifier.blockify(markup)
     block_output_tokens(parsed, [[first_word], [linked_word], [last_word]])
     link_output_tokens(parsed, [[], [linked_word], []])

示例#11

0

显示文件

 def test_class_id(self):
     """Each block reports its own ``class`` and ``id`` css tokens.

     A block lacking the attribute is expected to report ``['']``; the
     ``HEADER`` id is expected back in lowercase.
     """
     markup = """<div CLASS='d1'>text in div
             <h1 id="HEADER">header</h1>
             <div class="nested">dragnet</div>
             </div>"""
     parsed = Blockifier.blockify(markup)

     expected_tokens = [['text', 'in', 'div'], ['header'], ['dragnet']]
     expected_ids = [[''], ['header'], ['']]
     expected_classes = [['d1'], [''], ['nested']]

     block_output_tokens(parsed, expected_tokens)
     css_output_tokens(parsed, 'id', expected_ids)
     css_output_tokens(parsed, 'class', expected_classes)

示例#12

0

显示文件

 def test_comment(self):
     """An HTML comment contributes no tokens; uppercase tag names still parse."""
     markup = """<H1>h1 tag word</H1>
            <!-- a comment -->
            orphaned text
            <TABLE><tr><td>table data</td></tr><tr><td>second row</td></tr></TABLE>
            final
            """
     # text outside any tag is expected to join the block before it
     heading_block = 'h1 tag word orphaned text'.split()
     table_block = 'table data second row final'.split()
     block_output_tokens(Blockifier.blockify(markup), [heading_block, table_block])

示例#13

0

显示文件

    def test_transform(self):
        """Check ``KohlschuetterFeatures.transform`` on a three-block document.

        Each feature row is expected to hold (link density, text density)
        for the previous, current and next block, with zeros standing in
        for the missing neighbour at either end.
        """
        extractor = KohlschuetterFeatures()
        markup = "<html> <p>first </p> <div> <p>second block with <a href=''>anchor</a> </p> <p>the third block</p> </div> </html>"
        parsed = Blockifier.blockify(markup)
        features = extractor.transform(parsed)

        block_output_tokens(parsed, [["first"], ["second", "block", "with", "anchor"], ["the", "third", "block"]])
        link_output_tokens(parsed, [[], ["anchor"], []])

        text_density = [1.0, 4.0, 3.0]
        link_density = [1.0, 0.25, 1.0 / 3.0]

        # pad with a zero pair on both sides, then slide a 3-block window
        zero_pair = [0.0, 0.0]
        per_block = [[ld, td] for ld, td in zip(link_density, text_density)]
        padded = [zero_pair] + per_block + [zero_pair]
        for row in range(3):
            expected_row = padded[row] + padded[row + 1] + padded[row + 2]
            assert np.allclose(features[row, :], expected_row)

示例#14

0

显示文件

 def test_nested_blocks(self):
     """Nested ``<div>``/``<p>``/``<h1>`` markup is expected to split into seven blocks."""
     markup = """initial text
         <div>div <p> with paragraph </p>
         after Paragraph
         <div> nested div <div> and again </div>here</div>
         </div>
         final
         <div> <i> italic </i> before <h1>tag</h1></div>"""
     # one phrase per expected block; whitespace-splitting yields its tokens
     block_phrases = (
         'initial text',
         'div',
         'with paragraph after Paragraph',
         'nested div',
         'and again here final',
         'italic before',
         'tag',
     )
     expected_tokens = [phrase.split() for phrase in block_phrases]
     block_output_tokens(Blockifier.blockify(markup), expected_tokens)

示例#15

0

显示文件

 def test_anchors(self):
     """Text inside ``<a>`` elements is reported as link tokens of its block.

     The anchor that wraps only an ``<img>`` is expected to contribute no
     link tokens.
     """
     markup = """<a href=".">anchor text</a>
            more
            <div>text <a href=".">123</a><div>MORE!</div></div>
            an img link<a href="."><img src="."></a>there
            <table><tr><td><a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a></tr></td></table>"""
     parsed = Blockifier.blockify(markup)

     # (block text, linked text) per expected block; '' splits to no tokens
     expectations = [
         ('anchor text more', 'anchor text'),
         ('text 123', '123'),
         ('MORE! an img link there', ''),
         ('WILL THIS PASS THE TEST ??', 'WILL THIS PASS THE TEST ??'),
     ]
     block_output_tokens(parsed, [text.split() for text, _ in expectations])
     link_output_tokens(parsed, [linked.split() for _, linked in expectations])

示例#16

0

显示文件

 def test_invalid_bytes(self):
     """Bytes that are not valid utf-8 must not make ``blockify`` fail.

     Only the ``<div>`` block is expected back, with its ``class`` token
     equal to ``str_cast(b'\\xc2\\x80')``.
     """
     # \x80 is invalid utf-8
     raw_markup = b"""<div CLASS='\x80'>text in div</div><p>invalid bytes \x80</p>"""
     parsed = Blockifier.blockify(raw_markup, encoding='utf-8')

     expected_tokens = [['text', 'in', 'div']]
     block_output_tokens(parsed, expected_tokens)

     expected_classes = [[str_cast(b'\xc2\x80')]]
     css_output_tokens(parsed, 'class', expected_classes)

示例#17

0

显示文件

 def test_unicode(self):
     """A non-ASCII character inside a link shows up in both block and link tokens."""
     markup = u"""<div><div><a href="."> the registered trademark \xae</a></div></div>"""
     expected = ['the', 'registered', 'trademark', u'\xae']
     parsed = Blockifier.blockify(markup)
     # the whole block text is inside the <a>, so both checks use the same tokens
     block_output_tokens(parsed, [list(expected)])
     link_output_tokens(parsed, [list(expected)])