示例#1
0
 def test_small_doc(self):
     """Blockifying and transforming very small documents raises ``ValueError``.

     Two inputs are exercised in order: an empty ``<html>`` element and a
     document holding only two single-letter blocks.
     """
     extractor = KohlschuetterFeatures()
     tiny_docs = (
         "<html></html>",
         "<html> <p>a</p> <div>b</div> </html>",
     )
     for tiny_doc in tiny_docs:
         with pytest.raises(ValueError):
             extractor.transform(Blockifier.blockify(tiny_doc))
示例#2
0
    def test_small_doc(self):
        """Blockifying and transforming very small documents raises ``ValueError``."""
        extractor = KohlschuetterFeatures()
        # Checked in this order: an empty document, then one with two tiny blocks.
        for tiny_doc in ('<html></html>',
                         '<html> <p>a</p> <div>b</div> </html>'):
            with self.assertRaises(ValueError):
                extractor.transform(Blockifier.blockify(tiny_doc))
示例#3
0
    def test_lxml_error(self):
        """Check that ``blockify`` raises ``BlockifyError`` when parsing fails.

        Two inputs are covered: the empty string, and an unterminated
        comment for which lxml's recovering HTML parser returns ``None``
        instead of a tree.
        """
        # Parsing the empty string raises an error.
        with pytest.raises(BlockifyError):
            Blockifier.blockify("")

        # For an unterminated comment lxml hands back None; confirm that
        # first, then check that blockify reports it as a BlockifyError.
        unterminated_comment = "<!--"
        recovering_parser = etree.HTMLParser(recover=True)
        tree = etree.fromstring(unterminated_comment, recovering_parser)
        assert tree is None
        with pytest.raises(BlockifyError):
            Blockifier.blockify(unterminated_comment)
示例#4
0
 def test_callback(self):
     """Check that ``blockify`` drives the supplied ``parse_callback``.

     ``self.count_divs`` is passed as the callback for markup containing
     two ``<div>`` elements (one of them inside ``<pre>``), and
     ``self.div_count`` is expected to equal 2 afterwards.
     """
     s = """<div>some text <i>in italic</i> and something else
                 <pre> <div>skip this</div> </pre>
                 <b>bold stuff</b> after the script
            </div>"""
     # Only the callback's effect on ``self.div_count`` is checked, so the
     # returned blocks are deliberately not bound to a name.
     Blockifier.blockify(s, parse_callback=self.count_divs)
     assert self.div_count == 2
示例#5
0
    def test_transform(self):
        """Check tokens and ``KohlschuetterFeatures`` output for a three-block document.

        Each expected feature row holds (link density, text density) pairs
        for the previous, current and next block, with zeros standing in
        where there is no neighbouring block.
        """
        extractor = KohlschuetterFeatures()
        markup = "<html> <p>first </p> <div> <p>second block with <a href=''>anchor</a> </p> <p>the third block</p> </div> </html>"
        parsed = Blockifier.blockify(markup)
        features = extractor.transform(parsed)

        expected_block_tokens = [
            ["first"],
            ["second", "block", "with", "anchor"],
            ["the", "third", "block"],
        ]
        block_output_tokens(parsed, expected_block_tokens)
        link_output_tokens(parsed, [[], ["anchor"], []])

        text_density = [1.0, 4.0, 3.0]
        link_density = [1.0, 0.25, 1.0 / 3.0]

        # One (link, text) pair per block, padded with a zero pair on each
        # side so a three-pair window can slide over the blocks.
        zero_pair = (0.0, 0.0)
        pairs = [zero_pair] + list(zip(link_density, text_density)) + [zero_pair]
        for row in range(3):
            expected_row = [value for pair in pairs[row:row + 3] for value in pair]
            assert np.allclose(features[row, :], expected_row)
示例#6
0
 def test_all_non_english(self):
     """Check block and link tokens for markup whose text is entirely non-ASCII.

     Three nested ``<div>`` elements each hold one word; the middle word
     is wrapped in an anchor and is the only expected link token.
     """
     markup = u"""<div> <div> \u03b4\u03bf\u03b3 </div> <div> <a href="summer">\xe9t\xe9</a> </div>
      <div> \u62a5\u9053\u4e00\u51fa </div> </div>"""
     first_word = u'\u03b4\u03bf\u03b3'
     linked_word = u'\xe9t\xe9'
     last_word = u'\u62a5\u9053\u4e00\u51fa'

     parsed = Blockifier.blockify(markup)
     block_output_tokens(parsed, [[first_word], [linked_word], [last_word]])
     link_output_tokens(parsed, [[], [linked_word], []])
示例#7
0
 def test_big_html(self, html):
     """Check text, link and CSS class/id tokens for the document passed as ``html``.

     Eleven blocks are expected; the per-block expectations below are
     listed in block order.
     """
     parsed = Blockifier.blockify(html)

     expected_tokens = [
         ['Inside', 'the', 'h1', 'tag'],
         ['First', 'line', 'of', 'the', 'content', 'in', 'bold'],
         ['A', 'paragraph', 'with', 'a', 'link', 'and', 'some',
          'additional', 'words.'],
         ['Second', 'paragraph', 'Insert', 'a', 'block', 'quote', 'here'],
         ['Some', 'more', 'text', 'after', 'the', 'image'],
         ['An', 'h2', 'tag', 'just', 'for', 'kicks'],
         ['Finally', 'more', 'text', 'at', 'the', 'end', 'of', 'the',
          'content'],
         ['This', 'is', 'a', 'comment'],
         ['with', 'two', 'paragraphs', 'and', 'some', 'comment', 'spam'],
         ['Second', 'comment'],
         ['Footer', 'text'],
     ]
     block_output_tokens(parsed, expected_tokens)

     # Only blocks 2 and 8 are expected to carry link tokens.
     expected_links = [[] for _ in range(11)]
     expected_links[2] = ['a', 'link']
     expected_links[8] = ['and', 'some', 'comment', 'spam']
     link_output_tokens(parsed, expected_links)

     # Blocks without a class/id are expected to yield a single empty token.
     expected_classes = ([[''], ['title'], ['link']]
                         + [[''] for _ in range(7)]
                         + [['footer']])
     css_output_tokens(parsed, 'class', expected_classes)

     expected_ids = ([[''], ['content'], ['para']]
                     + [[''] for _ in range(8)])
     css_output_tokens(parsed, 'id', expected_ids)
示例#8
0
 def test_very_simple2(self):
     """Check that one block is produced and the ``<script>`` content is left out.

     Inline ``<i>`` and ``<b>`` text is expected in the block's tokens,
     while the words inside ``<script>`` are absent from them.
     """
     markup = """<div>some text <i>in italic</i> and something else
                 <script> <div>skip this</div> </script>
                 <b>bold stuff</b> after the script
            </div>"""
     expected_tokens = [
         'some', 'text', 'in', 'italic', 'and', 'something', 'else',
         'bold', 'stuff', 'after', 'the', 'script',
     ]
     parsed = Blockifier.blockify(markup)
     self.block_output_tokens(parsed, [expected_tokens])
示例#9
0
 def test_unicode(self):
     """Check that a non-ASCII character inside an anchor is kept as a token.

     The whole text sits inside one ``<a>``, so the same tokens are
     expected for both the block text and the link text.
     """
     markup = u"""<div><div><a href="."> the registered trademark \xae</a></div></div>"""
     anchor_tokens = ['the', 'registered', 'trademark', u'\xae']

     parsed = Blockifier.blockify(markup)
     # Hand each helper its own copy of the expected tokens.
     block_output_tokens(parsed, [list(anchor_tokens)])
     link_output_tokens(parsed, [list(anchor_tokens)])
示例#10
0
    def test_transform(self):
        """Check tokens and ``KohlschuetterFeatures`` output for a three-block document.

        Each expected feature row holds (link density, text density) pairs
        for the previous, current and next block, with zeros standing in
        where there is no neighbouring block.
        """
        kf = KohlschuetterFeatures()
        # The literal is double-quoted so that the empty ``href=''`` stays
        # inside it; with single quotes the attribute's quotes ended the
        # literal and the pieces were silently concatenated as ``href=>``.
        s = "<html> <p>first </p> <div> <p>second block with <a href=''>anchor</a> </p> <p>the third block</p> </div> </html>"
        blocks = Blockifier.blockify(s)
        features = kf.transform(blocks)
        self.block_output_tokens(
            blocks, [['first'], ['second', 'block', 'with', 'anchor'],
                     ['the', 'third', 'block']])
        self.link_output_tokens(blocks, [[], ['anchor'], []])

        text_density = [1.0, 4.0, 3.0]
        link_density = [1.0, 0.25, 1.0 / 3.0]

        # First block: no previous neighbour, so the leading pair is zero.
        self.assertTrue(
            np.allclose(features[0, :], [
                0.0, 0.0, link_density[0], text_density[0], link_density[1],
                text_density[1]
            ]))
        # Middle block: both neighbours present.
        self.assertTrue(
            np.allclose(features[1, :], [
                link_density[0], text_density[0], link_density[1],
                text_density[1], link_density[2], text_density[2]
            ]))
        # Last block: no next neighbour, so the trailing pair is zero.
        self.assertTrue(
            np.allclose(features[2, :], [
                link_density[1], text_density[1], link_density[2],
                text_density[2], 0.0, 0.0
            ]))
示例#11
0
 def test_unicode(self):
     """Check that a non-ASCII character inside an anchor is kept as a token.

     The expected token for the symbol is its UTF-8 encoded form, and the
     same tokens are expected for both the block text and the link text.
     """
     markup = u"""<div><div><a href="."> the registered trademark \xae</a></div></div>"""
     parsed = Blockifier.blockify(markup)

     encoded_symbol = u'\xae'.encode('utf-8')
     self.block_output_tokens(
         parsed, [['the', 'registered', 'trademark', encoded_symbol]])
     self.link_output_tokens(
         parsed, [['the', 'registered', 'trademark', encoded_symbol]])
示例#12
0
 def test_big_html(self):
     """Check text, link and CSS class/id tokens for ``big_html_doc``.

     Eleven blocks are expected; the per-block expectations below are
     listed in block order.
     """
     parsed = Blockifier.blockify(big_html_doc)

     # One line of expected text per block, split on whitespace into tokens.
     expected_text = (
         'Inside the h1 tag',
         'First line of the content in bold',
         'A paragraph with a link and some additional words.',
         'Second paragraph Insert a block quote here',
         'Some more text after the image',
         'An h2 tag just for kicks',
         'Finally more text at the end of the content',
         'This is a comment',
         'with two paragraphs and some comment spam',
         'Second comment',
         'Footer text',
     )
     self.block_output_tokens(
         parsed, [line.split() for line in expected_text])

     # Only blocks 2 and 8 are expected to carry link tokens.
     expected_links = [[] for _ in range(11)]
     expected_links[2] = ['a', 'link']
     expected_links[8] = ['and', 'some', 'comment', 'spam']
     self.link_output_tokens(parsed, expected_links)

     # Blocks without a class/id are expected to yield a single empty token.
     expected_classes = [[''] for _ in range(11)]
     expected_classes[1] = ['title']
     expected_classes[2] = ['link']
     expected_classes[10] = ['footer']
     self.css_output_tokens(parsed, 'class', expected_classes)

     expected_ids = [[''] for _ in range(11)]
     expected_ids[1] = ['content']
     expected_ids[2] = ['para']
     self.css_output_tokens(parsed, 'id', expected_ids)
示例#13
0
 def test_simple_two_blocks(self):
     """Check that an ``<h1>`` plus trailing text and a ``<div>`` give two blocks.

     The text following the ``<h1>`` is expected in the first block, and
     the ``<span>`` text is expected inside the ``<div>`` block.
     """
     markup = """<h1>A title <i>with italics</i> and other words</h1>
            some text outside the h1
            <div>a div <span class="test"> with a span </span> more </div>"""
     heading_block = ['A', 'title', 'with', 'italics', 'and', 'other',
                      'words', 'some', 'text', 'outside', 'the', 'h1']
     div_block = ['a', 'div', 'with', 'a', 'span', 'more']

     parsed = Blockifier.blockify(markup)
     self.block_output_tokens(parsed, [heading_block, div_block])
示例#14
0
 def test_very_simple(self):
     """Check that text around a ``<script>`` forms one block without the script text."""
     markup = """<div>some text
                 <script> skip this </script>
                 more text here
            </div>"""
     expected_tokens = ['some', 'text', 'more', 'text', 'here']

     parsed = Blockifier.blockify(markup)
     self.block_output_tokens(parsed, [expected_tokens])
示例#15
0
 def test_callback(self):
     """Check that ``blockify`` drives the supplied ``parse_callback``.

     ``TestBlockifier.count_divs`` is passed as the callback for markup
     containing two ``<div>`` elements (one of them inside ``<pre>``), and
     ``TestBlockifier.div_count`` is expected to equal 2 afterwards.
     """
     s = """<div>some text <i>in italic</i> and something else
                 <pre> <div>skip this</div> </pre>
                 <b>bold stuff</b> after the script
            </div>"""
     # Only the callback's effect on ``div_count`` is checked, so the
     # returned blocks are deliberately not bound to a name.
     Blockifier.blockify(s, parse_callback=TestBlockifier.count_divs)
     self.assertEqual(TestBlockifier.div_count, 2)
示例#16
0
 def test_class_id_unicode(self):
     """Check class/id tokens when the class attribute holds non-ASCII bytes.

     The markup is passed as bytes together with ``encoding='utf-8'``.
     The expected class and id tokens are lowercase even though the
     attribute name and the id value are written in uppercase.
     """
     markup = b"""<div CLASS=' class1 \xc2\xae'>text in div
             <h1 id="HEADER">header</h1>
             </div>"""
     parsed = Blockifier.blockify(markup, encoding='utf-8')

     block_output_tokens(parsed, [['text', 'in', 'div'], ['header']])
     css_output_tokens(parsed, 'id', [[''], ['header']])

     non_ascii_class = str_cast(b'\xc2\xae')
     css_output_tokens(parsed, 'class', [['class1', non_ascii_class], ['']])
示例#17
0
 def test_simple_two_blocks(self):
     """Check that an ``<h1>`` plus trailing text and a ``<div>`` give two blocks.

     The text following the ``<h1>`` is expected in the first block, and
     the ``<span>`` text is expected inside the ``<div>`` block.
     """
     markup = """<h1>A title <i>with italics</i> and other words</h1>
            some text outside the h1
            <div>a div <span class="test"> with a span </span> more </div>"""
     parsed = Blockifier.blockify(markup)

     expected = [
         'A title with italics and other words some text outside the h1'.split(),
         'a div with a span more'.split(),
     ]
     block_output_tokens(parsed, expected)
示例#18
0
 def test_empty_blocks(self):
     """Check which punctuation-only elements contribute to the output.

     The expected output has two blocks: ``.!`` from the leading ``<div>``
     appears alongside the loose text, while nothing from the trailing
     ``<p> ! _ </p>`` is expected.
     """
     markup = """<div> .! </div>
             some text
            <h1> in an h1 </h1>
            <p> ! _ </p>
         """
     expected = [
         ['.!', 'some', 'text'],
         ['in', 'an', 'h1'],
     ]
     parsed = Blockifier.blockify(markup)
     self.block_output_tokens(parsed, expected)
示例#19
0
 def test_class_id(self):
     """Check per-block ``class`` and ``id`` tokens for nested elements.

     The expected values are lowercase even though the markup writes
     ``CLASS`` and ``HEADER`` in uppercase; blocks lacking the attribute
     are expected to yield a single empty token.
     """
     markup = """<div CLASS='d1'>text in div
             <h1 id="HEADER">header</h1>
             <div class="nested">dragnet</div>
             </div>"""
     parsed = Blockifier.blockify(markup)

     expected_text = [['text', 'in', 'div'], ['header'], ['dragnet']]
     block_output_tokens(parsed, expected_text)

     expected_ids = [[''], ['header'], ['']]
     css_output_tokens(parsed, 'id', expected_ids)

     expected_classes = [['d1'], [''], ['nested']]
     css_output_tokens(parsed, 'class', expected_classes)
示例#20
0
 def test_comment(self):
     """Check that HTML comment text is absent from the expected tokens.

     The markup uses uppercase tag names, and two blocks are expected:
     the ``<H1>`` with the loose text after it, and the table cells with
     the trailing text.
     """
     markup = """<H1>h1 tag word</H1>
            <!-- a comment -->
            orphaned text
            <TABLE><tr><td>table data</td></tr><tr><td>second row</td></tr></TABLE>
            final
            """
     heading_block = ['h1', 'tag', 'word', 'orphaned', 'text']
     table_block = ['table', 'data', 'second', 'row', 'final']

     parsed = Blockifier.blockify(markup)
     block_output_tokens(parsed, [heading_block, table_block])
示例#21
0
 def test_class_id_unicode(self):
     """Check class/id tokens when the class attribute holds non-ASCII characters.

     The markup is blockified with ``encoding='utf-8'``. The expected
     class and id tokens are lowercase even though the attribute name and
     the id value are written in uppercase.
     """
     markup = """<div CLASS=' class1 \xc2\xae'>text in div
             <h1 id="HEADER">header</h1>
             </div>"""
     parsed = Blockifier.blockify(markup, encoding='utf-8')

     expected_text = [['text', 'in', 'div'], ['header']]
     self.block_output_tokens(parsed, expected_text)

     expected_ids = [[''], ['header']]
     self.css_output_tokens(parsed, 'id', expected_ids)

     expected_classes = [['class1', '\xc2\xae'], ['']]
     self.css_output_tokens(parsed, 'class', expected_classes)
示例#22
0
 def test_comment(self):
     """Check that HTML comment text is absent from the expected tokens.

     Two blocks are expected: the ``<H1>`` with the loose text after it,
     and the table cells with the trailing text.
     """
     markup = """<H1>h1 tag word</H1>
            <!-- a comment -->
            orphaned text
            <TABLE><tr><td>table data</td></tr><tr><td>second row</td></tr></TABLE>
            final
            """
     parsed = Blockifier.blockify(markup)

     expected = [
         'h1 tag word orphaned text'.split(),
         'table data second row final'.split(),
     ]
     self.block_output_tokens(parsed, expected)
示例#23
0
 def test_class_id(self):
     """Check per-block ``class`` and ``id`` tokens for nested elements.

     Expected values are lowercase although the markup writes ``CLASS``
     and ``HEADER`` in uppercase; a block lacking the attribute is
     expected to yield a single empty token.
     """
     markup = """<div CLASS='d1'>text in div
             <h1 id="HEADER">header</h1>
             <div class="nested">dragnet</div>
             </div>"""
     parsed = Blockifier.blockify(markup)

     # Expectations are given per block, in document order.
     text_by_block = [['text', 'in', 'div'], ['header'], ['dragnet']]
     id_by_block = [[''], ['header'], ['']]
     class_by_block = [['d1'], [''], ['nested']]

     self.block_output_tokens(parsed, text_by_block)
     self.css_output_tokens(parsed, 'id', id_by_block)
     self.css_output_tokens(parsed, 'class', class_by_block)
示例#24
0
 def test_all_non_english(self):
     """Check block and link tokens for markup whose text is entirely non-ASCII.

     Expected tokens are the UTF-8 encoded forms of the three words; the
     middle word sits inside an anchor and is the only expected link token.
     """
     markup = u"""<div> <div> \u03b4\u03bf\u03b3 </div> <div> <a href="summer">\xe9t\xe9</a> </div>
      <div> \u62a5\u9053\u4e00\u51fa </div> </div>"""
     parsed = Blockifier.blockify(markup)

     first_word = u'\u03b4\u03bf\u03b3'.encode('utf-8')
     linked_word = u'\xe9t\xe9'.encode('utf-8')
     last_word = u'\u62a5\u9053\u4e00\u51fa'.encode('utf-8')
     self.block_output_tokens(
         parsed, [[first_word], [linked_word], [last_word]])

     self.link_output_tokens(
         parsed, [[], [u'\xe9t\xe9'.encode('utf-8')], []])
示例#25
0
    def test_arias_model(self):
        """Compare ``Arias.analyze`` output with the joined text of blocks 1 to 3.

        The model is built with a cutoff percent of 60 and a window of 2,
        and its result for ``big_html_doc`` must equal the text of the
        blocks at indices 1, 2 and 3 joined by single spaces.
        """
        model = Arias(60, 2)  # positional arguments: cutoff_percent, window
        extracted = model.analyze(big_html_doc)

        # Build the reference content straight from the blockified document.
        content_indices = (1, 2, 3)
        parsed = Blockifier.blockify(big_html_doc)
        reference = ' '.join(parsed[idx].text for idx in content_indices)

        self.assertEqual(reference, extracted)
示例#26
0
    def test_arias_model(self):
        """Compare ``Arias.analyze`` output with the joined text of blocks 1 to 3.

        The model is built with a cutoff percent of 60 and a window of 2.
        """
        cutoff_percent, window = 60, 2
        model_output = Arias(cutoff_percent, window).analyze(big_html_doc)

        # Reference content: text of the blocks at indices 1, 2 and 3,
        # separated by single spaces.
        parsed = Blockifier.blockify(big_html_doc)
        pieces = []
        for position in [1, 2, 3]:
            pieces.append(parsed[position].text)
        reference = ' '.join(pieces)

        self.assertEqual(reference, model_output)
示例#27
0
 def test_nested_blocks(self):
     """Check how nested ``<div>``, ``<p>`` and ``<h1>`` elements split into blocks.

     Seven blocks are expected; in several of them the text following a
     closing tag is listed together with the text of that element.
     """
     markup = """initial text
         <div>div <p> with paragraph </p>
         after Paragraph
         <div> nested div <div> and again </div>here</div>
         </div>
         final
         <div> <i> italic </i> before <h1>tag</h1></div>"""
     expected = [
         ['initial', 'text'],
         ['div'],
         ['with', 'paragraph', 'after', 'Paragraph'],
         ['nested', 'div'],
         ['and', 'again', 'here', 'final'],
         ['italic', 'before'],
         ['tag'],
     ]
     parsed = Blockifier.blockify(markup)
     block_output_tokens(parsed, expected)
示例#28
0
 def test_anchors(self):
     """Check block tokens and link tokens for markup containing anchors.

     Covers a leading anchor, an anchor inside a ``<div>``, an anchor that
     wraps only an image (no link tokens expected for that block) and an
     anchor spanning a whole table cell.
     """
     markup = """<a href=".">anchor text</a>
            more
            <div>text <a href=".">123</a><div>MORE!</div></div>
            an img link<a href="."><img src="."></a>there
            <table><tr><td><a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a></tr></td></table>"""
     parsed = Blockifier.blockify(markup)

     # The table cell is entirely inside an anchor, so its expected block
     # tokens and link tokens coincide.
     cell_tokens = ['WILL', 'THIS', 'PASS', 'THE', 'TEST', '??']

     expected_blocks = [
         ['anchor', 'text', 'more'],
         ['text', '123'],
         ['MORE!', 'an', 'img', 'link', 'there'],
         list(cell_tokens),
     ]
     block_output_tokens(parsed, expected_blocks)

     expected_links = [['anchor', 'text'], ['123'], [], list(cell_tokens)]
     link_output_tokens(parsed, expected_links)
示例#29
0
 def test_nested_blocks(self):
     """Check how nested ``<div>``, ``<p>`` and ``<h1>`` elements split into blocks.

     Seven blocks are expected, given below as one line of text per block.
     """
     markup = """initial text
         <div>div <p> with paragraph </p>
         after Paragraph
         <div> nested div <div> and again </div>here</div>
         </div>
         final
         <div> <i> italic </i> before <h1>tag</h1></div>"""
     parsed = Blockifier.blockify(markup)

     # Each line is split on whitespace to obtain the block's tokens.
     text_per_block = (
         'initial text',
         'div',
         'with paragraph after Paragraph',
         'nested div',
         'and again here final',
         'italic before',
         'tag',
     )
     self.block_output_tokens(
         parsed, [text.split() for text in text_per_block])
示例#30
0
    def test_anchors(self):
        """Check block tokens and link tokens for markup containing anchors.

        Four blocks are expected. The block that contains an image-only
        anchor is expected to have no link tokens, and the table cell that
        is wrapped in an anchor is expected to have identical block and
        link tokens.
        """
        markup = """<a href=".">anchor text</a>
               more
               <div>text <a href=".">123</a><div>MORE!</div></div>
               an img link<a href="."><img src="."></a>there
               <table><tr><td><a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a></tr></td></table>"""
        parsed = Blockifier.blockify(markup)

        expected_blocks = [
            'anchor text more'.split(),
            'text 123'.split(),
            'MORE! an img link there'.split(),
            'WILL THIS PASS THE TEST ??'.split(),
        ]
        self.block_output_tokens(parsed, expected_blocks)

        expected_links = [
            'anchor text'.split(),
            ['123'],
            [],
            'WILL THIS PASS THE TEST ??'.split(),
        ]
        self.link_output_tokens(parsed, expected_links)
示例#31
0
 def test_invalid_bytes(self):
     """Check the tokens expected for markup containing the byte 0x80.

     The markup is blockified with ``encoding='utf-8'``. One text block
     with the ``<div>`` words is expected, and the class token is expected
     to be the two-byte sequence 0xC2 0x80.
     """
     # \x80 is invalid utf-8
     markup = """<div CLASS='\x80'>text in div</div><p>invalid bytes \x80</p>"""
     parsed = Blockifier.blockify(markup, encoding='utf-8')

     expected_text = [['text', 'in', 'div']]
     self.block_output_tokens(parsed, expected_text)

     expected_classes = [['\xc2\x80']]
     self.css_output_tokens(parsed, 'class', expected_classes)