Пример #1
0
    def test_small_doc(self):
        """Check make_features and analyze on an empty and a two-block document."""
        # analyze is called on an instance; make_features is called on the class.
        koh = Kohlschuetter()

        # Empty document: no features, no blocks, and empty extracted text.
        empty_doc = '<html></html>'
        self.assertEqual((None, []), Kohlschuetter.make_features(empty_doc))
        self.assertEqual('', koh.analyze(empty_doc))

        # Two-block document: features are expected to be None, but the
        # blocks themselves are still returned and analyze joins their text.
        s = '<html> <p>a</p> <div>b</div> </html>'
        features, blocks = Kohlschuetter.make_features(s)
        # assertIsNone reports the offending value on failure, unlike
        # assertTrue(features is None).
        self.assertIsNone(features)
        self.block_output_tokens(blocks, [['a'], ['b']])
        self.assertEqual('a b', koh.analyze(s))
Пример #2
0
    def test_make_features(self):
        """Check the per-block feature rows built from a three-block document.

        Each expected row holds (link_density, text_density) for the previous,
        current and next block, with zeros where no neighbour exists.
        """
        # The anchor's empty href is written with double quotes. The earlier
        # form href='' inside a single-quoted literal was really two adjacent
        # string literals, which produced the malformed markup `<a href=>`.
        s = (
            '<html> <p>first </p> <div> '
            '<p>second block with <a href="">anchor</a> </p> '
            '<p>the third block</p> </div> </html>'
        )
        features, blocks = Kohlschuetter.make_features(s)
        self.block_output_tokens(blocks, [['first'], ['second', 'block', 'with', 'anchor'], ['the', 'third', 'block']])
        self.link_output_tokens(blocks, [[], ['anchor'], []])

        # Expected per-block values, in block order.
        # NOTE(review): link_density is non-zero for blocks one and three even
        # though they have no link tokens -- confirm against the implementation.
        text_density = [1.0, 4.0, 3.0]
        link_density = [1.0, 0.25, 1.0 / 3.0]

        # Pad with a zero pair on each side so that every row is simply a
        # window of three consecutive (link_density, text_density) pairs.
        padded = [(0.0, 0.0)] + list(zip(link_density, text_density)) + [(0.0, 0.0)]
        for row in range(len(text_density)):
            expected = [value for pair in padded[row:row + 3] for value in pair]
            self.assertTrue(
                np.allclose(features[row, :], expected),
                'feature row %d: expected %r, got %r' % (row, expected, features[row, :]),
            )