Exemplo n.º 1
0
    def test_html_serialize(self):
        """Check the markup returned by ``self.parser._html_serialize``.

        Two inputs are exercised:

        * ``Hello``, a space chunk and three CJK words; the expected markup
          keeps ``Hello `` bare and wraps each CJK word in a span carrying
          the supplied attributes.
        * chunks whose words contain ``<`` and ``>``; the expected markup
          has those characters entity-encoded and no inner spans.
        """
        # Case 1: Latin word, space, then CJK words.
        cjk_words = (u'今天', u'天气', u'很好')
        mixed_chunks = budou.ChunkList(
            [budou.Chunk('Hello'), budou.Chunk.space()] +
            [budou.Chunk(word) for word in cjk_words])
        wanted_markup = ('<span>'
                         'Hello '
                         u'<span class="foo">今天</span>'
                         u'<span class="foo">天气</span>'
                         u'<span class="foo">很好</span>'
                         '</span>')
        actual_markup = self.parser._html_serialize(
            mixed_chunks, {'class': 'foo'})
        self.assertEqual(
            actual_markup, wanted_markup,
            'The chunks should be compiled to a HTML code.')

        # Case 2: words that contain HTML markup themselves.
        unsafe_words = ('Hey<', '<script>alert(1)</script>', '>guys')
        unsafe_chunks = budou.ChunkList(
            [budou.Chunk(word) for word in unsafe_words])
        wanted_markup = (
            '<span>'
            'Hey&lt;&lt;script&gt;alert(1)&lt;/script&gt;&gt;guys'
            '</span>')
        actual_markup = self.parser._html_serialize(
            unsafe_chunks, {'class': 'foo'})
        self.assertEqual(
            actual_markup, wanted_markup,
            'HTML tags included in a chunk should be encoded.')
Exemplo n.º 2
0
    def test_html_serialize(self):
        """Check the markup returned by ``self.parser._html_serialize``.

        Three inputs are exercised, each passing a third positional argument:

        * ``Hello``, a space chunk and three CJK words with ``None``; the
          expected markup keeps ``Hello `` bare and wraps each CJK word in
          a span carrying the supplied attributes.
        * chunks whose words contain ``<`` and ``>`` with ``None``; the
          expected markup has those characters entity-encoded.
        * three Japanese chunks with ``6``; the expected markup leaves the
          eight-character middle chunk without a span (the assertion
          message describes this as exceeding the max length).
        """
        # Case 1: Latin word, space, then CJK words; third argument None.
        cjk_words = (u'今天', u'天气', u'很好')
        mixed_chunks = budou.ChunkList(
            [budou.Chunk('Hello'), budou.Chunk.space()] +
            [budou.Chunk(word) for word in cjk_words])
        wanted_markup = ('<span>'
                         'Hello '
                         u'<span class="foo">今天</span>'
                         u'<span class="foo">天气</span>'
                         u'<span class="foo">很好</span>'
                         '</span>')
        actual_markup = self.parser._html_serialize(
            mixed_chunks, {'class': 'foo'}, None)
        self.assertEqual(
            actual_markup, wanted_markup,
            'The chunks should be compiled to a HTML code.')

        # Case 2: words that contain HTML markup; third argument None.
        unsafe_words = ('Hey<', '<script>alert(1)</script>', '>guys')
        unsafe_chunks = budou.ChunkList(
            [budou.Chunk(word) for word in unsafe_words])
        wanted_markup = (
            '<span>'
            'Hey&lt;&lt;script&gt;alert(1)&lt;/script&gt;&gt;guys'
            '</span>')
        actual_markup = self.parser._html_serialize(
            unsafe_chunks, {'class': 'foo'}, None)
        self.assertEqual(
            actual_markup, wanted_markup,
            'HTML tags included in a chunk should be encoded.')

        # Case 3: third argument 6; the long middle chunk stays unwrapped.
        japanese_words = (u'去年', u'インフルエンザに', u'かかった。')
        japanese_chunks = budou.ChunkList(
            [budou.Chunk(word) for word in japanese_words])
        wanted_markup = ('<span>'
                         u'<span class="foo">去年</span>'
                         u'インフルエンザに'
                         u'<span class="foo">かかった。</span>'
                         '</span>')
        actual_markup = self.parser._html_serialize(
            japanese_chunks, {'class': 'foo'}, 6)
        self.assertEqual(
            actual_markup, wanted_markup,
            'Chunks that exceed the max length should not be enclosed by a span.'
        )
Exemplo n.º 3
0
    def test_group_chunks_by_entities(self):
        """Check ``self.parser._group_chunks_by_entities`` on two entities.

        With an entity matching the middle chunk exactly, the expected words
        are unchanged. With an entity that starts at offset 0 and ends
        partway through the second chunk, the expected result has the first
        two chunks merged into one word.
        """
        words = ('foo', 'bar', 'baz')

        # chunks: foo bar baz
        # entity: ___ bar ___
        source = budou.ChunkList([budou.Chunk(word) for word in words])
        grouped = self.parser._group_chunks_by_entities(
            source, [{'beginOffset': 3, 'content': 'bar'}])
        self.assertEqual(
            ['foo', 'bar', 'baz'], [chunk.word for chunk in grouped])

        # chunks: foo bar baz
        # entity: foo ba_ ___
        source = budou.ChunkList([budou.Chunk(word) for word in words])
        grouped = self.parser._group_chunks_by_entities(
            source, [{'beginOffset': 0, 'content': 'fooba'}])
        self.assertEqual(
            ['foobar', 'baz'], [chunk.word for chunk in grouped])
Exemplo n.º 4
0
 def test_html_serialize(self):
     """Check the markup returned by ``self.parser._html_serialize``.

     The input is the chunks ``a``, ``b``, a space chunk and ``c``; the
     expected markup wraps each letter in a span carrying the supplied
     attributes and emits the space between the second and third spans.
     """
     letter_chunks = budou.ChunkList([
         budou.Chunk('a'), budou.Chunk('b'),
         budou.Chunk.space(), budou.Chunk('c')])
     wanted_markup = ('<span>'
                      '<span class="foo">a</span>'
                      '<span class="foo">b</span> '
                      '<span class="foo">c</span>'
                      '</span>')
     actual_markup = self.parser._html_serialize(
         letter_chunks, {'class': 'foo'})
     self.assertEqual(
         actual_markup, wanted_markup,
         'The chunks should be compiled to a HTML code.')
Exemplo n.º 5
0
    def test_concatenate_inner(self):
        """Check two successive ``self.parser._concatenate_inner`` passes.

        Starting from the chunks ``ab`` (dependency ``None``), ``cde``
        (``True``) and ``fgh`` (``False``), a pass with ``True`` is expected
        to yield the words ``ab`` and ``cdefgh`` with dependencies ``None``
        and ``False``. A further pass with ``False`` over that result is
        expected to yield the single word ``abcdefgh``.
        """
        merged = budou.ChunkList()
        for word, dependency in (('ab', None), ('cde', True), ('fgh', False)):
            merged.append(budou.Chunk(word, dependency=dependency))

        # First pass: second argument True.
        merged = self.parser._concatenate_inner(merged, True)
        words_after_first = [chunk.word for chunk in merged]
        self.assertEqual(
            ['ab', 'cdefgh'], words_after_first,
            'Chunks should be concatenated if they depends on the following word.')
        dependencies_after_first = [chunk.dependency for chunk in merged]
        self.assertEqual(
            [None, False], dependencies_after_first,
            'Dependency should persist even if it\'s concatenated by others.')

        # Second pass over the first pass's output: second argument False.
        merged = self.parser._concatenate_inner(merged, False)
        words_after_second = [chunk.word for chunk in merged]
        self.assertEqual(
            ['abcdefgh'], words_after_second,
            'Chunks should be concatenated if they depends on the previous word.')
Exemplo n.º 6
0
 def setUp(self):
     """Store a three-chunk ``budou.ChunkList`` fixture on ``self.chunks``.

     The chunks are ``ab``, ``cde`` and ``fgh`` with dependencies ``None``,
     ``True`` and ``False`` respectively.
     """
     fixture = budou.ChunkList()
     for word, dependency in (('ab', None), ('cde', True), ('fgh', False)):
         fixture.append(budou.Chunk(word, dependency=dependency))
     self.chunks = fixture