示例#1
0
  def test_process_with_aria(self):
    """Demonstrates advanced usage considering accessibility."""
    want_chunks = [
        budou.Chunk(u'今日は', u'NOUN', u'NN', True),
        budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
    ]
    want_html = (
        u'<span aria-describedby="parent" class="text-chunk">今日は</span>'
        u'<span aria-describedby="parent" class="text-chunk">晴れ。</span>')

    # Extra attributes passed to parse() should appear on every span.
    attributes = {'aria-describedby': 'parent', 'class': 'text-chunk'}
    result = self.parser.parse(DEFAULT_SENTENCE, attributes, use_cache=False)

    self.assertIn('chunks', result,
                  'Processed result should include chunks.')
    self.assertIn('html_code', result,
                  'Processed result should include organized html code.')
    self.assertEqual(want_chunks, result['chunks'],
                     'Processed result should include expected chunks.')
    self.assertEqual(want_html, result['html_code'],
                     'Processed result should include expected html code.')
示例#2
0
 def setUp(self):
     # Seed the queue with three chunks, one per dependency state
     # (undetermined / forward / backward).
     self.queue = budou.ChunkQueue()
     for word, dep in (('ab', None), ('cde', True), ('fgh', False)):
         self.queue.add(budou.Chunk(word, dependency=dep))
示例#3
0
 def test_get_source_chunks(self):
     # The default sentence should decompose into four POS-tagged chunks.
     got = self.parser._get_source_chunks(DEFAULT_SENTENCE)
     want = [
         budou.Chunk(u'今日', u'NOUN', u'NN', True),
         budou.Chunk(u'は', u'PRT', u'PRT', False),
         budou.Chunk(u'晴れ', u'NOUN', u'ROOT', False),
         budou.Chunk(u'。', u'PUNCT', u'P', False),
     ]
     self.assertEqual(
         want, got,
         'Input sentence should be processed into source chunks.')
示例#4
0
 def test_get_chunks_per_space(self):
     # A space separator becomes its own SPACE_POS chunk between the words.
     want = [
         budou.Chunk('a', None, None, True),
         budou.Chunk(' ', budou.SPACE_POS, budou.SPACE_POS, True),
         budou.Chunk('b', None, None, True)
     ]
     got = self.parser._get_chunks_per_space('a b')
     self.assertEqual(
         got, want,
         'Input text should be parsed into chunks separated by spaces.')
示例#5
0
 def test_spanize(self):
     # Every chunk should be wrapped in a span carrying the class name.
     source_chunks = [budou.Chunk(word, None, None, None)
                      for word in (u'a', u'b', u'c')]
     want = (u'<span class="foo">a</span>'
             '<span class="foo">b</span>'
             '<span class="foo">c</span>')
     got = self.parser._spanize(source_chunks, 'foo')
     self.assertEqual(got, want,
                      'The chunks should be compiled to a HTML code.')
示例#6
0
 def test_concatenate_punctuations(self):
     source_chunks = [
         budou.Chunk(u'a', None, None, None),
         budou.Chunk(u'b', u'PUNCT', None, None),
         budou.Chunk(u'c', None, None, None),
     ]
     # The PUNCT chunk 'b' is expected to merge with the preceding 'a'.
     want = [
         budou.Chunk(u'ab', None, None, None),
         budou.Chunk(u'c', None, None, None),
     ]
     got = self.parser._concatenate_punctuations(source_chunks)
     self.assertEqual(got, want,
                      'Punctuation marks should be concatenated backward.')
示例#7
0
    def test_html_serialize(self):
        """Checks _html_serialize over three scenarios: basic span wrapping,
        HTML escaping of chunk content, and the max-length cutoff.
        """
        # Scenario 1: CJK chunks are wrapped in attribute-carrying spans;
        # the Latin word and the space chunk are emitted bare. The third
        # argument (max length) is None, i.e. no limit.
        chunks = budou.ChunkList([
            budou.Chunk('Hello'),
            budou.Chunk.space(),
            budou.Chunk(u'今天'),
            budou.Chunk(u'天气'),
            budou.Chunk(u'很好')
        ])
        attributes = {'class': 'foo'}
        expected = ('<span>'
                    'Hello '
                    u'<span class="foo">今天</span>'
                    u'<span class="foo">天气</span>'
                    u'<span class="foo">很好</span>'
                    '</span>')
        result = self.parser._html_serialize(chunks, attributes, None)
        self.assertEqual(result, expected,
                         'The chunks should be compiled to a HTML code.')

        # Scenario 2: markup inside chunk words must come out HTML-escaped
        # so a chunk cannot inject tags (e.g. a <script> payload).
        chunks = budou.ChunkList([
            budou.Chunk('Hey<'),
            budou.Chunk('<script>alert(1)</script>'),
            budou.Chunk('>guys')
        ])
        attributes = {'class': 'foo'}
        expected = ('<span>'
                    'Hey&lt;&lt;script&gt;alert(1)&lt;/script&gt;&gt;guys'
                    '</span>')
        result = self.parser._html_serialize(chunks, attributes, None)
        self.assertEqual(result, expected,
                         'HTML tags included in a chunk should be encoded.')

        # Scenario 3: with max length 6, chunks longer than the limit
        # (インフルエンザに) are emitted without a wrapping span.
        chunks = budou.ChunkList([
            budou.Chunk(u'去年'),
            budou.Chunk(u'インフルエンザに'),
            budou.Chunk(u'かかった。')
        ])
        attributes = {'class': 'foo'}
        expected = ('<span>'
                    u'<span class="foo">去年</span>'
                    u'インフルエンザに'
                    u'<span class="foo">かかった。</span>'
                    '</span>')
        result = self.parser._html_serialize(chunks, attributes, 6)
        self.assertEqual(
            result, expected,
            'Chunks that exceed the max length should not be enclosed by a span.'
        )
示例#8
0
 def test_swap(self):
     # Replacing the first two queued chunks with one new chunk should
     # leave [new, last-original] in the queue.
     replaced = self.queue.chunks[0:2]
     replacement = budou.Chunk('ijk')
     self.queue.swap(replaced, replacement)
     remaining_words = [chunk.word for chunk in self.queue.chunks]
     self.assertEqual(['ijk', 'fgh'], remaining_words,
                      'Old chunks should be replaced with the new chunk.')
示例#9
0
 def test_migrate_html(self):
     # Chunks overlapping the <a> element should collapse into one
     # HTML_POS chunk that keeps the original markup.
     markup = u'こ<a>ちらを</a>クリック'
     dom = html.fragment_fromstring(markup, create_parent='body')
     source_chunks = [
         budou.Chunk(u'こちら', u'PRON', u'NSUBJ', True),
         budou.Chunk(u'を', u'PRT', u'PRT', False),
         budou.Chunk(u'クリック', u'NOUN', u'ROOT', False),
     ]
     want = [
         budou.Chunk(u'こ<a>ちらを</a>', budou.HTML_POS, budou.HTML_POS, True),
         budou.Chunk(u'クリック', u'NOUN', u'ROOT', False),
     ]
     got = self.parser._migrate_html(source_chunks, dom)
     self.assertEqual(
         want, got,
         'The HTML source code should be migrated into the chunk list.')
示例#10
0
 def test_html_serialize(self):
     # The space chunk should render as a literal space between spans.
     chunks = budou.ChunkList(
         [budou.Chunk(word) for word in ('a', 'b')]
         + [budou.Chunk.space(), budou.Chunk('c')])
     want = ('<span>'
             '<span class="foo">a</span>'
             '<span class="foo">b</span> '
             '<span class="foo">c</span>'
             '</span>')
     got = self.parser._html_serialize(chunks, {'class': 'foo'})
     self.assertEqual(got, want,
                      'The chunks should be compiled to a HTML code.')
示例#11
0
 def test_spanize(self):
     # Populate a queue, then check each chunk is wrapped in a span and
     # the space chunk renders as a bare space.
     queue = budou.ChunkQueue()
     for chunk in (budou.Chunk('a'), budou.Chunk('b'),
                   budou.Chunk.space(), budou.Chunk('c')):
         queue.add(chunk)
     want = ('<span class="foo">a</span>'
             '<span class="foo">b</span> '
             '<span class="foo">c</span>')
     got = self.parser._spanize(queue, {'class': 'foo'})
     self.assertEqual(got, want,
                      'The chunks should be compiled to a HTML code.')
示例#12
0
    def test_html_serialize(self):
        """Checks _html_serialize wrapping of CJK chunks and HTML escaping."""
        # Scenario 1: CJK chunks get attribute-carrying spans; the Latin
        # word and the space chunk are emitted without their own span.
        chunks = budou.ChunkList([
            budou.Chunk('Hello'),
            budou.Chunk.space(),
            budou.Chunk(u'今天'),
            budou.Chunk(u'天气'),
            budou.Chunk(u'很好')
        ])
        attributes = {'class': 'foo'}
        expected = ('<span>'
                    'Hello '
                    u'<span class="foo">今天</span>'
                    u'<span class="foo">天气</span>'
                    u'<span class="foo">很好</span>'
                    '</span>')
        result = self.parser._html_serialize(chunks, attributes)
        self.assertEqual(result, expected,
                         'The chunks should be compiled to a HTML code.')

        # Scenario 2: markup inside chunk words must come out HTML-escaped
        # so a chunk cannot inject tags (e.g. a <script> payload).
        chunks = budou.ChunkList([
            budou.Chunk('Hey<'),
            budou.Chunk('<script>alert(1)</script>'),
            budou.Chunk('>guys')
        ])
        attributes = {'class': 'foo'}
        expected = ('<span>'
                    'Hey&lt;&lt;script&gt;alert(1)&lt;/script&gt;&gt;guys'
                    '</span>')
        result = self.parser._html_serialize(chunks, attributes)
        self.assertEqual(result, expected,
                         'HTML tags included in a chunk should be encoded.')
示例#13
0
    def test_concatenate_inner(self):
        # One chunk for each dependency state: undetermined, forward, backward.
        chunks = budou.ChunkList()
        for word, dep in (('ab', None), ('cde', True), ('fgh', False)):
            chunks.append(budou.Chunk(word, dependency=dep))

        # Forward pass: 'cde' (dependency=True) merges into the chunk after it.
        chunks = self.parser._concatenate_inner(chunks, True)
        self.assertEqual(
            ['ab', 'cdefgh'], [chunk.word for chunk in chunks],
            'Chunks should be concatenated if they depends on the following word.'
        )
        self.assertEqual(
            [None, False], [chunk.dependency for chunk in chunks],
            'Dependency should persist even if it\'s concatenated by others.')

        # Backward pass on the result: the dependency=False chunk merges
        # into the chunk before it, leaving a single word.
        chunks = self.parser._concatenate_inner(chunks, False)
        self.assertEqual(
            ['abcdefgh'], [chunk.word for chunk in chunks],
            'Chunks should be concatenated if they depends on the previous word.'
        )
示例#14
0
    def test_process(self):
        # Parse with default options and verify both output fields.
        result = self.parser.parse(DEFAULT_SENTENCE)

        want_chunks = [
            budou.Chunk(u'今日は', u'NOUN', u'NN', True),
            budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
        ]
        want_html = (u'<span class="ww">今日は</span>'
                     u'<span class="ww">晴れ。</span>')

        self.assertIn('chunks', result,
                      'Processed result should include chunks.')
        self.assertIn('html_code', result,
                      'Processed result should include organized html code.')
        self.assertEqual(want_chunks, result['chunks'],
                         'Processed result should include expected chunks.')
        self.assertEqual(want_html, result['html_code'],
                         'Processed result should include expected html code.')
示例#15
0
    def test_parse_ja(self):
        """Demonstrates standard usage in Japanese."""
        # Bypass the cache so parsing is actually exercised.
        result = self.parser.parse(DEFAULT_SENTENCE_JA,
                                   language='ja',
                                   use_cache=False)

        want_chunks = [
            budou.Chunk(u'今日は', u'NOUN', u'NN', True),
            budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
        ]
        want_html = (u'<span class="ww">今日は</span>'
                     u'<span class="ww">晴れ。</span>')

        self.assertEqual(
            want_chunks, result['chunks'],
            'Processed result should include expected chunks in Japanese.')
        self.assertEqual(
            want_html, result['html_code'],
            'Processed result should include expected html code in Japanese.')
示例#16
0
    def test_parse_ko(self):
        """Demonstrates standard usage in Korean."""
        # The expected chunks show the Korean sentence split at the space,
        # with the space kept as its own SPACE_POS chunk.
        expected_chunks = [
            budou.Chunk(u'오늘은', None, None, True),
            budou.Chunk(' ', budou.SPACE_POS, budou.SPACE_POS, True),
            budou.Chunk(u'맑음.', None, None, True)
        ]

        # The space chunk renders as a literal space between the two spans.
        expected_html_code = (u'<span class="ww">오늘은</span> '
                              u'<span class="ww">맑음.</span>')

        result = self.parser.parse(DEFAULT_SENTENCE_KO,
                                   language='ko',
                                   use_cache=False)

        self.assertEqual(
            expected_chunks, result['chunks'],
            'Processed result should include expected chunks in Korean.')
        self.assertEqual(
            expected_html_code, result['html_code'],
            'Processed result should include expected html code in Korean.')
示例#17
0
    def test_maybe_add_dependency(self):
        # Each case: (label, preset dependency, expected dependency, message).
        # _KEEP means "do not touch the constructor-assigned dependency".
        _KEEP = object()
        cases = [
            (None, _KEEP, None,
             'Dependency should not be added if the chunk does not belong to'
             'dependent labels.'),
            (budou.Chunk.DEPENDENT_LABEL[0], _KEEP, True,
             'Dependency should be added if the chunk belongs to dependent labels.'
             ),
            (budou.Chunk.DEPENDENT_LABEL[0], False, False,
             'Dependency should not be added if the chunk has dependency already.'
             ),
        ]
        for label, preset, want, message in cases:
            chunk = budou.Chunk('foo', label=label)
            if preset is not _KEEP:
                chunk.dependency = preset
            chunk.maybe_add_dependency(True)
            self.assertEqual(want, chunk.dependency, message)
示例#18
0
 def test_add_dependency_if_punct(self):
     # Pair each punctuation mark with the dependency it should receive:
     # opening brackets depend forward (True); closers and sentence
     # punctuation do not (False).
     cases = zip(
         [u'。', u'、', u'「', u'」', u'(', u')', u'[', u']', u'(', u')'],
         [False, False, True, False, True, False, True, False, True, False])
     for character, want in cases:
         # _add_dependency_if_punct is called in __init__ implicitly.
         chunk = budou.Chunk(character, pos='PUNCT')
         self.assertEqual(
             want, chunk.dependency,
             'Punctuation marks should be assigned with proper dependencies.'
         )
示例#19
0
 def test_concatenate_by_label(self):
     source_chunks = [
         budou.Chunk(u'a', None, budou.TARGET_LABEL[0], True),
         budou.Chunk(u'b', None, budou.TARGET_LABEL[1], False),
         budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
     ]
     # Forward direction: 'a' merges into the following 'b'; the merged
     # chunk carries 'b's label and direction flag.
     want_forward = [
         budou.Chunk(u'ab', None, budou.TARGET_LABEL[1], False),
         budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
     ]
     got = self.parser._concatenate_by_label(source_chunks, True)
     self.assertEqual(
         got, want_forward,
         'Forward directional chunks should be concatenated to following '
         'chunks.')
     # Backward direction: 'b' merges into the preceding 'a'; the merged
     # chunk carries 'a's label and direction flag.
     want_backward = [
         budou.Chunk(u'ab', None, budou.TARGET_LABEL[0], True),
         budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
     ]
     got = self.parser._concatenate_by_label(source_chunks, False)
     self.assertEqual(
         got, want_backward,
         'Backward directional chunks should be concatenated to preceding '
         'chunks.')
示例#20
0
    def test_group_chunks_by_entities(self):
        def grouped_words(words, entities):
            # Build a fresh chunk list and return the words after grouping.
            chunk_list = budou.ChunkList([budou.Chunk(w) for w in words])
            result = self.parser._group_chunks_by_entities(chunk_list, entities)
            return [chunk.word for chunk in result]

        # chunks: foo bar baz
        # entity: ___ bar ___
        # An entity aligned exactly with one chunk leaves the list unchanged.
        self.assertEqual(
            ['foo', 'bar', 'baz'],
            grouped_words(['foo', 'bar', 'baz'],
                          [{'beginOffset': 3, 'content': 'bar'}]))

        # chunks: foo bar baz
        # entity: foo ba_ ___
        # An entity spanning a chunk boundary merges the overlapped chunks.
        self.assertEqual(
            ['foobar', 'baz'],
            grouped_words(['foo', 'bar', 'baz'],
                          [{'beginOffset': 0, 'content': 'fooba'}]))
示例#21
0
 def setUp(self):
     # Three chunks covering every dependency state
     # (undetermined / forward / backward).
     self.chunks = budou.ChunkList()
     for word, dep in (('ab', None), ('cde', True), ('fgh', False)):
         self.chunks.append(budou.Chunk(word, dependency=dep))
示例#22
0
 def test_get_source_chunks(self):
     """Checks that the Japanese sentence is tokenized into the expected
     words and dependency flags.
     """
     queue = self.parser._get_source_chunks(DEFAULT_SENTENCE_JA)
     # Expected token stream. dependency=None presumably means "not yet
     # determined" at this stage — TODO confirm against Chunk semantics.
     expected = [
         budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
         budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
         budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
         budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
         budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
         budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
         budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
     ]
     # Only .word and .dependency are compared; label/pos are not asserted.
     self.assertEqual(
         [chunk.word
          for chunk in expected], [chunk.word for chunk in queue.chunks],
         'Words should be match between input text and retrieved chunks.')
     self.assertEqual([chunk.dependency for chunk in expected], [
         chunk.dependency for chunk in queue.chunks
     ], 'Dependency should be match between input text and retrieved chunks.'
                      )
示例#23
0
 def test_get_source_chunks(self):
     """Checks token-to-chunk conversion against a canned annotation payload.

     _get_annotations is stubbed with MagicMock, so no external service is
     called; only the conversion from API-style token dicts (dependencyEdge /
     partOfSpeech / text) into Chunk objects is exercised.
     """
     # Raw token fixtures in the annotation backend's response format.
     tokens = [{
         'dependencyEdge': {
             'headTokenIndex': 1,
             'label': 'NN'
         },
         'partOfSpeech': {
             'tag': 'NOUN'
         },
         'text': {
             'beginOffset': 0,
             'content': u'六本木'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'ADVPHMOD'
         },
         'partOfSpeech': {
             'tag': 'NOUN'
         },
         'text': {
             'beginOffset': 3,
             'content': u'ヒルズ'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 1,
             'label': 'PRT'
         },
         'partOfSpeech': {
             'tag': 'PRT'
         },
         'text': {
             'beginOffset': 6,
             'content': u'で'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'P'
         },
         'partOfSpeech': {
             'tag': 'PUNCT'
         },
         'text': {
             'beginOffset': 7,
             'content': u'、'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 5,
             'label': 'P'
         },
         'partOfSpeech': {
             'tag': 'PUNCT'
         },
         'text': {
             'beginOffset': 8,
             'content': u'「'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'DOBJ'
         },
         'partOfSpeech': {
             'tag': 'NOUN'
         },
         'text': {
             'beginOffset': 9,
             'content': u'ご飯'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 5,
             'label': 'P'
         },
         'partOfSpeech': {
             'tag': 'PUNCT'
         },
         'text': {
             'beginOffset': 11,
             'content': u'」'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 5,
             'label': 'PRT'
         },
         'partOfSpeech': {
             'tag': 'PRT'
         },
         'text': {
             'beginOffset': 12,
             'content': u'を'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'ROOT'
         },
         'partOfSpeech': {
             'tag': 'VERB'
         },
         'text': {
             'beginOffset': 13,
             'content': u'食べ'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'AUX'
         },
         'partOfSpeech': {
             'tag': 'VERB'
         },
         'text': {
             'beginOffset': 15,
             'content': u'ます'
         }
     }, {
         'dependencyEdge': {
             'headTokenIndex': 8,
             'label': 'P'
         },
         'partOfSpeech': {
             'tag': 'PUNCT'
         },
         'text': {
             'beginOffset': 17,
             'content': u'。'
         }
     }]
     # Stub out the API call; only the chunk list (first return value)
     # is inspected below.
     self.parser._get_annotations = MagicMock(return_value=(tokens, None))
     chunks, _, _ = self.parser._get_source_chunks(u'六本木ヒルズで、「ご飯」を食べます。')
     # Expected chunks: word/label/pos copied from the fixture tokens.
     # dependency=None presumably means "not yet determined" at this
     # stage — TODO confirm against Chunk semantics.
     expected = [
         budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
         budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
         budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
         budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
         budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
         budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
         budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
     ]
     # Only .word and .dependency are compared; label/pos are not asserted.
     self.assertEqual(
         [chunk.word
          for chunk in expected], [chunk.word for chunk in chunks],
         'Words should be match between input text and retrieved chunks.')
     self.assertEqual([chunk.dependency for chunk in expected], [
         chunk.dependency for chunk in chunks
     ], 'Dependency should be match between input text and retrieved chunks.'
                      )
示例#24
0
 def reset_queue(self):
     # Build a fresh three-chunk queue for tests that mutate it.
     queue = budou.ChunkQueue()
     for word in ('foo', 'bar', 'baz'):
         queue.add(budou.Chunk(word))
     return queue
示例#25
0
 def test_get_source_chunks(self):
     """Checks chunk retrieval against the 'ja-case1' fixture.

     budou.api.get_annotations is replaced with a MagicMock returning the
     fixture tokens, so no network call is made.
     """
     budou.api.get_annotations = MagicMock(
         return_value=self.cases['ja-case1']['tokens'])
     queue = self.parser._get_source_chunks(
         self.cases['ja-case1']['sentence'])
     # Expected chunks for the fixture sentence. dependency=None presumably
     # means "not yet determined" — TODO confirm against Chunk semantics.
     expected = [
         budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
         budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
         budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
         budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
         budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
         budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
         budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
         budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
         budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
     ]
     # Only .word and .dependency are compared; label/pos are not asserted.
     self.assertEqual(
         [chunk.word
          for chunk in expected], [chunk.word for chunk in queue.chunks],
         'Words should be match between input text and retrieved chunks.')
     self.assertEqual([chunk.dependency for chunk in expected], [
         chunk.dependency for chunk in queue.chunks
     ], 'Dependency should be match between input text and retrieved chunks.'
                      )