Пример #1
0
    def test_init_charmap_invalid6(self):
        """Check that CharMapper rejects a charMap key that is neither a
        single Unicode character nor a Unicode character range by raising an
        InvalidCharMapKeyError.
        """

        bad_char_map = {u'a--': u'Hello'}

        with pytest.raises(InvalidCharMapKeyError):
            CharMapper(bad_char_map)
Пример #2
0
    def test_init_charmap_invalid1(self):
        """Check that a charMap whose key is a byte string makes CharMapper
        raise a TypeError.
        """

        bytes_key_map = {b'a': 'Hello'}

        with pytest.raises(TypeError):
            CharMapper(bytes_key_map)
Пример #3
0
    def test_init_charpap_invalid7(self):
        """Test that an invalid value type (byte string) for the key '--a' in
        charMap raises a TypeError.
        """

        # NOTE(review): '--a' is presumably parsed as the character range from
        # '-' to 'a', i.e. a valid key, so the failure exercised here comes
        # from the byte-string value -- confirm against CharMapper.
        # NOTE(review): the method name contains a typo ("charpap"); it is
        # left unchanged so the test id stays stable.
        with pytest.raises(TypeError):
            CharMapper({'--a': b'Hello'})
Пример #4
0
    def test_mapstring_empty_string(self):
        """Check that map_string returns an empty string when it is given an
        empty string.
        """

        result = CharMapper(VALID_MAP).map_string('')

        assert result == ''
Пример #5
0
    def test_init_default_not_valid2(self):
        """Check that passing a byte string as the default argument makes
        CharMapper raise a TypeError.
        """

        bad_default = b'Hello'

        with pytest.raises(TypeError):
            CharMapper({}, bad_default)
Пример #6
0
    def test_init_charmap_invalid2(self):
        """Check that a byte-string value stored under a valid key (a single
        Unicode character) in charMap makes CharMapper raise a TypeError.
        """

        bytes_value_map = {'a': b'Hello'}

        with pytest.raises(TypeError):
            CharMapper(bytes_value_map)
Пример #7
0
    def test_init_charmap_invalid3(self):
        """Check that a byte-string value stored under an invalid key (a
        Unicode character range written in the wrong order) in charMap makes
        CharMapper raise an InvalidCharMapKeyError.
        """

        reversed_range_map = {'c-a': b'Hello'}

        with pytest.raises(InvalidCharMapKeyError):
            CharMapper(reversed_range_map)
Пример #8
0
    def test_mapstring_not_unicode(self):
        """Test that a non-unicode (byte) string causes the map_string method
        to raise a TypeError.
        """

        # Build the mapper outside the pytest.raises context: only the
        # map_string call is expected to raise. Were construction inside the
        # context, a TypeError from CharMapper(VALID_MAP) itself would make
        # this test pass for the wrong reason.
        mapper = CharMapper(VALID_MAP)

        with pytest.raises(TypeError):
            mapper.map_string(b'Hello, world!')
Пример #9
0
    def test_mapstring_none(self):
        """Test that a None value causes the map_string method to raise a
        TypeError.
        """

        # Build the mapper outside the pytest.raises context: only the
        # map_string call is expected to raise. Were construction inside the
        # context, a TypeError from CharMapper(VALID_MAP) itself would make
        # this test pass for the wrong reason.
        mapper = CharMapper(VALID_MAP)

        with pytest.raises(TypeError):
            mapper.map_string(None)
Пример #10
0
    def test_init_charmap_invalid4(self):
        """Check that a byte-string value stored under an invalid key (one
        that is neither a single Unicode character nor a Unicode character
        range) in charMap makes CharMapper raise an InvalidCharMapKeyError.
        """

        bad_key_map = {'cdsn': b'Hello'}

        with pytest.raises(InvalidCharMapKeyError):
            CharMapper(bad_key_map)
Пример #11
0
    def test_init_none(self):
        """Check that constructing a CharMapper from None raises a TypeError.
        """

        missing_char_map = None

        with pytest.raises(TypeError):
            CharMapper(missing_char_map)
Пример #12
0
 def test_mapstring_arabic(self):
     """Check that map_string maps an Arabic Unicode string to the expected
     output.
     """
     arabic_input = '٠١٢٣٤٥٦٧٨٩'
     mapped = CharMapper(VALID_MAP).map_string(arabic_input)

     assert mapped == '012---++++'
Пример #13
0
    def test_mapstring_english(self):
        """Check that map_string maps an English Unicode string to the
        expected output.
        """

        english_input = 'Hello, world!'
        mapped = CharMapper(VALID_MAP).map_string(english_input)

        assert mapped == 'Hu**o, wor*m!'
Пример #14
0
# Pattern matching the literal 'NOAN' (No Analysis) marker.
_NOAN_RE = re.compile(u'NOAN')

# NOTE(review): judging by the name, these are the feature keys that get
# copied over as-is -- confirm against the code that uses this set.
_COPY_FEATS = frozenset([
    'gloss', 'atbtok', 'atbseg', 'd1tok', 'd1seg', 'd2tok', 'd2seg', 'd3tok',
    'd3seg'
])

# NOTE(review): judging by the name, these are lexical features that may be
# left undefined -- confirm against the code that uses this set.
_UNDEFINED_LEX_FEATS = frozenset(['root', 'pattern', 'caphi'])

DEFAULT_NORMALIZE_MAP = CharMapper({
    u'\u0625': u'\u0627',  # 'إ' to 'ا'
    u'\u0623': u'\u0627',  # 'أ' to 'ا'
    u'\u0622': u'\u0627',  # 'آ' to 'ا'
    u'\u0671': u'\u0627',  # 'ٱ' to 'ا'
    u'\u0649': u'\u064a',  # 'ى' to 'ي'
    u'\u0629': u'\u0647',  # 'ة' to 'ه'
    u'\u0640': u''  # tatweel/kashida is mapped to the empty string (removed)
})
""":obj:`~camel_tools.utils.charmap.CharMapper`: The default character map used
for normalization by :obj:`CalimaStarAnalyzer`.

Removes the tatweel/kashida character and does the following conversions:

- 'إ' to 'ا'
- 'أ' to 'ا'
- 'آ' to 'ا'
- 'ٱ' to 'ا'
- 'ى' to 'ي'
- 'ة' to 'ه'
Пример #15
0
    def test_init_default_valid2(self):
        """Check that CharMapper accepts a Unicode string as its default
        argument without raising an Exception.
        """

        mapper = CharMapper({}, 'Hello')

        assert mapper
Пример #16
0
    def test_init_not_dict(self):
        """Check that constructing a CharMapper from a non-dict object (a
        list) raises a TypeError.
        """

        not_a_dict = []

        with pytest.raises(TypeError):
            CharMapper(not_a_dict)
from __future__ import absolute_import

import pytest

from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.transliterate import Transliterator

# Character map used by the tests below: every lower-case English letter
# (range 'a-z') maps to a lower-case 'x' and every upper-case English letter
# (range 'A-Z') maps to an upper-case 'X'. This makes the expected
# transliteration of any test string easy to predict.
TEST_MAP = {
    u'A-Z': u'X',
    u'a-z': u'x',
}
# The second argument (None) is passed as CharMapper's default.
# NOTE(review): presumably this leaves characters outside the map unchanged --
# confirm against CharMapper.
TEST_MAPPER = CharMapper(TEST_MAP, None)


class TestTransliteratorInit(object):
    """Test class for Transliterator.__init__.
    """
    def test_init_none_mapper(self):
        """Check that Transliterator's init raises a TypeError when the mapper
        it is given is None.
        """

        missing_mapper = None

        with pytest.raises(TypeError):
            Transliterator(missing_mapper)

    def test_init_invalid_type_mapper(self):
        """Test that init raises a TypeError when given a mapper that is not a
        CharMapper instance.
Пример #18
0
    def test_init_charmap_valid3(self):
        """Check that CharMapper accepts a valid charMap (a character range
        mapped to an empty string) without raising an Exception.
        """

        mapper = CharMapper({u'a-f': u''})

        assert mapper
Пример #19
0
 def test_init_charmap_valid5(self):
     """Check that CharMapper accepts a valid charMap (the key '--a' mapped
     to an empty string) without raising an Exception.
     """
     mapper = CharMapper({'--a': ''})

     assert mapper
Пример #20
0
    def test_init_charmap_valid4(self):
        """Check that CharMapper accepts a valid charMap together with a
        string default without raising an Exception.
        """

        char_map = {'a-f': '', 'b': None}
        mapper = CharMapper(char_map, 'Hello')

        assert mapper
Пример #21
0
    def test_init_empty_dict(self):
        """Check that constructing a CharMapper from an empty dict doesn't
        raise an exception.
        """

        mapper = CharMapper({})

        assert mapper
Пример #22
0
    def test_init_charmap_valid2(self):
        """Check that CharMapper accepts a valid charMap (a single character
        mapped to None) without raising an Exception.
        """

        mapper = CharMapper({u'a': None})

        assert mapper
Пример #23
0
    def test_init_dictlike_object(self):
        """Check that constructing a CharMapper from a dict-like object
        doesn't raise an exception.
        """

        dict_like = AnotherMapping()
        mapper = CharMapper(dict_like)

        assert mapper
Пример #24
0
    def test_init_default_valid1(self):
        """Check that CharMapper accepts None as its default argument without
        raising an Exception.
        """

        mapper = CharMapper({}, None)

        assert mapper
Пример #25
0
    def test_init_default_not_valid1(self):
        """Check that passing a list as the default argument makes CharMapper
        raise a TypeError.
        """

        bad_default = []

        with pytest.raises(TypeError):
            CharMapper({}, bad_default)
Пример #26
0
"""This module provides functions for normalizing Arabic text.
"""

import re
import unicodedata

from camel_tools.utils.charmap import CharMapper

# Single-character classes, one per encoding scheme.
# NOTE(review): judging by the names, each class matches the Alef variants to
# be normalized in one scheme -- presumably Buckwalter (BW), Safe Buckwalter
# (SAFEBW), XML Buckwalter (XMLBW), Habash-Soudi-Buckwalter (HSB) and Arabic
# script (AR). Confirm against the functions that use these patterns.
_ALEF_NORMALIZE_BW_RE = re.compile(u'[<>{|]')
_ALEF_NORMALIZE_SAFEBW_RE = re.compile(u'[IOLM]')
_ALEF_NORMALIZE_XMLBW_RE = re.compile(u'[IO{|]')
_ALEF_NORMALIZE_HSB_RE = re.compile(u'[\u0102\u00c2\u00c4\u0100]')
_ALEF_NORMALIZE_AR_RE = re.compile(u'[\u0625\u0623\u0671\u0622]')

# Maps two single-code-point characters to multi-character Arabic text.
_UNICODE_CHAR_FIX = CharMapper({
    '\ufdfc': 'ريال',  # U+FDFC (rial sign) -> the word spelled out
    '\ufdfd': 'بسم الله الرحمن الرحيم',  # U+FDFD (bismillah ligature) -> phrase
})


def normalize_unicode(s, compatibility=True):
    """Normalize Unicode strings into their canonically composed form or
    (i.e. characters that can be written as a combination of unicode characters
    are converted to their single character form).

    Note: This is essentially a call to :func:`unicodedata.normalize` with
    form 'NFC' if **compatibility** is False or 'NFKC' if it's True.

    Args:
        s (:obj:`str`): The string to be normalized.
        compatibility (:obj:`bool`, optional): Apply compatibility
            decomposition. Defaults to True.