def test_init_charmap_invalid6(self):
    """Check that a malformed charMap key raises InvalidCharMapKeyError.

    The key used here is neither a single Unicode character nor a Unicode
    character range.
    """

    bad_map = {u'a--': u'Hello'}

    with pytest.raises(InvalidCharMapKeyError):
        CharMapper(bad_map)
def test_init_charmap_invalid1(self):
    """Check that a byte-string key in charMap raises a TypeError.
    """

    bad_map = {b'a': 'Hello'}

    with pytest.raises(TypeError):
        CharMapper(bad_map)
def test_init_charpap_invalid7(self):
    """Test that an invalid value type (byte string) for a key that is
    accepted as valid elsewhere in this suite ('--a') in charMap raises a
    TypeError.
    """

    # The key is fine; only the byte-string value should be rejected.
    bad_map = {'--a': b'Hello'}

    with pytest.raises(TypeError):
        CharMapper(bad_map)
def test_mapstring_empty_string(self):
    """Check that map_string returns an empty string when given an empty
    string.
    """

    result = CharMapper(VALID_MAP).map_string('')

    assert result == ''
def test_init_default_not_valid2(self):
    """Check that a byte-string default raises a TypeError.
    """

    invalid_default = b'Hello'

    with pytest.raises(TypeError):
        CharMapper({}, invalid_default)
def test_init_charmap_invalid2(self):
    """Check that a byte-string value paired with a valid key (a single
    Unicode character) in charMap raises a TypeError.
    """

    bad_map = {'a': b'Hello'}

    with pytest.raises(TypeError):
        CharMapper(bad_map)
def test_init_charmap_invalid3(self):
    """Check that a wrongly ordered Unicode character range used as a key
    raises InvalidCharMapKeyError, even though the value (a byte string) is
    also of an invalid type.
    """

    bad_map = {'c-a': b'Hello'}

    with pytest.raises(InvalidCharMapKeyError):
        CharMapper(bad_map)
def test_mapstring_not_unicode(self):
    """Test that a non-unicode string causes the map_string method to raise
    a TypeError.
    """

    # Build the mapper outside the raises block so that a TypeError coming
    # from the constructor cannot make this test pass by accident.
    mapper = CharMapper(VALID_MAP)

    with pytest.raises(TypeError):
        mapper.map_string(b'Hello, world!')
def test_mapstring_none(self):
    """Test that a None value causes the map_string method to raise a
    TypeError.
    """

    # Build the mapper outside the raises block so that a TypeError coming
    # from the constructor cannot make this test pass by accident.
    mapper = CharMapper(VALID_MAP)

    with pytest.raises(TypeError):
        mapper.map_string(None)
def test_init_charmap_invalid4(self):
    """Check that a key that is neither a single Unicode character nor a
    Unicode character range raises InvalidCharMapKeyError, even though the
    value (a byte string) is also of an invalid type.
    """

    bad_map = {'cdsn': b'Hello'}

    with pytest.raises(InvalidCharMapKeyError):
        CharMapper(bad_map)
def test_init_none(self):
    """Check that constructing a CharMapper from None raises a TypeError.
    """

    no_map = None

    with pytest.raises(TypeError):
        CharMapper(no_map)
def test_mapstring_arabic(self):
    """Check that map_string correctly maps an Arabic Unicode string.
    """

    mapped = CharMapper(VALID_MAP).map_string('٠١٢٣٤٥٦٧٨٩')

    assert mapped == '012---++++'
def test_mapstring_english(self):
    """Check that map_string correctly maps an English Unicode string.
    """

    mapped = CharMapper(VALID_MAP).map_string('Hello, world!')

    assert mapped == 'Hu**o, wor*m!'
# Identify No Analysis marker _NOAN_RE = re.compile(u'NOAN') _COPY_FEATS = frozenset([ 'gloss', 'atbtok', 'atbseg', 'd1tok', 'd1seg', 'd2tok', 'd2seg', 'd3tok', 'd3seg' ]) _UNDEFINED_LEX_FEATS = frozenset(['root', 'pattern', 'caphi']) DEFAULT_NORMALIZE_MAP = CharMapper({ u'\u0625': u'\u0627', u'\u0623': u'\u0627', u'\u0622': u'\u0627', u'\u0671': u'\u0627', u'\u0649': u'\u064a', u'\u0629': u'\u0647', u'\u0640': u'' }) """:obj:`~camel_tools.utils.charmap.CharMapper`: The default character map used for normalization by :obj:`CalimaStarAnalyzer`. Removes the tatweel/kashida character and does the following conversions: - 'إ' to 'ا' - 'أ' to 'ا' - 'آ' to 'ا' - 'ٱ' to 'ا' - 'ى' to 'ي' - 'ة' to 'ه'
def test_init_default_valid2(self):
    """Check that a Unicode string default is accepted without raising.
    """

    mapper = CharMapper({}, 'Hello')

    assert mapper
def test_init_not_dict(self):
    """Check that passing a non-dict object (a list) as charMap raises a
    TypeError.
    """

    not_a_mapping = []

    with pytest.raises(TypeError):
        CharMapper(not_a_mapping)
from __future__ import absolute_import import pytest from camel_tools.utils.charmap import CharMapper from camel_tools.utils.transliterate import Transliterator # A mapper that translates lower-case English characters to a lower-case x and # upper-case English characters to an upper-case X. This makes it easy to # predict what the transliteration should be. TEST_MAP = { u'A-Z': u'X', u'a-z': u'x', } TEST_MAPPER = CharMapper(TEST_MAP, None) class TestTransliteratorInit(object): """Test class for Transliterator.__init__. """ def test_init_none_mapper(self): """Test that init raises a TypeError when given a mapper that is None. """ with pytest.raises(TypeError): Transliterator(None) def test_init_invalid_type_mapper(self): """Test that init raises a TypeError when given a mapper that is not a CharMapper instance.
def test_init_charmap_valid3(self):
    """Check that a charMap with a character-range key mapped to an empty
    string is accepted without raising.
    """

    mapper = CharMapper({u'a-f': u''})

    assert mapper
def test_init_charmap_valid5(self):
    """Check that a charMap whose key is '--a' is accepted without raising.
    """

    mapper = CharMapper({'--a': ''})

    assert mapper
def test_init_charmap_valid4(self):
    """Check that a charMap mixing a range key and a single-character key
    (mapped to None), combined with a string default, is accepted without
    raising.
    """

    char_map = {'a-f': '', 'b': None}
    mapper = CharMapper(char_map, 'Hello')

    assert mapper
def test_init_empty_dict(self):
    """Check that constructing a CharMapper from an empty dict does not
    raise.
    """

    mapper = CharMapper({})

    assert mapper
def test_init_charmap_valid2(self):
    """Check that a charMap mapping a single character to None is accepted
    without raising.
    """

    mapper = CharMapper({u'a': None})

    assert mapper
def test_init_dictlike_object(self):
    """Check that constructing a CharMapper from a dict-like object does
    not raise.
    """

    dict_like = AnotherMapping()
    mapper = CharMapper(dict_like)

    assert mapper
def test_init_default_valid1(self):
    """Check that None is accepted as the default without raising.
    """

    mapper = CharMapper({}, None)

    assert mapper
def test_init_default_not_valid1(self):
    """Check that a list passed as the default raises a TypeError.
    """

    invalid_default = []

    with pytest.raises(TypeError):
        CharMapper({}, invalid_default)
"""This module provides functions for normalizing Arabic text. """ import re import unicodedata from camel_tools.utils.charmap import CharMapper _ALEF_NORMALIZE_BW_RE = re.compile(u'[<>{|]') _ALEF_NORMALIZE_SAFEBW_RE = re.compile(u'[IOLM]') _ALEF_NORMALIZE_XMLBW_RE = re.compile(u'[IO{|]') _ALEF_NORMALIZE_HSB_RE = re.compile(u'[\u0102\u00c2\u00c4\u0100]') _ALEF_NORMALIZE_AR_RE = re.compile(u'[\u0625\u0623\u0671\u0622]') _UNICODE_CHAR_FIX = CharMapper({ '\ufdfc': 'ريال', '\ufdfd': 'بسم الله الرحمن الرحيم', }) def normalize_unicode(s, compatibility=True): """Normalize Unicode strings into their canonically composed form or (i.e. characters that can be written as a combination of unicode characters are converted to their single character form). Note: This is essentially a call to :func:`unicodedata.normalize` with form 'NFC' if **compatibility** is False or 'NFKC' if it's True. Args: s (:obj:`str`): The string to be normalized. compatibility (:obj:`bool`, optional): Apply compatibility decomposition. Defaults to True.