示例#1
0
    def test_init_default_valid2(self):
        """Check that a Unicode string is accepted as the default value
        without an exception being raised.
        """

        default = 'Hello'
        mapper = CharMapper({}, default)
        assert mapper
示例#2
0
    def test_init_default_not_valid1(self):
        """Check that passing a list as the default raises a TypeError.
        """

        invalid_default = []
        with pytest.raises(TypeError):
            CharMapper({}, invalid_default)
示例#3
0
    def test_init_default_valid1(self):
        """Check that None is accepted as the default value without an
        exception being raised.
        """

        default = None
        mapper = CharMapper({}, default)
        assert mapper
示例#4
0
    def test_builtinmapper_xmlbw2hsb(self):
        """Check that loading the builtin 'xmlbw2hsb' scheme succeeds.
        """

        scheme = 'xmlbw2hsb'
        mapper = CharMapper.builtin_mapper(scheme)
        assert mapper
示例#5
0
 def __init__(self) -> None:
     """Initialise the parent class, clear ``sec_cleaner`` and load the
     builtin 'arclean', 'ar2bw' and 'bw2ar' character mappers.
     """
     super().__init__()
     self.sec_cleaner = None

     # Builtin mappers are loaded once per instance, in the same order as
     # before, through a short local alias.
     load_builtin = CharMapper.builtin_mapper
     self.clean_mapper = load_builtin('arclean')
     self.ar2bw_mapper = load_builtin('ar2bw')
     self.bw2ar_mapper = load_builtin('bw2ar')
示例#6
0
    def test_mapstring_english(self):
        """Check that map_string maps an English unicode string to the
        expected output when using VALID_MAP.
        """

        source = 'Hello, world!'
        expected = 'Hu**o, wor*m!'
        mapped = CharMapper(VALID_MAP).map_string(source)
        assert mapped == expected
示例#7
0
    def test_builtinmapper_bw2ar(self):
        """Check that loading the builtin 'bw2ar' scheme succeeds.
        """

        scheme = 'bw2ar'
        mapper = CharMapper.builtin_mapper(scheme)
        assert mapper
示例#8
0
    def test_init_charmap_valid3(self):
        """Check that a charMap whose range key maps to an empty string is
        accepted without an exception being raised.
        """

        charmap = {u'a-f': u''}
        mapper = CharMapper(charmap)
        assert mapper
示例#9
0
    def test_builtinmapper_arclean(self):
        """Check that loading the builtin 'arclean' scheme succeeds.
        """

        scheme = 'arclean'
        mapper = CharMapper.builtin_mapper(scheme)
        assert mapper
示例#10
0
def main():  # pragma: no cover
    """Run the transliteration command-line tool.

    Parses the command line with docopt and then either prints the builtin
    schemes (``--list``) or transliterates the input line by line with the
    builtin scheme named by ``--scheme``.

    This function never returns normally: every path ends in ``sys.exit``,
    with status 0 on success and status 1 after a reported error or a
    keyboard interrupt.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{}   {}".format(scheme[0].ljust(20), scheme[1]))
            sys.exit(0)

        if arguments['--scheme'] is not None:
            if arguments['--scheme'] not in [s[0] for s in _BUILTIN_SCHEMES]:
                sys.stderr.write('Error: {} is not a valid scheme.\n'
                                 'Run `camel_transliterate -l` to see the list'
                                 ' of available schemes.'
                                 '\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            if arguments['--marker'] is None:
                marker = '@@IGNORE@@'
            else:
                marker = arguments['--marker']

            ignore_markers = arguments['--ignore-markers']
            strip_markers = arguments['--strip-markers']

            # Open files (or just use stdin and stdout)
            fin, fout = _open_files(arguments['FILE'], arguments['--output'])

            # Everything that uses the opened files runs inside this
            # try/finally so they are closed even when an error path below
            # leaves through sys.exit(1).
            try:
                # Load the CharMapper and initialize a Transliterator with it
                try:
                    mapper = CharMapper.builtin_mapper(arguments['--scheme'])
                    trans = Transliterator(mapper, marker)
                except Exception:  # pylint: disable=W0703
                    sys.stderr.write('Error: Could not load builtin scheme'
                                     ' {}.\n'.format(
                                         repr(arguments['--scheme'])))
                    sys.exit(1)

                # Transliterate lines
                try:
                    for line in fin:
                        line = force_unicode(line)
                        result = trans.transliterate(line, strip_markers,
                                                     ignore_markers)

                        # Outside Python 3 the result goes through
                        # force_encoding before being written.
                        if not six.PY3:
                            result = force_encoding(result)
                        fout.write(result)
                    fout.flush()

                # If everything worked so far, this shouldn't happen
                except Exception:  # pylint: disable=W0703
                    sys.stderr.write('Error: An unkown error occured during '
                                     'transliteration.\n')
                    sys.exit(1)
            finally:
                # Cleanup: only close streams opened from a given path
                if arguments['FILE'] is not None:
                    fin.close()
                if arguments['--output'] is not None:
                    fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
示例#11
0
    def test_init_charmap_valid2(self):
        """Check that a charMap whose value is None is accepted without an
        exception being raised.
        """

        charmap = {u'a': None}
        mapper = CharMapper(charmap)
        assert mapper
示例#12
0
# Pattern matching the literal 'NOAN' (No Analysis) marker
_NOAN_RE = re.compile(u'NOAN')

_COPY_FEATS = frozenset((
    'gloss',
    'atbtok', 'atbseg',
    'd1tok', 'd1seg',
    'd2tok', 'd2seg',
    'd3tok', 'd3seg',
))

_UNDEFINED_LEX_FEATS = frozenset(('root', 'pattern', 'caphi'))

DEFAULT_NORMALIZE_MAP = CharMapper({
    u'\u0625': u'\u0627',  # alef with hamza below -> bare alef
    u'\u0623': u'\u0627',  # alef with hamza above -> bare alef
    u'\u0622': u'\u0627',  # alef with madda above -> bare alef
    u'\u0671': u'\u0627',  # alef wasla -> bare alef
    u'\u0649': u'\u064a',  # alef maksura -> yeh
    u'\u0629': u'\u0647',  # teh marbuta -> heh
    u'\u0640': u''  # tatweel/kashida -> empty string
})
""":obj:`~camel_tools.utils.charmap.CharMapper`: The default character map used
for normalization by :obj:`CalimaStarAnalyzer`.

Removes the tatweel/kashida character and does the following conversions:

- 'إ' to 'ا'
- 'أ' to 'ا'
- 'آ' to 'ا'
- 'ٱ' to 'ا'
- 'ى' to 'ي'
- 'ة' to 'ه'
from __future__ import absolute_import

import pytest

from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.transliterate import Transliterator

# A mapper that translates lower-case English characters to a lower-case x and
# upper-case English characters to an upper-case X. This makes it easy to
# predict what the transliteration should be.
TEST_MAP = {
    u'A-Z': u'X',
    u'a-z': u'x',
}
# Shared mapper built from TEST_MAP. None is passed as the mapper's default
# argument (presumably leaving unmapped characters unchanged -- confirm against
# the CharMapper documentation).
TEST_MAPPER = CharMapper(TEST_MAP, None)


class TestTransliteratorInit(object):
    """Test class for Transliterator.__init__.
    """
    def test_init_none_mapper(self):
        """Check that constructing a Transliterator with a None mapper raises
        a TypeError.
        """

        missing_mapper = None
        with pytest.raises(TypeError):
            Transliterator(missing_mapper)

    def test_init_invalid_type_mapper(self):
        """Test that init raises a TypeError when given a mapper that is not a
        CharMapper instance.
示例#14
0
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import subprocess
import sys
import os
import argparse
import time

from helpers.preprocess import preprocess
from helpers.tag import tag
from ai.tests.mle import train_mle, predict_mle
from camel_tools.utils.charmap import CharMapper

# Builtin 'ar2bw' character mapper, loaded once when the module is imported.
ar2bw = CharMapper.builtin_mapper('ar2bw')


def is_bool(s):
    """Return False only when ``str(s)`` is exactly ``'False'``.

    Every other value, including ``'false'``, ``0`` and the empty string,
    yields True.
    """
    text = str(s)
    if text == 'False':
        return False
    return True


# Command-line argument parser for this script.
parser = argparse.ArgumentParser(
    description=(
        'This program rewrites (transliterates) from one language script to another'
    ),
)

# --model_name can take values "mle", "word2word", "line2line", or "hybrid"
parser.add_argument('--model_name',
                    action="store",
                    dest='model_name',
示例#15
0
    def test_init_charmap_valid4(self):
        """Check that a charMap mixing an empty-string value and a None
        value, together with a string default, is accepted without an
        exception being raised.
        """

        charmap = {'a-f': '', 'b': None}
        default = 'Hello'
        mapper = CharMapper(charmap, default)
        assert mapper
示例#16
0
    def test_init_none(self):
        """Check that passing None as the charMap raises a TypeError.
        """

        missing_charmap = None
        with pytest.raises(TypeError):
            CharMapper(missing_charmap)
示例#17
0
 def test_init_charmap_valid5(self):
     """Check that a charMap with the key '--a' is accepted without an
     exception being raised.
     """
     charmap = {'--a': ''}
     mapper = CharMapper(charmap)
     assert mapper
示例#18
0
    def test_init_empty_dict(self):
        """Check that an empty dict is accepted as the charMap without an
        exception being raised.
        """

        empty_charmap = {}
        mapper = CharMapper(empty_charmap)
        assert mapper
示例#19
0
 def test_mapstring_arabic(self):
     """Check that map_string maps an Arabic unicode string to the expected
     output when using VALID_MAP.
     """
     source = '٠١٢٣٤٥٦٧٨٩'
     expected = '012---++++'
     mapped = CharMapper(VALID_MAP).map_string(source)
     assert mapped == expected
示例#20
0
    def test_init_dictlike_object(self):
        """Check that a dict-like object (an AnotherMapping instance) is
        accepted as the charMap without an exception being raised.
        """

        dictlike = AnotherMapping()
        mapper = CharMapper(dictlike)
        assert mapper
示例#21
0
    def test_builtinmapper_safebw2bw(self):
        """Check that loading the builtin 'safebw2bw' scheme succeeds.
        """

        scheme = 'safebw2bw'
        mapper = CharMapper.builtin_mapper(scheme)
        assert mapper
示例#22
0
    def test_init_not_dict(self):
        """Check that passing a list instead of a dict as the charMap raises
        a TypeError.
        """

        not_a_dict = []
        with pytest.raises(TypeError):
            CharMapper(not_a_dict)
示例#23
0
    def test_builtinmapper_hsb2xmlbw(self):
        """Check that loading the builtin 'hsb2xmlbw' scheme succeeds.
        """

        scheme = 'hsb2xmlbw'
        mapper = CharMapper.builtin_mapper(scheme)
        assert mapper
示例#24
0
"""This module provides functions for normalizing Arabic text.
"""

import re
import unicodedata

from camel_tools.utils.charmap import CharMapper

# Single-character classes used for Alef normalization, one per encoding.
# NOTE(review): the suffixes presumably stand for Buckwalter, Safe Buckwalter,
# XML Buckwalter, Habash-Soudi-Buckwalter and Arabic script -- confirm against
# the functions that use these patterns.
_ALEF_NORMALIZE_BW_RE = re.compile(u'[<>{|]')
_ALEF_NORMALIZE_SAFEBW_RE = re.compile(u'[IOLM]')
_ALEF_NORMALIZE_XMLBW_RE = re.compile(u'[IO{|]')
_ALEF_NORMALIZE_HSB_RE = re.compile(u'[\u0102\u00c2\u00c4\u0100]')
_ALEF_NORMALIZE_AR_RE = re.compile(u'[\u0625\u0623\u0671\u0622]')

# Maps two single-codepoint Arabic ligatures to spelled-out text:
# U+FDFC (rial sign) and U+FDFD (bismillah ligature).
_UNICODE_CHAR_FIX = CharMapper({
    '\ufdfc': 'ريال',
    '\ufdfd': 'بسم الله الرحمن الرحيم',
})


def normalize_unicode(s, compatibility=True):
    """Normalize Unicode strings into their canonically composed form or
    (i.e. characters that can be written as a combination of unicode characters
    are converted to their single character form).

    Note: This is essentially a call to :func:`unicodedata.normalize` with
    form 'NFC' if **compatibility** is False or 'NFKC' if it's True.

    Args:
        s (:obj:`str`): The string to be normalized.
        compatibility (:obj:`bool`, optional): Apply compatibility
            decomposition. Defaults to True.