def _latinize_internal(text, ascii=False):
    """Transliterate *text* to the Latin script via ICU.

    With ``ascii=True`` the result is additionally forced down to plain
    ASCII (accents and symbols stripped); otherwise the full Latin
    repertoire is kept.

    :param text: the text to transliterate.
    :param ascii: when True, also strip the output down to ASCII.
    :return: the transliterated text.
    """
    if ascii:
        # Cache the transliterator on THIS function. The original cached it
        # on the sibling `latinize_text`, which raises NameError if that
        # function is renamed or absent and silently couples the two.
        if not hasattr(_latinize_internal, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            _latinize_internal._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
            )  # noqa
        return _latinize_internal._ascii.transliterate(text)

    # Lazily build the plain Any-Latin transliterator once per process.
    if not hasattr(_latinize_internal, '_tr'):
        _latinize_internal._tr = Transliterator.createInstance('Any-Latin')
    return _latinize_internal._tr.transliterate(text)
Exemplo n.º 2
0
def main(argv):
    """Convert a Zawgyi-encoded Myanmar text file to standard Unicode.

    Usage: -i <inputfile> [-o <outputfile>]. When only -i is given the
    output file defaults to "converted_" + inputfile.

    :param argv: command-line argument list (without the program name).
    """
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        # Bad flags: print usage and exit with a non-zero status.
        print('test.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('zg-my.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            # Default output name; a later -o flag overrides it.
            outputfile = "converted_" + inputfile
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    print('Input file is ', inputfile)
    print('Output file is ', outputfile)

    # 'Zawgyi-my' maps the legacy Zawgyi font encoding to standard Unicode.
    uni = Transliterator.createInstance('Zawgyi-my')

    # Read/write as UTF-8 text; context managers guarantee the handles
    # are closed even if transliteration fails. (The original wrote
    # `converted.encode('utf8')` — bytes — into a text-mode file, which
    # fails under Python 3.)
    with open(inputfile, "r", encoding="utf-8") as f:
        converted = uni.transliterate(f.read())

    with open(outputfile, "w", encoding="utf-8") as fo:
        fo.write(converted)
Exemplo n.º 3
0
def compose_nfc(text):
    """Compose *text* into Unicode NFC form via an ICU transliterator.

    ``None`` is passed through unchanged. The transliterator instance is
    created lazily and cached on the function for reuse.
    """
    if text is None:
        return None
    transliterator = getattr(compose_nfc, '_tr', None)
    if transliterator is None:
        transliterator = Transliterator.createInstance('Any-NFC')
        compose_nfc._tr = transliterator
    return transliterator.transliterate(text)
Exemplo n.º 4
0
def decompose_nfkd(text):
    """Apply Unicode compatibility decomposition (NFKD) to *text*.

    Non-standard compatibility representations are normalised, and base
    characters are separated from their diacritics into distinct
    codepoints. ``None`` is passed through unchanged; the ICU instance
    is created lazily and cached on the function.
    """
    if text is None:
        return None
    transliterator = getattr(decompose_nfkd, '_tr', None)
    if transliterator is None:
        transliterator = Transliterator.createInstance('Any-NFKD')
        decompose_nfkd._tr = transliterator
    return transliterator.transliterate(text)
Exemplo n.º 5
0
def latinize_text(text, ascii=False):
    """Transliterate the given text to the Latin script.

    Characters from other scripts are mapped to their closest Latin
    equivalents; with ``ascii=True`` the output is further reduced to
    plain ASCII. ``None``, non-string values and empty strings are
    returned unchanged.
    """
    not_transliterable = (
        text is None
        or not isinstance(text, six.string_types)
        or not len(text)
    )
    if not_transliterable:
        return text

    if not ascii:
        if not hasattr(latinize_text, '_tr'):
            latinize_text._tr = Transliterator.createInstance('Any-Latin')
        return latinize_text._tr.transliterate(text)

    if not hasattr(latinize_text, '_ascii'):
        # Transform to latin, separate accents, decompose, remove
        # symbols, compose, push to ASCII
        latinize_text._ascii = Transliterator.createInstance(
            'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
        )  # noqa
    return latinize_text._ascii.transliterate(text)
Exemplo n.º 6
0
def make_transliterator(script):
    """Build a transliteration callable for *script*.

    Prefers the ICU transliterator; when PyICU is not installed, emits
    an ``ICUWarning`` and falls back to ``text_unidecode`` applied after
    NFKC composition.
    """
    try:
        from icu import Transliterator
        return Transliterator.createInstance(script).transliterate
    except ImportError:
        from text_unidecode import unidecode
        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning,
                      stacklevel=4)  # noqa

        def transliterate(text):
            # NFKC-compose first so unidecode sees canonical characters.
            return unidecode(compose_nfkc(text))

        return transliterate
Exemplo n.º 7
0
def make_trans(script: str) -> Callable[[str], Optional[str]]:
    """Return a transliteration function for *script*.

    Uses PyICU when available; otherwise warns with ``ICUWarning`` and
    falls back to ``text_unidecode`` over NFKC-composed input.
    """
    try:
        from icu import Transliterator  # type: ignore

        return cast(Callable[[str], str],
                    Transliterator.createInstance(script).transliterate)
    except ImportError:
        from text_unidecode import unidecode  # type: ignore

        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning,
                      stacklevel=4)  # noqa

        def transliterate(text: str) -> Optional[str]:
            normalised = compose_nfkc(text)
            if normalised is None:
                return None
            return cast(Optional[str], unidecode(normalised))

        return transliterate
    def _getTransliterator(self, name):
        """Create a forward-direction ICU transliterator for *name*."""

        return Transliterator.createInstance(name, UTransDirection.FORWARD)
Exemplo n.º 9
0
# -*- coding: utf-8 -*-
"""
Transliterating text to International Phonetic Alphabet (IPA)
Using International Components for Unicode (ICU)
https://github.com/ovalhub/pyicu
"""
from icu import Transliterator

_ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin")


# Romanize Thai text into the Latin alphabet.
def transliterate(text: str) -> str:
    """
    Use ICU (International Components for Unicode) for transliteration.

    Romanizes Thai text via ICU's "Thai-Latin" rules: takes a ``str`` of
    Thai text and returns a ``str`` of Latin characters.

    :param str text: Thai text to be transliterated.
    :return: A string of International Phonetic Alphabet symbols indicating how the text should be pronounced.
    """
    return _ICU_THAI_TO_LATIN.transliterate(text)
Exemplo n.º 10
0
import os
import re
import six
import yaml
from icu import Transliterator


# Rows handled per page when iterating data — TODO(review): confirm
# against the callers that use this constant.
DATA_PAGE = 10000
# Raw string: a plain '\s' is an invalid escape sequence in modern
# Python (DeprecationWarning, SyntaxWarning since 3.12); the compiled
# pattern — one-or-more whitespace — is unchanged.
WS_PATTERN = re.compile(r'\s+')
# Shared ICU transliterator mapping any script to Latin.
tr = Transliterator.createInstance('Any-Latin')


def resolve_includes(file_path, data):
    """Handle include statements in the configuration file.

    Dicts consume their 'include' key (one path or a list of paths,
    relative to *file_path*'s directory), merge in the included files,
    then resolve every value recursively. Lists, tuples and sets are
    rebuilt as lists of resolved elements. Any other value is returned
    untouched.
    """
    if isinstance(data, dict):
        includes = data.pop('include', [])
        if not isinstance(includes, (list, tuple, set)):
            includes = [includes]
        base_dir = os.path.dirname(file_path)
        for rel_path in includes:
            data.update(load_config_file(os.path.join(base_dir, rel_path)))
        for key in list(data.keys()):
            data[key] = resolve_includes(file_path, data[key])
        return data
    if isinstance(data, (list, tuple, set)):
        return [resolve_includes(file_path, element) for element in data]
    return data


def load_config_file(file_path):
    """Load a YAML (or JSON) model configuration file."""
Exemplo n.º 11
0
    def _getTransliterator(self, name):
        """Return an ICU Transliterator for *name*, forward direction."""

        return Transliterator.createInstance(name, UTransDirection.FORWARD)
Exemplo n.º 12
0
 def _decompose_nfkd(text):
     """Return *text* in Unicode NFKD form (lazily-cached ICU instance)."""
     try:
         transliterator = _decompose_nfkd._tr
     except AttributeError:
         # First call: build the transliterator and cache it on the function.
         transliterator = _decompose_nfkd._tr = Transliterator.createInstance('Any-NFKD')
     return transliterator.transliterate(text)
Exemplo n.º 13
0
 def _compose_nfc(text):
     """Return *text* in Unicode NFC form (lazily-cached ICU instance)."""
     try:
         transliterator = _compose_nfc._tr
     except AttributeError:
         # First call: build the transliterator and cache it on the function.
         transliterator = _compose_nfc._tr = Transliterator.createInstance('Any-NFC')
     return transliterator.transliterate(text)
Exemplo n.º 14
0
# Command-line flags controlling which build steps run.
flags.DEFINE_bool("build_fasttext", False, "build fasttext features")
flags.DEFINE_bool("build_tfrecord", False,
                  "build tensorflow record input files")
flags.DEFINE_integer("nrows", 100, "The TOP number of rows to query")

# Matches any non-word character or any digit (strips punctuation + digits).
prog = re.compile("[\\W\\d]", re.UNICODE)
# Matches non-word characters only (keeps digits).
prog_with_digits = re.compile("[\\W]", re.UNICODE)

# Russian Snowball stemmer; stop words are excluded from stemming.
stemmer = SnowballStemmer("russian", ignore_stopwords=True)

# Extracts decimal or integer number literals (e.g. "1.5", "42").
float_prog = re.compile(r"[-+]?\d*\.\d+|\d+", re.UNICODE)
# Latin 'x', Cyrillic 'х', or '*' — presumably dimension/multiplication
# separators (e.g. "10x20"); verify against callers.
dot_prog = re.compile(r'[xх*]', re.UNICODE)

# Translation table replacing these punctuation characters with spaces.
# NOTE(review): the raw string makes '\' itself one of the mapped keys.
TransTable = str.maketrans(dict.fromkeys(r'~/-\[\]()|{}:^+', ' '))
wt = WordTokenizer()
# ICU transliterator mapping Latin text into Cyrillic.
trans = Transliterator.createInstance('Latin-Cyrillic')
unit_lookup = {
    'г': 'грамм',
    'грам': 'грамм',
    'гр': 'грамм',
    'грамм': 'грамм',
    'gr': 'грамм',
    'ml': 'мл',
    'милл': 'мл',
    'млитр': 'мл',
    'млтр': 'мл',
    'мл': 'мл',
    'ш': 'шт',
    'шт': 'шт',
    'тон': 'тонна',