def _latinize_internal(text, ascii=False):
    if ascii:
        if not hasattr(latinize_text, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            latinize_text._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII')  # noqa
        return latinize_text._ascii.transliterate(text)

    if not hasattr(latinize_text, '_tr'):
        latinize_text._tr = Transliterator.createInstance('Any-Latin')
    return latinize_text._tr.transliterate(text)
def __init__(self, norm_rules, trans_rules, analysis_rules):
    self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                     norm_rules)
    trans_rules += ";[:Space:]+ > ' '"
    self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                   trans_rules)
    self.search = Transliterator.createFromRules("icu_search",
                                                 norm_rules + trans_rules)

    self.analysis = {name: arules.create(self.to_ascii, arules.config)
                     for name, arules in analysis_rules.items()}
def compose_nfc(text):
    """Perform unicode composition."""
    if text is None:
        return None
    if not hasattr(compose_nfc, '_tr'):
        compose_nfc._tr = Transliterator.createInstance('Any-NFC')
    return compose_nfc._tr.transliterate(text)
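# A minimal sketch of what the compose_nfc() helper above does, assuming
# pyicu is installed: the decomposed sequence "e" + U+0301 (combining
# acute accent) is merged into the single codepoint U+00E9.
decomposed = 'e\u0301'
composed = compose_nfc(decomposed)
assert composed == '\u00e9'                      # 'é' as one codepoint
assert len(decomposed) == 2 and len(composed) == 1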
def test_get_transliteration_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
import getopt
import sys

from icu import Transliterator


def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('zg-my.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('zg-my.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            outputfile = "converted_" + inputfile
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    print('Input file is', inputfile)
    print('Output file is', outputfile)

    # Convert Zawgyi-encoded Burmese text to standard Unicode.
    uni = Transliterator.createInstance('Zawgyi-my')
    with open(inputfile, "r", encoding="utf-8") as f:
        converted = uni.transliterate(f.read())
    with open(outputfile, "w", encoding="utf-8") as fo:
        fo.write(converted)
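# A hedged usage sketch for the converter above, assuming it is saved as
# zg-my.py next to a Zawgyi-encoded input.txt:
#
#     python zg-my.py -i input.txt -o output.txt
#
# The standard entry-point guard that invokes it:
if __name__ == '__main__':
    main(sys.argv[1:])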
def testCustomFunctionality(self):
    # convert a's to b's and b's to c's
    rules = "a > b; b > c;"
    self._checkToken(
        Transliterator.createFromRules("test", rules,
                                       UTransDirection.FORWARD),
        "abacadaba", "bcbcbdbcb")
def testCustomFunctionality2(self):
    # convert a's following a c to b's, and all other a's to d's
    rules = "c { a > b; a > d;"
    self._checkToken(
        Transliterator.createFromRules("test", rules,
                                       UTransDirection.FORWARD),
        "caa", "cbd")
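# A small standalone sketch of ICU's left-context rule syntax used in the
# test above, runnable with pyicu: "c { a > b" rewrites 'a' only when the
# preceding character is 'c'; the bare "a > d" catches every other 'a'.
from icu import Transliterator

ctx = Transliterator.createFromRules("ctx-demo", "c { a > b; a > d;")
# The first 'a' follows 'c' and becomes 'b'; the second 'a' follows the
# freshly written 'b', so it falls through to the default rule and
# becomes 'd'.
print(ctx.transliterate("caa"))  # -> cbd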
def make_analyser(*variants, variant_only=False):
    rules = {'analyzer': 'generic',
             'variants': [{'words': variants}]}
    if variant_only:
        rules['mode'] = 'variant-only'
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    return module.create(trans, config)
def test_no_variants():
    rules = {'analyzer': 'generic'}
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
    proc = module.create(trans, config)

    assert get_normalized_variants(proc, '大德!') == ['dà dé']
def name_analyzer(self):
    """ Create a new analyzer for tokenizing names and queries
        using this tokenizer. Analyzers are context managers and should
        be used accordingly:

        ```
        with tokenizer.name_analyzer() as analyzer:
            analyzer.tokenize()
        ```

        When used outside the with construct, the caller must ensure to
        call the close() function before destructing the analyzer.

        Analyzers are not thread-safe. You need to instantiate one per thread.
    """
    norm = Transliterator.createFromRules("normalizer", self.normalization)
    trans = Transliterator.createFromRules("trans", self.transliteration)

    return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
def decompose_nfkd(text):
    """Perform unicode compatibility decomposition.

    This will replace some non-standard value representations in unicode
    and normalise them, while also separating characters and their
    diacritics into two separate codepoints.
    """
    if text is None:
        return None
    if not hasattr(decompose_nfkd, '_tr'):
        decompose_nfkd._tr = Transliterator.createInstance('Any-NFKD')
    return decompose_nfkd._tr.transliterate(text)
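# A short illustration of compatibility decomposition with the helper
# above (assumes pyicu is installed): the "fi" ligature U+FB01 unfolds
# into plain "fi", and a precomposed "é" splits into "e" + U+0301.
assert decompose_nfkd('\ufb01') == 'fi'
assert decompose_nfkd('\u00e9') == 'e\u0301'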
def latinize_text(text, ascii=False):
    """Transliterate the given text to the latin script.

    This attempts to convert a given text to latin script using the
    closest match of characters vis a vis the original script.
    """
    if text is None or not isinstance(text, six.string_types) or not len(text):
        return text

    if ascii:
        if not hasattr(latinize_text, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            latinize_text._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII')  # noqa
        return latinize_text._ascii.transliterate(text)

    if not hasattr(latinize_text, '_tr'):
        latinize_text._tr = Transliterator.createInstance('Any-Latin')
    return latinize_text._tr.transliterate(text)
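# A hedged usage sketch for latinize_text(), assuming pyicu is available.
# Any-Latin picks a per-script romanization; the ascii=True pipeline
# additionally strips accents and symbols before pushing to ASCII.
print(latinize_text('Москва'))               # -> 'Moskva'
print(latinize_text('Zürich', ascii=True))   # -> 'Zurich'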
def __init__(self, config, phplib_dir, db_connection) -> None:
    self.db_connection = db_connection
    self.config = config
    self.phplib_dir = phplib_dir
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # Compile the regex here to improve performance.
    self.occurence_pattern = re.compile(
        r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
    )
    self.sanity_check_pattern = re.compile(r'^\w+$')
    self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                         self.config.TERM_NORMALIZATION)
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def test_get_search_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def make_transliterator(script):
    try:
        from icu import Transliterator
        inst = Transliterator.createInstance(script)
        return inst.transliterate
    except ImportError:
        from text_unidecode import unidecode
        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning, stacklevel=4)  # noqa

        def transliterate(text):
            text = compose_nfkc(text)
            return unidecode(text)

        return transliterate
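# A hedged usage sketch for make_transliterator(): with pyicu installed
# it returns the native ICU transliterate method; otherwise a
# text_unidecode fallback with the same call shape.
to_latin = make_transliterator('Any-Latin')
print(to_latin('Αθήνα'))  # a romanization such as 'Athḗna'; the exact
                          # form depends on which backend is in use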
def test_transliteration_rules_from_file(self):
    self.write_config("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """)
    transpath = self.project_env.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
    cfgpath.write_text(dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def make_trans(script: str) -> Callable[[str], Optional[str]]:
    try:
        from icu import Transliterator  # type: ignore
        inst = Transliterator.createInstance(script)
        return cast(Callable[[str], str], inst.transliterate)
    except ImportError:
        from text_unidecode import unidecode  # type: ignore
        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning, stacklevel=4)  # noqa

        def transliterate(text: str) -> Optional[str]:
            clean = compose_nfkc(text)
            if clean is None:
                return None
            return cast(Optional[str], unidecode(clean))

        return transliterate
def __init__(self, config, phplib_dir, db_connection) -> None:
    self.db_connection = db_connection
    self.config = config
    self.phplib_dir = phplib_dir
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # Compile the regex here to improve performance.
    self.occurence_pattern = re.compile(
        r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
    )
    self.sanity_check_pattern = re.compile(r'^\w+$')
    self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                         self.config.TERM_NORMALIZATION)
    # This set will contain all existing phrases from the word table which
    # no longer exist on the wiki. It contains tuples with the following
    # format: (normalized_word, class, type, operator).
    self.words_phrases_to_delete = set()
    # This set will contain the phrases which still exist on the wiki.
    # It is used to prevent duplicates by removing them from
    # words_phrases_to_delete only at the end.
    self.words_phrases_still_exist = set()
    # This set will contain all existing place_classtype tables which don't
    # match any special phrase class/type on the wiki.
    self.table_phrases_to_delete = set()
def _compose_nfc(text):
    if not hasattr(_compose_nfc, '_tr'):
        _compose_nfc._tr = Transliterator.createInstance('Any-NFC')
    return _compose_nfc._tr.transliterate(text)
def get_normalized_variants(proc, name):
    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    return proc.get_variants_ascii(norm.transliterate(name).strip())
# -*- coding: utf-8 -*-
"""
Transliterating text to the International Phonetic Alphabet (IPA)
using International Components for Unicode (ICU)

https://github.com/ovalhub/pyicu
"""
from icu import Transliterator

# Transliterate Thai text to the Latin alphabet.
_ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin")


def transliterate(text: str) -> str:
    """
    Use ICU (International Components for Unicode) to transliterate
    Thai text into the Latin alphabet.

    :param str text: Thai text to be transliterated.
    :return: A string of International Phonetic Alphabet symbols
        indicating how the text should be pronounced.
    """
    return _ICU_THAI_TO_LATIN.transliterate(text)
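# A hedged usage sketch for the Thai romanizer above: it prints an ICU
# romanization of the Thai greeting; the exact output (including
# diacritics) varies with the ICU version's Thai-Latin rules.
print(transliterate("สวัสดี"))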
import os
import re

import six
import yaml
from icu import Transliterator

DATA_PAGE = 10000
WS_PATTERN = re.compile(r'\s+')

tr = Transliterator.createInstance('Any-Latin')


def resolve_includes(file_path, data):
    """Handle include statements in the configuration file."""
    if isinstance(data, (list, tuple, set)):
        data = [resolve_includes(file_path, i) for i in data]
    elif isinstance(data, dict):
        include_paths = data.pop('include', [])
        if not isinstance(include_paths, (list, tuple, set)):
            include_paths = [include_paths]
        for include_path in include_paths:
            dir_prefix = os.path.dirname(file_path)
            include_path = os.path.join(dir_prefix, include_path)
            data.update(load_config_file(include_path))
        for key, value in data.items():
            data[key] = resolve_includes(file_path, value)
    return data


def load_config_file(file_path):
    """Load a YAML (or JSON) model configuration file."""
def __init__(self, norm_rules):
    self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                               norm_rules)
def _getTransliterator(self, name):
    return Transliterator.createInstance(name, UTransDirection.FORWARD)
def _decompose_nfkd(text):
    if not hasattr(_decompose_nfkd, '_tr'):
        _decompose_nfkd._tr = Transliterator.createInstance('Any-NFKD')
    return _decompose_nfkd._tr.transliterate(text)
def to_latin(string, locale=locale):
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)
    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > е ;"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The combination ьо occurs only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # Remaining letters
        "а > a ;"  "б > b ;"  "в > v ;"  "г > g ;"
        "д > d ;"  "ж > j ;"  "з > z ;"  "и > i ;"
        "й > y ;"  "к > k ;"  "л > l ;"  "м > m ;"
        "н > n ;"  "о > o ;"  "п > p ;"  "р > r ;"
        "с > s ;"  "т > t ;"  "у > u ;"  "ф > f ;"
        "х > x ;"  "ц > c ;"  "ч > ch ;" "ш > w ;"
        "щ > wh ;"
        # Second pass from the start
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # Second pass from the start
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
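# A small usage sketch for to_latin(), assuming the module-level `locale`
# used as its default argument is defined. 'москва' exercises only the
# plain letter rules, so the result follows directly from them.
print(to_latin('москва'))  # -> moskva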
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
flags.DEFINE_bool("build_fasttext", False, "build fasttext features")
flags.DEFINE_bool("build_tfrecord", False, "build tensorflow record input files")
flags.DEFINE_integer("nrows", 100, "The TOP number of rows to query")

prog = re.compile(r"[\W\d]", re.UNICODE)
prog_with_digits = re.compile(r"[\W]", re.UNICODE)
stemmer = SnowballStemmer("russian", ignore_stopwords=True)
float_prog = re.compile(r"[-+]?\d*\.\d+|\d+", re.UNICODE)
dot_prog = re.compile(r'[xх*]', re.UNICODE)
TransTable = str.maketrans(dict.fromkeys(r'~/-\[\]()|{}:^+', ' '))
wt = WordTokenizer()
trans = Transliterator.createInstance('Latin-Cyrillic')

unit_lookup = {
    'г': 'грамм', 'грам': 'грамм', 'гр': 'грамм', 'грамм': 'грамм',
    'gr': 'грамм', 'ml': 'мл', 'милл': 'мл', 'млитр': 'мл',
    'млтр': 'мл', 'мл': 'мл', 'ш': 'шт', 'шт': 'шт', 'тон': 'тонна',