def get_L_possible_conversions(self, from_, remove_variant=False):
    """
    Get the conversions available for `from_` as a list
    of (matched_iso, to_iso) tuples.

    `from_` is first passed through ISOTools.remove_unneeded_info.
    Each variation of it returned by ISOTools.get_L_removed is then
    looked up in the dict returned by self.get_D_scripts(), and every
    value stored under a matching key contributes one tuple.

    The variations always remove NONE, TERRITORY, LANG and
    TERRITORY|LANG. When `remove_variant` is true, the same four
    combinations with VARIANT also removed are tried afterwards.
    """
    # OPEN ISSUE: Add exceptions for e.g. Latin which have
    # many false positives?
    from_ = ISOTools.remove_unneeded_info(from_)
    DScripts = self.get_D_scripts()

    LRemove = [NONE, TERRITORY, LANG, TERRITORY|LANG]
    if remove_variant:
        LRemove += [
            VARIANT,
            TERRITORY|VARIANT,
            VARIANT|LANG,
            VARIANT|LANG|TERRITORY
        ]

    LRtn = []
    for candidate in ISOTools.get_L_removed(from_, LRemove, rem_dupes=True):
        if candidate in DScripts:
            for target in DScripts[candidate]:
                LRtn.append((candidate, target))
    return LRtn
def group_by_alphabet(self, search, char_indexes=None):
    """
    The headings are actually provided by the CLDR data directly,
    so if using the alphabet key, grab directly from the original
    source!

    `search` is passed to LangData, and also to
    ISOTools.guess_omitted_info to work out which script it uses.
    If `char_indexes` is None, a CharIndexes instance is created
    for this object.

    Returns a flat list in which each heading is emitted as
    ('block', (heading, '')) and is immediately followed by
    ('chars', [code point, ...]). The alphabet groups come first,
    then the symbol groups.
    """
    if char_indexes is None:
        from char_data.CharIndexes import CharIndexes
        char_indexes = CharIndexes(char_data=self)

    lang_data = LangData(search)
    script = ISOTools.split(ISOTools.guess_omitted_info(search)).script

    LRtn = []

    # Alphabet groups: expand each UnicodeSet range string
    # into a flat list of code points
    for heading, ranges_string in lang_data.get_L_alpha():
        LCodePoints = [
            ord(char)
            for member in UnicodeSet(self, char_indexes, ranges_string)
            for char in member
        ]
        LRtn += [
            ('block', (heading, '')),
            ('chars', LCodePoints)
        ]

    # Symbol groups
    for typ1, typ2, LGroups in lang_data.get_L_symbols():
        for heading, chars in LGroups:
            # NOTE: the exact meaning of typ1/typ2 isn't documented
            # here - when present, they're prefixed to the heading
            # in "typ1 typ2 heading" order
            if typ2:
                heading = '%s %s' % (typ2, heading)
            if typ1:
                heading = '%s %s' % (typ1, heading)

            # Ignore Latin perMille etc/the Arabic group etc for
            # languages which don't use the Latin/Arabic scripts
            wrong_script = (
                (heading.startswith("latn") and script != 'Latn') or
                (heading.startswith('arab') and script != 'Arab')
            )
            if wrong_script:
                continue

            LRtn += [
                ('block', (heading, '')),
                ('chars', [ord(char) for char in chars])
            ]

    return LRtn
def __init__(self, path, direction='=>'):
    """
    Load the transliteration definition at `path` and build
    its rules for the requested `direction`.

    `path` is read with read_D_html_ini. Its 'settings' section is
    parsed as JSON (after comments are removed), the optional
    'variables' section supplies the variables substituted into the
    rules, and each line of the 'conversions' section is turned
    into a rule with get_rule.

    `direction` is '=>' to convert from the settings' `from_iso`
    to its `to_iso`, or '<=' to convert the other way.

    Raises AssertionError if `direction` isn't '=>' or '<=', if
    the settings restrict the file to the opposite direction,
    or if the settings have `ignore_me` set.
    """
    # Validate before doing any work: everything below branches on
    # `direction`, so an unknown value would otherwise be silently
    # treated as '<=' when choosing from_iso/to_iso.
    # AssertionError is raised explicitly (rather than using `assert`)
    # so the checks still run under `python -O`.
    if direction not in ('=>', '<='):
        raise AssertionError(
            "direction must be '=>' or '<=', not %r" % (direction,)
        )
    self.direction = direction
    forwards = direction == '=>'

    D = read_D_html_ini(path)
    DConfig = self.DConfig = json.loads(
        self.remove_comments(D['settings'])
    )

    # Reject unusable files before going to the
    # trouble of parsing their variables and rules
    config_direction = DConfig.get('direction')
    if config_direction in ('=>', '<=') and config_direction != direction:
        raise AssertionError(
            "%r only supports direction %r, not %r"
            % (path, config_direction, direction)
        )
    if DConfig.get('ignore_me'):
        raise AssertionError("I've been ignored!")

    self.DVariables = self.get_D_variables(
        self.remove_comments(D.get('variables', ''))
    )

    # HACK HACK HACK!
    DConfig['from_iso'] = ISOTools.remove_unneeded_info(DConfig['from_iso'])
    DConfig['to_iso'] = ISOTools.remove_unneeded_info(DConfig['to_iso'])

    # Get case-related properties
    self.ignore_case = DConfig.get('ignore_case', False)
    self.match_case = DConfig.get('match_case', False)

    # When converting backwards, the from/to ISOs swap places
    self.from_iso = DConfig['from_iso'] if forwards else DConfig['to_iso']
    self.to_iso = DConfig['to_iso'] if forwards else DConfig['from_iso']

    self.LRules = [
        get_rule(
            self.replace_variables(rule.strip()),
            reverse=not forwards,
            ci_conditions=self.ignore_case or self.match_case
        )
        for rule in self.remove_comments(D['conversions']).split('\n')
    ]

    # Get the modifiers to use
    DModifiers = DConfig.get('modifiers', {}).get(
        'from_direction' if forwards else 'to_direction', {}
    )
    self.LFromModifiers = DModifiers.get('before_conversions', {})
    self.LToModifiers = DModifiers.get('after_conversions', {})

    self.DRules = self.get_D_rules()

    # Clean up: these were only needed to build self.DRules
    del self.LRules
    del self.DVariables
def __add_to_two_level_mappings(self):
    """
    Group the ISO codes from get_L_possible_isos() by region, and
    store the result as a sorted list of (region, [iso, ...]) tuples
    under DTwoLevelMappings['cldr_alphabets.alphabets'].

    When the ISO code has a territory, the region is the first item
    of that territory's DCountries entry. Otherwise it is item 1 of
    the language's 'LCountry' entry in ISOCodes. Either falls back
    to 'Unknown' when there is no entry.
    """
    # TODO: ADD TO DTwoLevelMappings!!!
    DByRegion = {}

    for iso in get_L_possible_isos():
        iso_info = ISOTools.split(iso)

        if iso_info.territory:
            from iso_tools.ISOCodes import DCountries
            # TODO: ALLOW FOR i18n etc!!!
            region = DCountries.get(iso_info.territory, ['Unknown'])[0]
        else:
            # OPEN ISSUE: Use LCountry[2] here, to use
            # continent rather than country??
            try:
                region = ISOCodes.get_D_iso(iso_info.lang)['LCountry'][1]
            except KeyError:
                region = 'Unknown'

        if region not in DByRegion:
            DByRegion[region] = []
        DByRegion[region].append(iso)

    # The region names are unique dict keys, so sorting
    # the items only ever compares the names
    LOut = sorted(DByRegion.items())

    from char_data.data_processors.consts import DTwoLevelMappings
    DTwoLevelMappings['cldr_alphabets.alphabets'] = LOut  # HACK!
def iso_convert(iso, variant):
    """
    Build an ISO string from the language code `iso` and the
    font/variant name `variant`.

    `variant` is looked up in DScriptsVariants, first under its own
    name and then under the font DFonts gives for it. The entry is
    either "script" or "script|variant". The font name is stripped
    from the front of the variant, and a variant which is just the
    font name is dropped.

    Returns the result of ISOTools.join. If that fails, a warning is
    printed and a 'FIXME: iso,variant' placeholder string is returned
    instead. An ISO containing '?' is joined as 'und'.
    """
    font = DFonts.get(variant)
    item = DScriptsVariants.get(variant, DScriptsVariants.get(font))

    if not item:
        print(('** SCRIPT WARNING:', iso, variant))
        script = variant = None  # WARNING!
    else:
        if '|' in item:
            script, _, variant = item.partition('|')
        else:
            script = item

        if variant and variant.startswith('%s ' % font):
            # Remove the "<font> " prefix
            variant = variant[len(font) + 1:]
        elif variant and font == variant:
            variant = None

    if iso == 'mol':
        iso = 'ro'  # HACK!

    try:
        return ISOTools.join(
            part3=iso if '?' not in iso else 'und',
            script=script,
            variant=variant or None
        )
    except Exception:
        # Deliberately best-effort, but don't swallow
        # KeyboardInterrupt/SystemExit as a bare "except:" would
        print(('** SCRIPT WARNING 2:', iso, script, variant))
        return 'FIXME: %s,%s' % (iso, variant)
def get_D_engines(self):
    """
    Get a dict of {(from_iso, to_iso): (path, direction), ...} for
    the internal python transliterators in self.DTranslitMappings.

    Mappings whose from or to language is listed (one per line) in
    the translit/ignored_isos.txt data file are left out. If the
    same (from_iso, to_iso) pair occurs more than once, the last
    one wins.
    """
    # HACK: Ignore these (mostly fairly uncommonly used)
    # transliteration systems as they probably have
    # errors/I don't have much time to maintain them
    with open(data_path('translit', 'ignored_isos.txt'), 'r') as f:
        # A set, as membership is tested for every mapping below
        SIgnoredISOs = set(f.read().split('\n'))

    D = {}
    for from_iso, L in self.DTranslitMappings.items():
        # The from language is the same for every item in L,
        # so only check it the once
        if ISOTools.split(from_iso).lang in SIgnoredISOs:
            continue

        for path, to_iso, direction in L:
            if ISOTools.split(to_iso).lang in SIgnoredISOs:
                continue
            D[from_iso, to_iso] = (path, direction)
    return D
def get_D_comb():
    """
    Read the translit_combinations/*.map data files, and return a
    dict of {(first_iso, last_iso): [iso, ...], ...}.

    Each line which isn't blank and doesn't start with "#" is
    parsed with loads() as a list of ISO codes, every one of which
    is checked with ISOTools.verify_iso.

    Raises AssertionError if a line has fewer than two ISO codes, or
    if its (first, last) pair has already been defined.
    """
    D = {}
    for path in glob(data_path('translit_combinations', '*.map')):
        # Text mode with an explicit encoding: the third positional
        # argument of the builtin open() is `buffering`, so passing
        # 'utf-8' there raises a TypeError. Binary mode would also
        # stop the "#" comparison below from ever matching.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip() or line[0] == '#':
                    continue

                L = loads(line)
                for iso in L:
                    ISOTools.verify_iso(iso)

                assert len(L) > 1, \
                    "need at least two ISO codes: %r" % (L,)
                key = (L[0], L[-1])
                assert key not in D, \
                    "combination defined twice: %r" % (key,)
                D[key] = L
    return D
def get_L_best_conversions(self, from_iso, to_iso):
    """
    Get the (conv_from_iso, conv_to_iso) conversions which can be
    used to convert `from_iso` to `to_iso`, best match first.

    Both ISO codes are normalised with
    ISOTools.remove_unneeded_info. Each conversion returned by
    self.get_L_possible_conversions(from_iso, remove_variant=True)
    is kept if its target equals one of the variations of `to_iso`
    returned by ISOTools.get_L_removed.

    Matches are ordered by how many parts the conversion's from ISO
    has (most first), then by its position in the possible
    conversions. Ties are broken the same way on the to ISO side.
    """
    from_iso = ISOTools.remove_unneeded_info(from_iso)
    to_iso = ISOTools.remove_unneeded_info(to_iso)

    # The variations of `to_iso` don't depend on the conversion
    # being checked, so work them out once rather than once for
    # every possible conversion
    LToCandidates = list(ISOTools.get_L_removed(
        to_iso,
        [
            NONE, TERRITORY, LANG, TERRITORY|LANG,
            VARIANT, TERRITORY|VARIANT, VARIANT|LANG,
            VARIANT|LANG|TERRITORY
        ],
        rem_dupes=True
    ))

    LRtn = []
    for xx, (conv_from_iso, conv_to_iso) in enumerate(
        self.get_L_possible_conversions(from_iso, remove_variant=True)
    ):
        for yy, i_to_iso in enumerate(LToCandidates):
            if i_to_iso != conv_to_iso:
                continue

            # Negated, so that ISOs with more parts sort first
            len_diff1 = -sum(
                1 for part in ISOTools.split(conv_from_iso) if part
            )
            len_diff2 = -sum(
                1 for part in ISOTools.split(conv_to_iso) if part
            )

            LRtn.append((
                # Note this prioritizes items which remove the
                # VARIANT last, as there's a chance
                # e.g. there's a variant in the Latin system used
                (len_diff1, xx),
                (len_diff2, yy),
                (conv_from_iso, conv_to_iso)
            ))

    LRtn.sort()
    return [i[-1] for i in LRtn]
def __get_D_engines(self):
    """
    Get a dict of {(from, to): engine, ...}
    for all available transliteration engines.

    The engines in self.LEngines are asked in order for their
    possible conversions. If a (from, to) combination has already
    been assigned to an earlier engine, a warning is issued and the
    earlier engine is kept. Every ISO code in the result is checked
    with ISOTools.verify_iso before returning.
    """
    import warnings

    DEngines = {}
    for engine in self.LEngines:
        for from_iso, to_iso in engine.get_L_possible_conversions():
            if (from_iso, to_iso) not in DEngines:
                DEngines[from_iso, to_iso] = engine
            else:
                warnings.warn(
                    f"Warning: iso combination {from_iso}/"
                    f"{to_iso} has already been assigned"
                )

    for from_, to in DEngines:
        ISOTools.verify_iso(from_)
        ISOTools.verify_iso(to)
    return DEngines
def prettify_lang(self, s, always_show_script=False):
    """
    Get a display name for the ISO string `s`.

    Unless `always_show_script` is set or `s` is in SAlwaysShow,
    `s` is first reduced with ISOTools.remove_unneeded_info. The
    localized parts from self.get_L_pretty are then combined by
    self._locale_pattern, with 'und' used when there's no language
    name.
    """
    keep_as_is = always_show_script or s in SAlwaysShow
    if not keep_as_is:
        from iso_tools.ISOTools import ISOTools
        s = ISOTools.remove_unneeded_info(s)

    pr_lang, pr_script, pr_territory, pr_variant = self.get_L_pretty(s)

    LDetails = [
        pr_script,  # FIXME!
        pr_territory,
        pr_variant
    ]
    return self._locale_pattern(pr_lang or 'und', LDetails)
def get_closest_profile(self, iso, default=KeyError):
    """
    Find the closest profile available, e.g. "ja-Japn_JP"
    doesn't exist, so this can fallback to "ja_JP".

    Returns the first variation of `iso` (as generated by
    ISOTools.get_L_removed, dropping combinations of TERRITORY,
    VARIANT and SCRIPT) which is a key of self.DISOToProfiles.

    If there is none, `default` is returned - or, when `default`
    is left as KeyError, KeyError(iso) is raised.
    """
    self._ensure_isotools()

    LCandidates = ISOTools.get_L_removed(
        iso,
        [
            NONE,
            TERRITORY,
            VARIANT,
            SCRIPT,
            VARIANT | TERRITORY,
            SCRIPT | TERRITORY,
            SCRIPT | VARIANT,
            SCRIPT | TERRITORY | VARIANT
        ],
        rem_dupes=True
    )

    # A private marker, so that any value of a candidate
    # can be told apart from "nothing matched"
    not_found = object()
    match = next(
        (i_iso for i_iso in LCandidates if i_iso in self.DISOToProfiles),
        not_found
    )
    if match is not not_found:
        return match

    if default == KeyError:
        raise KeyError(iso)
    return default
def _mapping_to_iso(self, part3, script=None, variant=None, other=None):
    """
    Join `part3`, `script` and `variant` into an ISO string with
    ISOTools.join, then replace a handful of known results with
    their preferred form (e.g. 'zh_Hani' becomes 'zh').

    If `part3` is empty and `other` is given, other[0] is used as
    the language instead - unless it's empty or 'ben'.
    """
    if other and not part3 and other[0] and other[0] != 'ben':
        part3 = other[0]

    DMap = {
        'zh_Hani': 'zh',
        'Hani': 'zh',
        'ja_Zyyy': 'ja_Hrkt',
        'zh_Bopo|Zhuyin': 'zh_Bopo',
        'Bopo': 'zh_Bopo',
        'zh_Latn': 'zh_Latn|x-Pinyin'
    }
    joined = ISOTools.join(part3, script, variant=variant)
    # Anything without a replacement is passed through unchanged
    return DMap.get(joined, joined)
def get_L_possible_isos():
    """
    Get a list of the ISO strings for the locales which have an
    .xml file in the cldr/main data directory.

    A few files are always skipped (see below). A file whose name
    ISOTools.locale_to_iso can't convert is left out, with a
    warning issued in its place.
    """
    import os
    from warnings import warn
    from lang_data.data_paths import data_path
    from iso_tools.ISOTools import ISOTools

    # TODO: FIX POLYTONIC GREEK!!
    LSkipped = (
        'en_US_POSIX.xml',
        'el_POLYTON.xml',
        'root.xml',
        'ar_001.xml',
        'es_419.xml'
    )

    LRtn = []
    for fnam in os.listdir(data_path('cldr', 'main')):
        if not fnam.endswith('.xml') or fnam in LSkipped:
            continue

        try:
            iso = ISOTools.locale_to_iso(fnam.rpartition('.')[0])
        except Exception:
            # Still best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare "except:" does
            warn("can't make locale into ISO: %s" % fnam)
        else:
            LRtn.append(iso)
    return LRtn
def get_L_pretty(self, s):
    """
    Get the localized names of the language, script,
    territory+variant (if specified) of the ISO string `s`.

    Returns a (language, script, territory, variant) tuple, with
    None in place of each part which `s` doesn't have.
    """
    from iso_tools.ISOTools import ISOTools
    lang, script, territory, variant = ISOTools.split(s)

    pr_lang = pr_script = pr_territory = pr_variant = None

    if lang:
        # Fall back to the name in the ISO code data:
        # the short name if it has one, otherwise the long name
        DISO = ISOCodes.get_D_iso(lang)
        pr_lang = self.get_lang_name(
            lang,
            default=DISO.get('short_name', DISO['long_name'])
        )

    if script:
        pr_script = self.get_script_name(script)

    if territory:
        # Fall back to the first item of the territory's DCountries
        # entry, or the territory code itself if there's no entry
        territory_default = DCountries.get(territory, [territory])[0]
        pr_territory = self.get_territory_name(
            territory, default=territory_default
        )

    if variant:
        pr_variant = self.get_variant_name(variant, default=variant)

    return (pr_lang, pr_script, pr_territory, pr_variant)
from json import load, dump
from iso_tools.ISOTools import ISOTools

# One-off conversion script: reads script_headings.json, a dict of
# {heading: ["script|lang|variant", ...]} where trailing parts may be
# left off, and writes script_headings-2.json with each key replaced
# by the ISO string which ISOTools.join builds from its parts.

# Binary mode is fine for reading, as json.load accepts bytes
with open('script_headings.json', 'rb') as f_in:
    DIn = load(f_in)

DOut = {}
for heading, L in DIn.items():
    for key in L:
        # Pad to exactly three "|"-separated parts. More than
        # three is an error in the data - the previous padding
        # loop would never have finished for such a key.
        LParts = key.split('|')
        if len(LParts) > 3:
            raise ValueError(
                "expected at most two '|' in key: %r" % (key,)
            )
        LParts += [''] * (3 - len(LParts))
        script, lang, variant = LParts

        print(script, lang, variant)
        DOut.setdefault(heading, []).append(
            ISOTools.join(part3=lang, script=script, variant=variant)
        )

# json.dump writes str, so the output file must be opened in text mode
with open('script_headings-2.json', 'w', encoding='utf-8') as f_out:
    dump(DOut, f_out, indent=4)
# lang = D['@type'] # script = D['@scripts'] if '@scripts' in D \ # and not ' ' in D['@scripts'] else script # territory = D['@territories'] if '@territories' in D \ # and not ' ' in D['@territories'] else territory return self.join(lang, script, territory, variant) if __name__ == '__main__': from iso_tools.ISOTools import ISOTools as i from cProfile import run print( i.get_L_removed( 'nl_Latn-NL', [NONE, SCRIPT, TERRITORY, SCRIPT | TERRITORY], #rem_dupes=True )) print(i.guess_omitted_info('hy')) print(i.guess_omitted_info('ko')) print(i.guess_omitted_info('zh')) print(i.guess_omitted_info('en_Latn|MINE!')) print(i.guess_omitted_info('en_Shaw')) #run("for x in xrange(50000): i.guess_omitted_info('ja')") #for x in xrange(5000): # print i.guess_omitted_info('ja')
def get_D_profiles(self):
    """
    Get a map of possible CLDR language profiles, built from the
    names of the .xml files in the cldr/main data directory.

    Returns a (DRtn, DISOToProfiles) tuple:

    * DRtn maps the language part of each file name to a list of
      (script, territory, variant) tuples
    * DISOToProfiles maps ISO strings (plus a few Chinese aliases)
      to the file name of the profile, without the extension
    """
    self._ensure_isotools()

    DRtn = {}
    DISOToProfiles = {}

    for fnam in listdir(data_path('cldr', 'main')):
        if fnam.rpartition('.')[2] != 'xml' or fnam == 'base.xml':
            continue

        fnam = fnam.replace('.xml', '')
        if fnam in ('in', 'iw', 'mo', 'root'):  # HACK!
            continue

        # The first "_"-separated item is the language, and
        # the remaining ones (if any) are told apart by shape
        iso, *LParts = fnam.split('_')
        script = territory = variant = None

        for part in LParts:
            if part.isupper() and len(part) == 2:
                # A two-letter territory, e.g. AU
                territory = part
            elif part.istitle() and len(part) == 4:
                # A four-letter script name, e.g. Latn
                script = part
            else:
                # A variant, e.g. "POLYTON" for polytonic Greek
                variant = part

        # Convert to the standard LanguageLynx ISO string format
        iso_string = ISOTools.join(
            part3=iso,
            script=script,
            territory=territory,
            variant=variant
        )
        assert iso_string not in DISOToProfiles
        DISOToProfiles[iso_string] = fnam

        if iso not in DRtn:
            DRtn[iso] = []
        DRtn[iso].append((script, territory, variant))

    # Chinese is referenced like e.g. "zh_Hans_CN",
    # but "zh_CN" is a common way of referencing
    # the same thing, so alias them
    DISOToProfiles.update({
        'zh-CN': 'zh_Hans_CN',
        'zh-SG': 'zh_Hans_SG',
        # Macau/Hong Kong also have simplified profiles,
        # but I assume they're traditional here
        'zh-HK': 'zh_Hant_HK',
        'zh-MO': 'zh_Hant_MO',  # Macau
        'zh-TW': 'zh_Hant_TW'
    })

    return DRtn, DISOToProfiles