示例#1
0
    def get_L_possible_conversions(self, from_, remove_variant=False):
        """
        List the (source, target) pairs which `from_` could be converted with.

        `from_` is normalised with `ISOTools.remove_unneeded_info`, then
        reduced with `ISOTools.get_L_removed` using each combination of
        the NONE/TERRITORY/LANG flags (plus the VARIANT combinations when
        `remove_variant` is set). Every reduced form which is a key of
        `self.get_D_scripts()` gives one tuple per value stored under it.

        :param from_: the ISO string to convert from
        :param remove_variant: also try the flag combinations which
                               include VARIANT
        :return: a list of `(reduced_form, target)` tuples, in the order
                 the reduced forms come back from `get_L_removed`
        """
        # OPEN ISSUE: Add exceptions for e.g. Latin which have
        #             many false positives?
        from_ = ISOTools.remove_unneeded_info(from_)
        script_map = self.get_D_scripts()

        removal_flags = [NONE, TERRITORY, LANG, TERRITORY|LANG]
        if remove_variant:
            removal_flags += [
                VARIANT,
                TERRITORY|VARIANT,
                VARIANT|LANG,
                VARIANT|LANG|TERRITORY,
            ]

        pairs = []
        for candidate in ISOTools.get_L_removed(
            from_, removal_flags, rem_dupes=True
        ):
            if candidate not in script_map:
                continue
            for target in script_map[candidate]:
                pairs.append((candidate, target))
        return pairs
示例#2
0
    def group_by_alphabet(self, search, char_indexes=None):
        """
        Build the alphabet and symbol groupings for the locale `search`.

        The headings are actually provided by the CLDR data directly,
        so if using the alphabet key, grab directly from the original source!

        :param search: the locale/ISO string handed to `LangData`
        :param char_indexes: a `CharIndexes` instance to use; one is
                             created for `self` when omitted
        :return: a flat list which alternates `('block', (heading, ''))`
                 and `('chars', [codepoint, ...])` entries
        """
        if char_indexes is None:
            from char_data.CharIndexes import CharIndexes
            char_indexes = CharIndexes(char_data=self)

        lang_data = LangData(search)
        script = ISOTools.split(ISOTools.guess_omitted_info(search)).script

        groups = []

        def add_group(heading, codepoints):
            # Each group is a heading entry followed by its characters
            groups.append(('block', (heading, '')))
            groups.append(('chars', codepoints))

        # Alphabet headings, each with the characters in its ranges
        for heading, ranges_string in lang_data.get_L_alpha():
            add_group(heading, [
                ord(char)
                for chunk in UnicodeSet(self, char_indexes, ranges_string)
                for char in chunk
            ])

        # Symbol groups; typ2 is prefixed first so typ1 ends up outermost
        for typ1, typ2, entries in lang_data.get_L_symbols():
            for heading, chars in entries:
                for prefix in (typ2, typ1):
                    if prefix:
                        heading = '%s %s' % (prefix, heading)

                # Ignore Latin perMille etc. for languages which don't
                # use the Latin script, and the Arabic group etc. for
                # languages which don't use the Arabic script
                foreign_latin = heading.startswith("latn") and script != 'Latn'
                foreign_arabic = heading.startswith('arab') and script != 'Arab'
                if foreign_latin or foreign_arabic:
                    continue

                add_group(heading, [ord(char) for char in chars])

        return groups
示例#3
0
    def __init__(self, path, direction='=>'):
        """
        Load the conversion rules stored in the file at `path`.

        :param path: the file read by `read_D_html_ini`; its 'settings'
                     and 'conversions' sections are required, while the
                     'variables' section is optional
        :param direction: '=>' to convert from the settings' `from_iso`
                          to its `to_iso`, or '<=' for the reverse
        :raises AssertionError: if `direction` isn't '=>' or '<=', if it
                                contradicts the 'direction' setting, or if
                                the settings have 'ignore_me' set
        """
        self.direction = direction
        forwards = direction == '=>'
        backwards = direction == '<='

        sections = read_D_html_ini(path)
        settings = self.DConfig = json.loads(
            self.remove_comments(sections['settings'])
        )
        self.DVariables = self.get_D_variables(
            self.remove_comments(sections.get('variables', ''))
        )

        # HACK HACK HACK!
        for iso_key in ('from_iso', 'to_iso'):
            settings[iso_key] = ISOTools.remove_unneeded_info(
                settings[iso_key])

        # Get case-related properties
        self.ignore_case = settings.get('ignore_case', False)
        self.match_case = settings.get('match_case', False)

        # The source/target ISOs are swapped unless converting forwards
        if forwards:
            self.from_iso = settings['from_iso']
            self.to_iso = settings['to_iso']
        else:
            self.from_iso = settings['to_iso']
            self.to_iso = settings['from_iso']

        # Parse one rule per line of the 'conversions' section
        case_insensitive = self.ignore_case or self.match_case
        conversions = self.remove_comments(sections['conversions'])
        rules = []
        for line in conversions.split('\n'):
            rules.append(get_rule(
                self.replace_variables(line.strip()),
                reverse=backwards,
                ci_conditions=case_insensitive,
            ))
        self.LRules = rules

        # Do various checks
        assert direction in ('=>', '<=')
        allowed_direction = settings.get('direction')
        assert allowed_direction != '=>' or not backwards
        assert allowed_direction != '<=' or not forwards
        assert not settings.get('ignore_me'), "I've been ignored!"

        # Get the modifiers to use
        modifiers_key = 'from_direction' if forwards else 'to_direction'
        modifiers = settings.get('modifiers', {}).get(modifiers_key, {})
        self.LFromModifiers = modifiers.get('before_conversions', {})
        self.LToModifiers = modifiers.get('after_conversions', {})

        self.DRules = self.get_D_rules()

        # Clean up (presumably these are only needed by get_D_rules())
        del self.LRules, self.DVariables
示例#4
0
    def __add_to_two_level_mappings(self):
        """
        Register the possible ISOs, grouped by region, as a two-level mapping.

        Every ISO from `get_L_possible_isos()` is filed under a region:
        the first item of its territory's `DCountries` entry when it has
        a territory, otherwise item 1 of its language's 'LCountry' data.
        'Unknown' is used when either lookup has no entry. The groups,
        sorted by region, are stored under the
        'cldr_alphabets.alphabets' key of `DTwoLevelMappings`.
        """
        # TODO: ADD TO DTwoLevelMappings!!!
        isos_by_region = {}

        for iso in get_L_possible_isos():
            parts = ISOTools.split(iso)

            if not parts.territory:
                lang = parts.lang
                # OPEN ISSUE: Use LCountry[2] here, to use continent rather than country??
                try:
                    region = ISOCodes.get_D_iso(lang)['LCountry'][1]
                except KeyError:
                    region = 'Unknown'
            else:
                from iso_tools.ISOCodes import DCountries
                # TODO: ALLOW FOR i18n etc!!!
                region = DCountries.get(parts.territory, ['Unknown'])[0]

            isos_by_region.setdefault(region, []).append(iso)

        # A list of (region, [iso, ...]) tuples, ordered by region
        grouped = sorted(isos_by_region.items())

        from char_data.data_processors.consts import DTwoLevelMappings
        DTwoLevelMappings['cldr_alphabets.alphabets'] = grouped  # HACK!
示例#5
0
def iso_convert(iso, variant):
    """
    Convert a language code and a script/font variant name to an ISO string.

    The variant is looked up in `DScriptsVariants`, first under its own
    name and then under the font which `DFonts` maps it to. The entry
    found is either a plain script, or "script|variant".

    :param iso: the language code; 'mol' is treated as 'ro', and a code
                which contains '?' is replaced with 'und'
    :param variant: the variant name to look up
    :return: the result of `ISOTools.join`, or a "FIXME: ..." placeholder
             string if the join raised an exception
    """
    font = DFonts.get(variant)
    item = DScriptsVariants.get(variant, DScriptsVariants.get(font))

    if not item:
        print(('** SCRIPT WARNING:', iso, variant))
        script = variant = None  # WARNING!
    elif '|' in item:
        script, _, variant = item.partition('|')
    else:
        # Only a script is given, so the variant passed in is kept
        script = item

    # Strip a leading "<font> " from the variant. The font may be None
    # (the variant isn't in DFonts), in which case there's nothing to strip
    if variant and font is not None and variant.startswith('%s ' % font):
        variant = variant[len(font) + 1:]
    elif variant and font == variant:
        variant = None

    if iso == 'mol':
        iso = 'ro'  # HACK!

    try:
        return ISOTools.join(  #ISOTools.remove_unneeded_info
            part3='und' if '?' in iso else iso,
            script=script,
            variant=variant or None)
    except Exception:
        # Best-effort: report the failure and return a placeholder, but
        # let KeyboardInterrupt/SystemExit propagate
        print(('** SCRIPT WARNING 2:', iso, script, variant))
        return 'FIXME: %s,%s' % (iso, variant)
示例#6
0
    def get_D_engines(self):
        """
        Map each usable (from_iso, to_iso) pair to its (path, direction).

        The pairs come from `self.DTranslitMappings`. Any pair whose
        source or target language is listed in translit/ignored_isos.txt
        is left out.

        :return: a dict of {(from_iso, to_iso): (path, direction), ...}
        """
        # Add internal python transliterators
        with open(data_path('translit', 'ignored_isos.txt'), 'r') as f:
            # HACK: Ignore these (mostly fairly uncommonly used) transliteration systems
            # as they probably have errors/I don't have much time to maintain them
            # (a set, as it's only used for membership tests below)
            SIgnoredISOs = set(f.read().split('\n'))

        D = {}
        for from_iso, L in self.DTranslitMappings.items():
            # The source language is the same for every entry in L,
            # so it only needs to be checked once
            if ISOTools.split(from_iso).lang in SIgnoredISOs:
                continue

            for path, to_iso, direction in L:
                if ISOTools.split(to_iso).lang in SIgnoredISOs:
                    continue

                D[from_iso, to_iso] = (path, direction)
        return D
示例#7
0
def get_D_comb():
    """
    Read the transliteration combinations from the *.map data files.

    Every line which isn't blank and doesn't start with '#' is parsed
    with `loads` into a sequence of ISO codes, each of which is checked
    with `ISOTools.verify_iso`.

    :return: a dict of {(first_iso, last_iso): [first_iso, ..., last_iso]}
    :raises AssertionError: if a line has fewer than two items, or its
                            (first, last) pair has already been defined
    """
    D = {}

    for path in glob(data_path('translit_combinations', '*.map')):
        # The encoding must be passed by keyword: the third positional
        # argument of the builtin open() is the buffering size
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip() or line.startswith('#'):
                    continue

                L = loads(line)
                for iso in L:
                    ISOTools.verify_iso(iso)

                assert len(L) > 1, line
                assert (L[0], L[-1]) not in D, line

                D[L[0], L[-1]] = L

    return D
示例#8
0
    def get_L_best_conversions(self, from_iso, to_iso):
        """
        Rank the conversions which can take `from_iso` to `to_iso`.

        Each pair from `self.get_L_possible_conversions` is kept if its
        target equals one of the reduced forms of `to_iso`. Pairs whose
        source has more non-empty parts sort first, then pairs found
        earlier, and likewise for the target.

        :param from_iso: the ISO string to convert from
        :param to_iso: the ISO string to convert to
        :return: a list of `(conv_from_iso, conv_to_iso)` tuples,
                 best match first
        """
        def count_parts(iso):
            # The number of non-empty parts the ISO string splits into
            return sum(1 for part in ISOTools.split(iso) if part)

        from_iso = ISOTools.remove_unneeded_info(from_iso)
        to_iso = ISOTools.remove_unneeded_info(to_iso)

        ranked = []
        possible = self.get_L_possible_conversions(
            from_iso, remove_variant=True
        )
        for from_rank, (conv_from_iso, conv_to_iso) in enumerate(possible):
            reduced_targets = ISOTools.get_L_removed(
                to_iso,
                [
                    NONE, TERRITORY, LANG,
                    TERRITORY|LANG,
                    VARIANT, TERRITORY|VARIANT, VARIANT|LANG,
                    VARIANT|LANG|TERRITORY
                ],
                rem_dupes=True
            )
            for to_rank, i_to_iso in enumerate(reduced_targets):
                if i_to_iso != conv_to_iso:
                    continue

                # Note this prioritizes items which remove the
                # VARIANT last, as there's a chance
                # e.g. there's a variant in the Latin system used
                ranked.append((
                    (-count_parts(conv_from_iso), from_rank),
                    (-count_parts(conv_to_iso), to_rank),
                    (conv_from_iso, conv_to_iso),
                ))

        ranked.sort()
        return [entry[-1] for entry in ranked]
示例#9
0
    def __get_D_engines(self):
        """
        Get a dict of {(from_iso, to_iso): engine, ...}
        for all available transliteration engines.

        When several engines in `self.LEngines` offer the same
        combination, the first one is kept and a warning is emitted
        for each later one. Every ISO in the result is checked
        with `ISOTools.verify_iso`.
        """
        import warnings

        engines_by_pair = {}
        for engine in self.LEngines:
            for from_iso, to_iso in engine.get_L_possible_conversions():
                pair = (from_iso, to_iso)
                if pair not in engines_by_pair:
                    engines_by_pair[pair] = engine
                    continue

                warnings.warn(
                    f"Warning: iso combination {from_iso}/"
                    f"{to_iso} has already been assigned"
                )

        # Make sure every source and target ISO is valid
        for pair in engines_by_pair:
            for iso in pair:
                ISOTools.verify_iso(iso)

        return engines_by_pair
示例#10
0
    def prettify_lang(self, s, always_show_script=False):
        """
        Get a display name for the ISO string `s`.

        Unless `always_show_script` is set or `s` is in `SAlwaysShow`,
        `s` is first simplified with `ISOTools.remove_unneeded_info`.

        :param s: the ISO string to describe
        :param always_show_script: skip the simplification step
        :return: the result of `self._locale_pattern`, given the pretty
                 language name ('und' when there isn't one) and a list
                 of the pretty script, territory and variant names
        """
        simplify = not (always_show_script or s in SAlwaysShow)
        if simplify:
            from iso_tools.ISOTools import ISOTools
            s = ISOTools.remove_unneeded_info(s)

        pretty_names = self.get_L_pretty(s)
        pr_lang, pr_script, pr_territory, pr_variant = pretty_names

        # FIXME! (the script entry)
        details = [pr_script, pr_territory, pr_variant]
        return self._locale_pattern(pr_lang or 'und', details)
示例#11
0
    def get_closest_profile(self, iso, default=KeyError):
        """
        Find the closest profile available,
        e.g. "ja-Japn_JP" doesn't exist, so this can fallback to "ja_JP"

        :param iso: the ISO string to find a profile for
        :param default: the value to return when no form of `iso` has a
                        profile; leave it as `KeyError` to raise instead
        :return: the first reduced form of `iso` (as returned by
                 `ISOTools.get_L_removed`) which is a key of
                 `self.DISOToProfiles`, or `default`
        :raises KeyError: if nothing matches and no `default` was given
        """
        self._ensure_isotools()

        LRemove = [
            NONE, TERRITORY, VARIANT, SCRIPT,
            VARIANT | TERRITORY, SCRIPT | TERRITORY, SCRIPT | VARIANT,
            SCRIPT | TERRITORY | VARIANT,
        ]
        for i_iso in ISOTools.get_L_removed(iso, LRemove, rem_dupes=True):
            if i_iso in self.DISOToProfiles:
                return i_iso

        # KeyError is only a "no default given" sentinel, so compare by
        # identity rather than relying on the default's own __eq__
        if default is KeyError:
            raise KeyError(iso)
        return default
示例#12
0
    def _mapping_to_iso(self, part3, script=None, variant=None, other=None):
        """
        Join the given parts into an ISO string, applying a few overrides.

        :param part3: the language code; when it is empty, the first
                      item of `other` is used instead (unless it's 'ben')
        :param script: the script, passed on to `ISOTools.join`
        :param variant: the variant, passed on to `ISOTools.join`
        :param other: an optional sequence whose first item is the
                      fallback language code
        :return: the joined ISO string, or its replacement if the
                 joined string is one of the special cases below
        """
        if other and not part3 and other[0] and other[0] != 'ben':
            part3 = other[0]

        joined = ISOTools.join(part3, script, variant=variant)

        # Special cases which are replaced with a preferred form
        overrides = {
            'zh_Hani': 'zh',
            'Hani': 'zh',
            'ja_Zyyy': 'ja_Hrkt',
            'zh_Bopo|Zhuyin': 'zh_Bopo',
            'Bopo': 'zh_Bopo',
            'zh_Latn': 'zh_Latn|x-Pinyin',
        }
        return overrides.get(joined, joined)
示例#13
0
def get_L_possible_isos():
    """
    List the ISO strings of the locales found in the cldr/main data.

    Each *.xml file name in that directory (other than a few which are
    skipped) is converted with `ISOTools.locale_to_iso`. A warning is
    emitted for every name which can't be converted.

    :return: a list of ISO strings, in `os.listdir` order
    """
    import os
    from warnings import warn
    from lang_data.data_paths import data_path
    from iso_tools.ISOTools import ISOTools

    # TODO: FIX POLYTONIC GREEK!!
    LSkip = (
        'en_US_POSIX.xml', 'el_POLYTON.xml', 'root.xml',
        'ar_001.xml', 'es_419.xml',
    )

    LRtn = []
    for fnam in os.listdir(data_path('cldr', 'main')):
        if not fnam.endswith('.xml') or fnam in LSkip:
            continue

        try:
            iso = ISOTools.locale_to_iso(fnam.rpartition('.')[0])
        except Exception:
            # Best-effort: skip the locale with a warning, but let
            # KeyboardInterrupt/SystemExit propagate
            warn("can't make locale into ISO: %s" % fnam)
        else:
            LRtn.append(iso)

    return LRtn
示例#14
0
    def get_L_pretty(self, s):
        """
        Get the localized names of the language, script,
        territory and variant of the ISO string `s`.

        :param s: the ISO string to split and describe
        :return: a `(language, script, territory, variant)` tuple of
                 names, with `None` for each part `s` doesn't specify
        """
        from iso_tools.ISOTools import ISOTools
        lang, script, territory, variant = ISOTools.split(s)

        # Fallback names, for when no localized name is available
        iso_entry = ISOCodes.get_D_iso(lang) if lang else None
        territory_default = DCountries.get(territory, [territory])[0]

        pretty_lang = None
        if lang:
            lang_default = iso_entry.get('short_name', iso_entry['long_name'])
            pretty_lang = self.get_lang_name(lang, default=lang_default)

        pretty_script = None
        if script:
            pretty_script = self.get_script_name(script)

        pretty_territory = None
        if territory:
            pretty_territory = self.get_territory_name(
                territory, default=territory_default)

        pretty_variant = None
        if variant:
            pretty_variant = self.get_variant_name(variant, default=variant)

        return pretty_lang, pretty_script, pretty_territory, pretty_variant
示例#15
0
from json import load, dump
from iso_tools.ISOTools import ISOTools

# Convert the "script|lang|variant" keys under each heading of
# script_headings.json to ISO strings, written to script_headings-2.json
DOut = {}

with open('script_headings.json', 'rb') as f:
    DHeadings = load(f)

for heading, L in DHeadings.items():
    for key in L:
        # The trailing parts of a key are optional, so pad to three parts
        LParts = key.split('|')
        if len(LParts) > 3:
            # More than two separators can't be mapped to
            # script/lang/variant
            raise ValueError('too many "|" separators in key: %r' % key)

        script, lang, variant = LParts + [''] * (3 - len(LParts))
        print(script, lang, variant)

        DOut.setdefault(heading, []).append(
            ISOTools.join(part3=lang, script=script, variant=variant))

# json.dump writes str, so the output file must be opened in text mode
with open('script_headings-2.json', 'w', encoding='utf-8') as f:
    dump(DOut, f, indent=4)
示例#16
0
        #    lang = D['@type']
        #    script = D['@scripts'] if '@scripts' in D \
        #        and not ' ' in D['@scripts'] else script
        #    territory = D['@territories'] if '@territories' in D \
        #        and not ' ' in D['@territories'] else territory

        return self.join(lang, script, territory, variant)


if __name__ == '__main__':
    from iso_tools.ISOTools import ISOTools as i
    from cProfile import run

    # Demo: the forms of a Dutch ISO string for each combination of
    # removal flags (rem_dupes is deliberately left unset)
    removal_flags = [NONE, SCRIPT, TERRITORY, SCRIPT | TERRITORY]
    print(i.get_L_removed('nl_Latn-NL', removal_flags))

    # Demo: what guess_omitted_info returns for a few sample ISO strings
    for sample_iso in ('hy', 'ko', 'zh', 'en_Latn|MINE!', 'en_Shaw'):
        print(i.guess_omitted_info(sample_iso))

    # To profile guess_omitted_info:
    # run("for x in range(50000): i.guess_omitted_info('ja')")
示例#17
0
    def get_D_profiles(self):
        """
        Get a map of possible CLDR language profiles

        :return: a `(DRtn, DISOToProfiles)` tuple, where `DRtn` is
                 {language: [(script, territory, variant), ...], ...}
                 and `DISOToProfiles` maps each ISO string to the name
                 of its profile file (without the ".xml")
        """
        self._ensure_isotools()

        profiles_by_lang = {}
        profile_names = {}

        for file_name in listdir(data_path('cldr', 'main')):
            if file_name == 'base.xml' or file_name.rpartition('.')[2] != 'xml':
                continue

            profile = file_name.replace('.xml', '')
            if profile in ('in', 'iw', 'mo', 'root'):
                # HACK!
                continue

            # The first "_"-separated item is the language (which is the
            # whole name when there's no "_"); classify each later item
            lang, *subtags = profile.split('_')
            script = territory = variant = None

            for subtag in subtags:
                if len(subtag) == 2 and subtag.isupper():
                    # A two-letter territory, e.g. AU
                    territory = subtag
                elif len(subtag) == 4 and subtag.istitle():
                    # A four-letter script name, e.g. Latn
                    script = subtag
                else:
                    # A variant, e.g. "POLYTON" for polytonic Greek
                    variant = subtag

            # Convert to the standard LanguageLynx ISO string format
            iso_string = ISOTools.join(part3=lang,
                                       script=script,
                                       territory=territory,
                                       variant=variant)
            assert iso_string not in profile_names
            profile_names[iso_string] = profile

            profiles_by_lang.setdefault(lang, []).append(
                (script, territory, variant))

        # Chinese is referenced like e.g. "zh_Hans_CN",
        # but "zh_CN" is a common way of referencing
        # the same thing, so alias them
        profile_names.update({
            'zh-CN': 'zh_Hans_CN',
            'zh-SG': 'zh_Hans_SG',
            # Macau/Hong Kong also have simplified profiles,
            # but I assume they're traditional here
            'zh-HK': 'zh_Hant_HK',
            'zh-MO': 'zh_Hant_MO',  # Macau
            'zh-TW': 'zh_Hant_TW',
        })

        return profiles_by_lang, profile_names