Exemplo n.º 1
0
    def test_33_register_unregister(self):
        """
        Exercise ``registry.register`` and ``registry.unregister``.

        Steps: a plain register over the already-taken ``ru`` code, a forced
        register, a second forced register of the same pack, and finally
        unregistering a regular pack and the forced one.
        """
        from transliterate.contrib.languages.hy.translit_language_pack import (
            ArmenianLanguagePack)

        class A(TranslitLanguagePack):
            language_code = "ru"
            language_name = "Example"
            mapping = data.test_33_register_unregister_mapping

        # `ru` is already present in the registry, so a plain register call
        # must be refused.
        plain_attempt = registry.register(A)
        self.assertFalse(plain_attempt)

        # Forcing the registration replaces the existing pack.
        forced_attempt = registry.register(A, force=True)
        self.assertTrue(forced_attempt)

        # A forced pack cannot be replaced by another forced registration.
        repeated_attempt = registry.register(A, force=True)
        self.assertFalse(repeated_attempt)

        # A pack that was not force-registered can be removed.
        armenian_removed = registry.unregister(ArmenianLanguagePack)
        self.assertTrue(
            armenian_removed
            and ArmenianLanguagePack.language_code
            not in get_available_language_codes())

        # The forced pack stays: unregister reports failure and the code
        # is still listed.
        forced_removed = registry.unregister(A)
        self.assertTrue(
            not forced_removed
            and A.language_code in get_available_language_codes())
Exemplo n.º 2
0
    def test_33_register_unregister(self):
        """
        Walk through the register/unregister rules of ``registry``.

        A pack reusing the ``ru`` code is registered without and with
        ``force``, then a regular pack and the forced pack are unregistered.
        """
        from transliterate.contrib.languages.hy.translit_language_pack import ArmenianLanguagePack

        class A(TranslitLanguagePack):
            language_code = "ru"
            language_name = "Example"
            mapping = (
                u"abcdefghij",
                u"1234567890",
            )

        # The `ru` key is taken already; without force nothing is replaced.
        self.assertFalse(registry.register(A))

        # With force the existing entry is replaced.
        self.assertTrue(registry.register(A, force=True))

        # A forced entry cannot be overwritten by a further forced register.
        self.assertFalse(registry.register(A, force=True))

        # Removing a pack that was registered without force succeeds.
        removed = registry.unregister(ArmenianLanguagePack)
        self.assertTrue(
            removed
            and ArmenianLanguagePack.language_code
            not in get_available_language_codes())

        # Removing the forced pack is refused and its code stays available.
        removed = registry.unregister(A)
        self.assertTrue(
            not removed
            and A.language_code in get_available_language_codes())
Exemplo n.º 3
0
def main():
    """Render one templated entry per data row of ``users.csv`` to stdout.

    Prints a start banner and the available transliteration language codes,
    then, for every CSV row after the header, builds a ``user`` mapping and
    prints ``template.jinja2`` rendered with it. A finish banner is printed
    at the end.

    Rows are read positionally: organisation, department, last name, first
    name and a fifth name field (called ``sure_name`` here).

    Both input files are opened with ``with`` so they are closed reliably,
    and the template is loaded once instead of once per row. As a
    consequence ``template.jinja2`` must exist even when the CSV contains no
    data rows.
    """
    print("Csv2ldif started..")
    print(get_available_language_codes())

    # The template does not depend on the row: read it a single time and
    # close the file, rather than re-opening (and leaking) it per user.
    with open("template.jinja2") as template_file:
        template = Template(template_file.read())

    with open("users.csv") as csv_file:
        rows = iter(unicode_csv_reader(csv_file, delimiter=';'))
        # skip csv header
        next(rows)
        for row in rows:
            org = row[0]
            dep = row[1]
            last_name = row[2]
            first_name = row[3]
            sure_name = row[4]
            password = random_password()
            uid = get_uid(last_name, first_name)
            cn = get_common_name(last_name, first_name, sure_name)
            dn_str = u"CN={},OU={},OU={},OU=Пользователи,DC=edu,dc=knu,dc=kg".format(
                cn, org, dep)

            user = {}
            user["cn"] = encode(cn)
            user["dn"] = encode(dn_str)
            user["sn"] = encode(last_name)
            user["givenName"] = encode(first_name)
            user["displayName"] = encode(u"{} {} {}".format(
                last_name, first_name, sure_name))
            # NOTE(review): this encodes the literal string "user_name", not
            # a per-user value - confirm whether that is intended.
            user["name"] = encode("user_name")
            user["uid"] = uid
            user["unicodePwd"] = encode(password)
            print(template.render(user))
    print("Csv2ldif finished.")
Exemplo n.º 4
0
def translate_text_table_to_en(text_table):
    """Return a new ``TextTable`` tagged ``'en'`` built from ``text_table``.

    Every ``(text_id, text, text_type)`` entry of ``text_table.texts`` is
    kept, with ``text`` passed through ``translate_text_to_en`` together
    with the table's language code.

    Raises:
        ValueError: if the table's language code is not among the codes
            reported by ``transliterate.get_available_language_codes()``.
    """
    source_code = text_table.language_code
    if source_code not in transliterate.get_available_language_codes():
        message = (
            "Language code " + source_code +
            " not found in supported transliteration tables " +
            ",".join(transliterate.get_available_language_codes()))
        raise ValueError(message)

    converted_entries = []
    for text_id, text, text_type in text_table.texts:
        english_text = translate_text_to_en(text, text_table.language_code)
        converted_entries.append((text_id, english_text, text_type))
    return TextTable(converted_entries, 'en')
Exemplo n.º 5
0
def transliterate(first_name, last_name, full_name_native, search_helper):
    """Return a space-separated, sorted string of ASCII variants of the names.

    The four inputs are split on single spaces and de-duplicated. For each
    resulting word the following variants are collected: its ``unidecode``
    form, its reversed transliteration for every available language pack,
    and a German-style form (``ß`` -> ``ss``, umlauts mapped to their base
    vowel). Variants that cannot be converted with ``str`` (raising
    ``UnicodeEncodeError``) are skipped.

    The words are decoded with ``.decode('utf-8')``, so byte strings are
    expected as input.
    """
    vocab = set()
    for field in (first_name, last_name, full_name_native, search_helper):
        vocab.update(field.split(' '))

    langs = get_available_language_codes()
    # Code points of a-umlaut, A-umlaut, o-umlaut, O-umlaut, u-umlaut and
    # U-umlaut mapped to the code points of the plain vowels.
    umlaut_map = dict(zip(
        (0xe4, 0xc4, 0xf6, 0xd6, 0xfc, 0xdc),
        (ord('a'), ord('A'), ord('o'), ord('O'), ord('u'), ord('U')),
    ))

    variants = set()
    for raw_name in vocab:
        name = raw_name.decode('utf-8')
        variants.add(unidecode(name))

        for lang in langs:
            try:
                converted = str(translit(name, lang, reversed=True))
            except UnicodeEncodeError:
                # Characters from another language than the pack handles.
                continue
            variants.add(converted)

        try:
            german = str(name.replace(u'\xdf', 'ss').translate(umlaut_map))
        except UnicodeEncodeError:
            # Characters that are not covered by the German replacements.
            continue
        variants.add(german)

    return ' '.join(sorted(variants)).strip()
Exemplo n.º 6
0
def transliterate(first_name, last_name, full_name_native, search_helper):
    """Collect ASCII spellings for every word of the given name fields.

    Each field is split on single spaces; the unique words are decoded from
    UTF-8 and expanded into their ``unidecode`` form, their reversed
    transliteration in every available language pack, and a German
    replacement form (``ß`` -> ``ss``, umlaut -> base vowel). Conversions
    raising ``UnicodeEncodeError`` are ignored. The unique results are
    returned sorted and joined with single spaces.
    """
    words = (first_name.split(' ') + last_name.split(' ')
             + full_name_native.split(' ') + search_helper.split(' '))
    vocab = set(words)
    langs = get_available_language_codes()

    # Umlaut code points and the plain vowels they are replaced with.
    translate_table = {}
    for code_point, plain in ((0xe4, 'a'), (0xc4, 'A'), (0xf6, 'o'),
                              (0xd6, 'O'), (0xfc, 'u'), (0xdc, 'U')):
        translate_table[code_point] = ord(plain)

    def _german_form(word):
        # Replace sharp s and umlauts, then force a plain ``str``.
        return str(word.replace(u'\xdf', 'ss').translate(translate_table))

    ascii_values = []
    for encoded_name in vocab:
        name = encoded_name.decode('utf-8')
        ascii_values.append(unidecode(name))
        for lang in langs:
            try:
                ascii_values.append(
                    str(translit(name, lang, reversed=True)))
            except UnicodeEncodeError:
                # The word contains characters outside this language pack.
                pass
        try:
            ascii_values.append(_german_form(name))
        except UnicodeEncodeError:
            # The word contains characters outside the German replacements.
            pass

    unique_sorted = sorted(set(ascii_values))
    return ' '.join(unique_sorted).strip()
Exemplo n.º 7
0
def parse_args():
    """Parse the command line and return ``(args, output)``.

    The parser description lists the available transliteration language
    codes. Three required options are defined in the "Menu Options" group:
    ``-f/--file``, ``-l/--language`` and ``-o/--outfile``.

    Returns:
        A tuple of the parsed ``argparse`` namespace and ``None`` (the
        second element is always ``None`` in this function).
    """
    available = get_available_language_codes()

    description = ("Usage: python wordtranny.py <OPTIONS> \n"
                   "Available Languages: \n"
                   + colors.red + str(available) + colors.normal)
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,
                                     description=description)

    group_title = colors.lightblue + 'Menu Options' + colors.normal
    menu_group = parser.add_argument_group(group_title)

    # (short flag, long flag, help text) - every option is mandatory.
    option_specs = (
        ('-f', '--file', "wordlist to open"),
        ('-l', '--language', "language to convert to"),
        ('-o', '--outfile', "outfile to write the converted text to"),
    )
    for short_flag, long_flag, help_text in option_specs:
        menu_group.add_argument(short_flag,
                                long_flag,
                                help=help_text,
                                required=True)

    return parser.parse_args(), None
Exemplo n.º 8
0
def get_words(lang, file_obj):
    """
    Read a hunspell-format dictionary (UTF-8 bytes) from ``file_obj``.

    The first line is skipped. For every other line the part before the
    first ``/`` is kept and stripped, then transliterated (reversed) when
    ``lang`` is an available language code. Entries containing superscript
    digits, non-alphabetic characters, or upper-case letters after the
    first character are dropped.

    returns the remaining unique words as a list
    """
    superscript_digits = '⁰¹²³⁴⁵⁶⁷⁸⁹'
    unique_words = set()

    for raw_line in file_obj.readlines()[1:]:
        # Decode, cut off the affix flags after '/', trim whitespace.
        entry = raw_line.decode('utf-8').partition('/')[0].strip()

        if lang in get_available_language_codes():
            entry = translit(entry, lang, reversed=True)

        # Superscript digits disqualify the entry.
        if any(mark in entry for mark in superscript_digits):
            continue
        # Only purely alphabetic entries are kept.
        if not entry.isalpha():
            continue
        # Everything after the first letter must be lower case
        # (filters acronyms and similar capitalisation).
        if len(entry) > 1 and not entry[1:].islower():
            continue

        unique_words.add(entry)

    unique_words.discard('')
    # A list (rather than a set) can be saved as JSON.
    return list(unique_words)
Exemplo n.º 9
0
 def test_01_get_available_language_codes(self):
     """Check the codes reported by ``get_available_language_codes``.

     The reported list is sorted in place, compared with the sorted
     expected codes, and returned.
     """
     expected_codes = sorted(
         ['el', 'hy', 'ka', 'ru', 'uk', 'bg', 'mk', 'mn'])
     reported_codes = get_available_language_codes()
     reported_codes.sort()
     self.assertEqual(reported_codes, expected_codes)
     return reported_codes
Exemplo n.º 10
0
def process_args(args):
    """Transliterate JSON requests from stdin and write JSON results to stdout.

    Every line yielded by ``read_stdin()`` is parsed as a JSON object with
    the keys ``text``, ``source`` and ``target``, for example::

        { "source" : "ru", "target" : "en", "text": "до свидания" }
        { "source" : "en", "target": "ru", "text": "do svidanija" }

    When ``target`` is ``'en'`` the text is transliterated in reverse using
    the ``source`` language pack; otherwise it is transliterated forward
    using the ``target`` pack. The request object is extended with
    ``lang``, ``reversed`` and ``trans`` and written back as one JSON line.
    If anything fails while handling a line, an object with ``error`` and
    ``description`` (the available language codes) is written instead.

    :param args: not used by this function.

    The serialized JSON is handed to the UTF-8 stream writer as text. The
    previous extra ``.encode('utf8')`` made the write raise on Python 3
    (bytes passed to an encoder), which was then silently swallowed, so no
    output was produced.
    """
    # The writer does the UTF-8 encoding, so it needs the byte-level stream.
    if sys.version_info[0] >= 3:
        ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        ofp = codecs.getwriter('utf8')(sys.stdout)

    for line in read_stdin():
        try:
            obj = json.loads(line)
            text = obj['text']
            source = obj['source']
            target = obj['target']

            # Converting *to* English means reversing the source pack.
            is_reversed = target == 'en'
            lang = source if is_reversed else target

            translit = get_translit_function(lang)
            tline = translit(text, reversed=is_reversed)

            # result json
            obj['lang'] = lang
            obj['reversed'] = is_reversed
            obj['trans'] = tline
            res = json.dumps(obj, ensure_ascii=False)

        except Exception as ex:
            # Report the problem to the consumer instead of aborting the
            # whole stream.
            obj = {
                'error': str(ex),
                'description': get_available_language_codes(),
            }
            res = json.dumps(obj, ensure_ascii=False)

        # write to stdout
        try:
            ofp.writelines([res, '\n'])
            ofp.flush()
        except KeyboardInterrupt:
            # Close the output and exit gracefully. (The old handler also
            # closed an undefined ``ifp`` and therefore raised NameError.)
            ofp.close()
            sys.exit(0)
        except Exception as ex:
            # Writing stays best-effort, but the failure is no longer
            # invisible.
            sys.stderr.write("failed to write result: %s\n" % ex)

    # close files
    ofp.close()
Exemplo n.º 11
0
 def test_01_get_available_language_codes(self):
     """
     Verify the language codes found by autodiscovery.

     The list from ``get_available_language_codes`` is sorted in place,
     compared against the sorted expected codes and returned.
     """
     # NOTE: 'he' is not part of the expected codes.
     expected_codes = sorted(['el', 'hy', 'ka', 'ru', 'uk'])
     found_codes = get_available_language_codes()
     found_codes.sort()
     self.assertEqual(found_codes, expected_codes)
     return found_codes
Exemplo n.º 12
0
def transliterateField(request):
    """Return the ``ukrTranslit`` transliteration of the ``fieldData`` query value.

    The available language codes and the transliteration of the raw value
    are printed first. The value is then lower-cased and title-cased, and
    its transliteration is returned in a ``JsonResponse`` under the key
    ``transliteration``.
    """
    raw_value = request.GET.get('fieldData')

    print(get_available_language_codes())
    print(translit(raw_value, 'ukrTranslit'))

    title_cased = raw_value.lower().title()
    payload = {'transliteration': translit(title_cased, 'ukrTranslit')}
    return JsonResponse(payload)
Exemplo n.º 13
0
def index_primary(primary, path_dest):
    """Build the "top-level resources" HTML index and write ``index-top.html``.

    Every entry of ``primary`` receives a ``sort_key`` derived from its
    title; the entries are then listed in ``sort_key`` order.

    :param primary: re-iterable collection of dict-like entries. The keys
        ``title`` and ``domain`` are read and ``sort_key`` is written back
        into every entry.
    :param path_dest: directory that is joined with ``index-top.html`` to
        form the output path.
    """
    # The logger is named after the currently executing function.
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    transliterate_languages = transliterate.get_available_language_codes()
    doc = dominate.document(title=u'AWOL Index: Top-Level Resources')
    with doc.head:
        # NOTE(review): the cssreset stylesheet is linked twice in a row -
        # confirm whether the second link is intended.
        link(rel='stylesheet',
             type='text/css',
             href=
             'http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet',
             type='text/css',
             href=
             'http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(
            rel='stylesheet',
            type='text/css',
            href=
            'http://fonts.googleapis.com/css?family=Open+Sans:400italic,600italic,700italic,400,600,700&amp;subset=latin,cyrillic-ext,greek-ext,greek,latin-ext,cyrillic'
        )
        link(rel='stylesheet', type='text/css', href='./index-style.css')
    doc += h1('Index of Top-Level Resources')
    _p = p('part of ', cls='subtitle')
    _p += a('The AWOL Index', href='../index.html')
    doc += _p
    _ul = doc.add(ul())
    # First pass: compute a sort key for every entry.
    for pri in primary:
        rtitle = pri['title']
        sort_key = rtitle.lower()
        # Only titles that lose characters when reduced to ASCII need the
        # transliteration steps below.
        if sort_key != unicode(sort_key.encode('ascii', 'ignore')):
            # presumably a (language, confidence) pair - verify against the
            # langid documentation
            classification = langid.classify(sort_key)
            if classification[1] > 0.9:
                if classification[0] in transliterate_languages:
                    sort_key = transliterate.translit(sort_key,
                                                      classification[0],
                                                      reversed=True)
                    sort_key = unicodedata.normalize('NFKD', sort_key)
            # NOTE(review): 'translit/long' is not a standard-library codec;
            # presumably registered by a module imported elsewhere - confirm.
            sort_key = codecs.encode(sort_key, 'translit/long')
            sort_key = unidecode(sort_key)
            sort_key = sort_key.encode('ascii', 'ignore')
            # Fall back to the lower-cased title if nothing survived.
            if len(sort_key) == 0:
                sort_key = rtitle.lower()
        # Remove whatever RX_PUNCT matches, then all whitespace.
        sort_key = RX_PUNCT.sub(u'', sort_key)
        sort_key = u''.join(sort_key.strip().split()).lower()
        logger.debug(u'sortkey for title "{0}": "{1}"'.format(
            rtitle, sort_key))
        pri['sort_key'] = sort_key

    # Second pass: list entries whose domain has no space and whose title
    # and sort key are non-empty, ordered by sort key.
    for pri in sorted([
            pri for pri in primary if ' ' not in pri['domain']
            and pri['title'] != u'' and pri['sort_key'] != u''
    ],
                      key=lambda k: k['sort_key']):
        # The return value is not used afterwards.
        _li = list_entry(_ul, pri)
    html_out(doc, os.path.join(path_dest, 'index-top.html'))
Exemplo n.º 14
0
def load_texts(map_container, text_index_table):
    """Attach the text tables of ``text_index_table`` to ``map_container``.

    The default text table is always included. When its language code is
    one of the available transliteration codes, an English table produced
    by ``translate_text_table_to_en`` is appended as well.

    Sets ``map_container.texts``, ``map_container.meta.locales`` and
    ``map_container.meta.default_locale``.
    """
    base_table = text_index_table.get_text_table()
    base_locale = base_table.language_code

    tables = [base_table]
    supported_codes = transliterate.get_available_language_codes()
    if base_table.language_code in supported_codes:
        tables.append(translate_text_table_to_en(base_table))

    locale_codes = []
    for table in tables:
        locale_codes.append(table.language_code)

    map_container.texts = tables
    map_container.meta.locales = locale_codes
    map_container.meta.default_locale = base_locale
Exemplo n.º 15
0
def transliteration(blob: str, source_langs: Union[List[str], str], target_lang: str = "la"):
    """Transliterate ``blob`` using each of ``source_langs`` in turn.

    Args:
        blob: text to convert; ``None`` is returned unchanged.
        source_langs: one language code or a list of codes.
        target_lang: passed to ``transliteration_slow`` for codes that have
            no available language pack.

    Returns:
        The converted text, or ``None`` when ``blob`` is ``None``.
    """
    if blob is None:
        return None

    known_langs = get_available_language_codes()
    langs = [source_langs] if isinstance(source_langs, str) else source_langs

    result: str = blob
    for lang in langs:
        if lang not in known_langs:
            # NOTE(review): the fallback starts again from the original
            # ``blob``, discarding conversions made for earlier languages -
            # confirm this is intended.
            result = transliteration_slow(blob, target_lang)
            continue
        result = translit(result, lang, reversed=True)
    return result
Exemplo n.º 16
0
 def _method(
         lng: str = 'en', qnt: int = 1,
         underscore: bool = False) -> str:
     """Return ``qnt`` random words joined by a space or an underscore.

     For ``lng == 'en'`` the joined words are returned as they are; for any
     other available language code they are transliterated into that
     language. Unknown codes yield the string ``'wrong lng'``.
     """
     random_words = rv.random_words(count=qnt)
     separator = '_' if underscore else ' '
     phrase = separator.join(random_words)

     if lng == 'en':
         return phrase
     if lng in get_available_language_codes():
         return translit(phrase, lng)
     return 'wrong lng'
Exemplo n.º 17
0
 def _transliterate(self, input_lang):
     """Store the reversed transliteration of ``self._input_text``.

     An empty ``input_lang`` is replaced by the language detected from the
     input text; a language without an available pack falls back to
     ``'ru'``. The result is written to ``self._output_text``.

     An ``OSError`` whose errno is ``EEXIST`` is ignored; any other
     ``OSError`` is re-raised.
     """
     try:
         language = input_lang
         if language == '':
             language = detect(self._input_text)
         if language not in get_available_language_codes():
             language = 'ru'
         self._output_text = translit(
             self._input_text, language, reversed=True)
     except OSError as error:
         if error.errno != errno.EEXIST:
             raise
Exemplo n.º 18
0
    def test_15_register_custom_language_pack(self):
        """Register a custom language pack and transliterate with it.

        After registration the ``example`` code must be listed as available
        and ``self.latin_text`` must convert to the expected string, which
        is also returned.
        """
        class ExampleLanguagePack(TranslitLanguagePack):
            """Language pack registered under the ``example`` code."""

            language_code = "example"
            language_name = "Example"
            mapping = data.test_15_register_custom_language_pack_mapping

        registry.register(ExampleLanguagePack)

        available_codes = get_available_language_codes()
        assert 'example' in available_codes

        transliterated = translit(self.latin_text, 'example')
        self.assertEqual(transliterated, 'Lor5m 9psum 4olor s9t 1m5t')
        return transliterated
Exemplo n.º 19
0
class EuropeanTransliterator(TransliteratorBase):
    """Transliterate phrases between two languages by pivoting through Latin text.

    ``LANGS`` is the set of available language-pack codes plus ``'en'``.
    ``'en'`` is added on top of the registered codes, so it is presumably
    not a language pack itself; it is therefore treated as "already Latin"
    on whichever side it appears. Previously only the source side was
    handled that way, so an ``'en'`` target was passed to ``translit`` as a
    language code.
    """

    LANGS = set(euro_transliterate.get_available_language_codes() + ['en'])

    def __init__(self, src_lang, dest_lang):
        """Remember the source and destination language codes."""
        self.src_lang = src_lang
        self.dest_lang = dest_lang

    @staticmethod
    def _convert(phrase, from_lang, to_lang):
        """Convert ``phrase`` from ``from_lang`` to ``to_lang`` via Latin text."""
        en_phrase = phrase
        if from_lang != 'en':
            en_phrase = euro_transliterate.translit(
                phrase, from_lang, reversed=True)
        # Nothing is left to do when the Latin form is the requested result.
        if to_lang == 'en':
            return en_phrase
        return euro_transliterate.translit(en_phrase, to_lang)

    def transliterate(self, phrase):
        """Return ``phrase`` converted from ``src_lang`` to ``dest_lang``."""
        return self._convert(phrase, self.src_lang, self.dest_lang)

    def reverse_transliterate(self, phrase):
        """Return ``phrase`` converted from ``dest_lang`` back to ``src_lang``."""
        return self._convert(phrase, self.dest_lang, self.src_lang)
Exemplo n.º 20
0
    def transliterate(lang,
                      value,
                      reversed=True,
                      stripnonwords=True,
                      replacespaced=True):
        """Return a lower-cased, optionally cleaned-up form of ``value``.

        ``value`` is transliterated in reverse when ``lang`` is an available
        language code and ``reversed`` is true. With ``stripnonwords`` every
        character that is not whitespace, a word character, a digit or a
        hyphen is removed; with ``replacespaced`` each whitespace run becomes
        a single hyphen.
        """
        from transliterate import translit, get_available_language_codes

        result = value
        if lang in get_available_language_codes() and reversed:
            result = translit(result, lang, reversed=reversed)

        # (enabled, pattern, replacement), applied in this order.
        cleanup_steps = (
            (stripnonwords, r'[^\s\w\d-]', ''),
            (replacespaced, r'\s+', '-'),
        )
        for enabled, pattern, replacement in cleanup_steps:
            if enabled:
                result = re.sub(pattern, replacement, result)

        return result.lower()
Exemplo n.º 21
0
    def test_15_register_custom_language_pack(self):
        """
        Check that a custom language pack can be registered and used.

        The pack is registered under the ``example`` code, which must then
        be reported as available; ``self.latin_text`` is transliterated
        with it and the result is compared and returned.
        """
        class ExampleLanguagePack(TranslitLanguagePack):
            """
            Custom pack used only by this test.
            """
            language_code = "example"
            language_name = "Example"
            mapping = data.test_15_register_custom_language_pack_mapping

        registry.register(ExampleLanguagePack)

        codes_after_register = get_available_language_codes()
        assert 'example' in codes_after_register

        expected_text = 'Lor5m 9psum 4olor s9t 1m5t'
        actual_text = translit(self.latin_text, 'example')
        self.assertEqual(actual_text, expected_text)
        return actual_text
Exemplo n.º 22
0
def main():
    """Translate and transliterate the tokens of a file for each language.

    Expects ``sys.argv`` to hold a file name followed by one or more
    language codes; otherwise ``USAGE`` is printed and ``1`` is returned.
    For every language code, each token of the file is passed to
    ``translate``, lower-cased and, when a language pack is available,
    transliterated in reverse. The per-language word lists are printed.
    """
    if len(sys.argv) < 3:
        print(USAGE)
        return 1

    filename, langcodes = sys.argv[1], sys.argv[2:]
    tokens = tokenize(filename)

    def _latin_word(token, lang):
        # Translate the token, then romanise it if a pack exists for lang.
        word = translate(token, lang, 'en').lower()
        if lang in get_available_language_codes():
            word = translit(word, lang, reversed=True)
        return word

    langlist = [[_latin_word(token, lang) for token in tokens]
                for lang in langcodes]
    print(langlist)

    # NOTE(review): assigned but not used in the visible code; the function
    # may continue beyond this excerpt.
    lettergroups = []
Exemplo n.º 23
0
def get_words(file, lang):
    """
    reads a dictionary file in hunspell format (utf-8 version)

    The first line (header) is skipped. For every other line the part
    before the first ``/`` is kept and surrounding whitespace is removed.
    When the base language of ``lang`` (the part before the first ``-``)
    is an available language code, the entry is transliterated in reverse
    with that base code.

    Two defects of the earlier version are fixed here: ``translit`` was
    called with the full ``lang`` tag although only its base part had been
    checked for availability, and lines were never stripped, so blank lines
    and trailing newlines ended up in the result.

    :param file: binary file object; lines are decoded as UTF-8.
    :param lang: language tag such as ``xx`` or ``xx-YY``.

    returns a set of words
    """
    base_lang = lang.split('-')[0]
    # The answer does not depend on the line, so look it up once.
    needs_translit = base_lang in get_available_language_codes()

    words = set()
    # skip first line, header
    for raw_line in file.readlines()[1:]:
        # decode, remove the flags after '/', drop newline and padding
        line = raw_line.decode('utf-8').partition('/')[0].strip()
        if needs_translit:
            line = translit(line, base_lang, reversed=True)
        # only use non-empty lines
        if line:
            words.add(line)

    return words
Exemplo n.º 24
0
def index_primary(primary, path_dest):
    """Build the "top-level resources" HTML index and write ``index-top.html``.

    Every entry of ``primary`` receives a ``sort_key`` derived from its
    title; the entries are then listed in ``sort_key`` order.

    :param primary: re-iterable collection of dict-like entries. The keys
        ``title`` and ``domain`` are read and ``sort_key`` is written back
        into every entry.
    :param path_dest: directory that is joined with ``index-top.html`` to
        form the output path.
    """
    # The logger is named after the currently executing function.
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    transliterate_languages = transliterate.get_available_language_codes()
    doc = dominate.document(title=u'AWOL Index: Top-Level Resources')
    with doc.head:
        # NOTE(review): the cssreset stylesheet is linked twice in a row -
        # confirm whether the second link is intended.
        link(rel='stylesheet', type='text/css', href='http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet', type='text/css', href='http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet', type='text/css', href='http://fonts.googleapis.com/css?family=Open+Sans:400italic,600italic,700italic,400,600,700&amp;subset=latin,cyrillic-ext,greek-ext,greek,latin-ext,cyrillic')
        link(rel='stylesheet', type='text/css', href='./index-style.css')
    doc += h1('Index of Top-Level Resources')
    _p = p('part of ', cls='subtitle')
    _p += a('The AWOL Index', href='../index.html')
    doc += _p
    _ul = doc.add(ul())
    # First pass: compute a sort key for every entry.
    for pri in primary:
        rtitle = pri['title']
        sort_key = rtitle.lower()
        # Only titles that lose characters when reduced to ASCII need the
        # transliteration steps below.
        if sort_key != unicode(sort_key.encode('ascii', 'ignore')):
            # presumably a (language, confidence) pair - verify against the
            # langid documentation
            classification = langid.classify(sort_key)
            if classification[1] > 0.9:
                if classification[0] in transliterate_languages:
                    sort_key = transliterate.translit(sort_key, classification[0], reversed=True)
                    sort_key = unicodedata.normalize('NFKD', sort_key)
            # NOTE(review): 'translit/long' is not a standard-library codec;
            # presumably registered by a module imported elsewhere - confirm.
            sort_key = codecs.encode(sort_key, 'translit/long')
            sort_key = unidecode(sort_key)
            sort_key = sort_key.encode('ascii', 'ignore')
            # Fall back to the lower-cased title if nothing survived.
            if len(sort_key) == 0:
                sort_key = rtitle.lower()
        # Remove whatever RX_PUNCT matches, then all whitespace.
        sort_key = RX_PUNCT.sub(u'', sort_key)
        sort_key = u''.join(sort_key.strip().split()).lower()
        logger.debug(u'sortkey for title "{0}": "{1}"'.format(rtitle, sort_key))
        pri['sort_key'] = sort_key

    # Second pass: list entries whose domain has no space and whose title
    # and sort key are non-empty, ordered by sort key.
    for pri in sorted([pri for pri in primary if ' ' not in pri['domain'] and pri['title'] != u'' and pri['sort_key'] != u''], key=lambda k: k['sort_key']):
        # The return value is not used afterwards.
        _li = list_entry(_ul, pri)
    html_out(doc, os.path.join(path_dest, 'index-top.html'))
Exemplo n.º 25
0
 def __init__(self, bot):
     """Store ``bot`` and the currently available language codes."""
     available_codes = transliterate.get_available_language_codes()
     self.bot = bot
     self.avail = available_codes
Exemplo n.º 26
0
# Demo of the transliterate API: forward and reversed transliteration, the
# list of registered languages and the function decorator.
#
# Every ``print`` uses the single-argument parenthesised form, which prints
# the same text on Python 2 and is valid syntax on Python 3 (the former
# ``print`` statements were a SyntaxError there).

# Autodiscover available language packs
#autodiscover()

print('\nOriginal text\n---------------------------------------')
text = "Lorem ipsum dolor sit amet"
print(text)

print('\nTransliteration to Armenian\n---------------------------------------')
print(translit(text, 'hy'))

print('\nTransliteration to Russian\n---------------------------------------')
print(translit(text, 'ru'))

print('\nList of available (registered) languages\n---------------------------------------')
print(get_available_language_codes())

print('\nReversed transliteration from Armenian\n---------------------------------------')
print(translit(u'Լօրեմ իպսում դoլoր սիտ ամետ', 'hy', reversed=True))

print('\nReversed transliteration from Russian\n---------------------------------------')
print(translit(u'Лорем ипсум долор сит амет', 'ru', reversed=True))

print('\nTesting the function decorator\n---------------------------------------')
from transliterate.decorators import transliterate_function


@transliterate_function(language_code='hy')
def decorator_test_armenian(text):
    """Return ``text`` unchanged; the decorator is applied with ``language_code='hy'``."""
    return text
Exemplo n.º 27
0
 def __init__(self):
     """Bind the ``translit`` function and the available language codes."""
     language_codes = transliterate.get_available_language_codes()
     self.engine = transliterate.translit
     self.languages = language_codes
Exemplo n.º 28
0
        # u"Ia": u"Я",
        u"Yu": u"Ю",
        u"Ya": u"Я",
    }

    reversed_specific_pre_processor_mapping = {
        u"ъ": u"",
        u"ь": u"",
        u"Ъ": u"",
        u"Ь": u""
    }


# Register the pack, replacing any pack already registered under its code.
registry.register(Gost2006RuLangPack, force=True)

# Length of the longest registered language code; used as the field width
# when printing language codes.
LANG_PADDING = max(len(code) for code in get_available_language_codes())


@lru_cache
def transliterate(text, lang='ru-gost', reversed=True):
    """Transliterate ``text`` and insist on a pure-ASCII result.

    The conversion is printed. If the result still contains non-ASCII
    characters, each of them is printed and ``ValueError`` is raised.
    Results are memoised by ``lru_cache``, so the printing only happens on
    a cache miss.

    Raises:
        ValueError: when the transliterated text is not pure ASCII.
    """
    trans = translit(text, lang, reversed)
    print(f"translit[{lang:{LANG_PADDING}}]: {text} => {trans}")

    if not trans.isascii():
        # Show every character that the language pack failed to convert.
        for char in trans:
            if not char.isascii():
                print(char, char.isascii())
        raise ValueError(
            f"Incorrect transliteration table for '{lang}' language")

    return trans
Exemplo n.º 29
0
def sanitize_name(column_name: str):
    """Return a transliterated column name, prefixed with ``_`` when needed.

    ``column_name`` is transliterated using every available language code.
    If ``reserved_or_unsupported`` flags the result, an underscore is
    prepended.
    """
    name = transliteration(column_name, get_available_language_codes())
    return f"_{name}" if reserved_or_unsupported(name) else name
Exemplo n.º 30
0
# Demo of the transliterate API: forward and reversed transliteration, the
# list of registered languages and the function decorator.
#
# Every ``print`` uses the single-argument parenthesised form, which prints
# the same text on Python 2 and is valid syntax on Python 3 (the former
# ``print`` statements were a SyntaxError there).

# Autodiscover available language packs
#autodiscover()

print('\nOriginal text\n---------------------------------------')
text = "Lorem ipsum dolor sit amet"
print(text)

print('\nTransliteration to Armenian\n---------------------------------------')
print(translit(text, 'hy'))

print('\nTransliteration to Russian\n---------------------------------------')
print(translit(text, 'ru'))

print('\nList of available (registered) languages\n---------------------------------------')
print(get_available_language_codes())

print('\nReversed transliteration from Armenian\n---------------------------------------')
print(translit(u'Լօրեմ իպսում դoլoր սիտ ամետ', 'hy', reversed=True))

print('\nReversed transliteration from Russian\n---------------------------------------')
print(translit(u'Лорем ипсум долор сит амет', 'ru', reversed=True))

print('\nTesting the function decorator\n---------------------------------------')
from transliterate.decorators import transliterate_function

@transliterate_function(language_code='hy')
def decorator_test_armenian(text):
    """Return ``text`` unchanged; the decorator is applied with ``language_code='hy'``."""
    return text

print(decorator_test_armenian(u"Lorem ipsum dolor sit amet"))
Exemplo n.º 31
0
        u'ц': u'ts',
        u'ч': u'ch',
        u'ш': u'sh',
        u'щ': u'sch',
        u'ъ': u'',
        u'ы': u'y',
        u'ь': u'',
        u'э': u'e',
        u'ю': u'yu',
        u'я': u'ya',
    }


# Register the example pack; ``force=True`` is passed so that registration
# is forced.
registry.register(ExampleLanguagePack, force=True)

# Show which language codes are available after the registration.
print(get_available_language_codes())

# Sample output noted by the author for the print above:
# ['el', 'hy', 'ka', 'ru', 'example']
# Sample text; not referenced again in the visible code.
text = '40 лет Октября'


def transliterate(text):
    """Transliterate ``text`` with the ``example`` pack, print and return it."""
    converted = translit(text, 'example')
    print(converted)
    return converted


# Run a single sample transliteration when executed as a script.
if __name__ == "__main__":
    transliterate('Коммунистическая')
# NOTE(review): the sample output recorded here, "Lor5m 9psum 4olor s9t 1m5t",
# does not look like a result for the Cyrillic input above - confirm the
# actual output.