Exemplo n.º 1
0
def tr(text, scr, titled=True, fontize=False):
    """Transliterate a '|'-separated Harvard-Kyoto string into ``scr``.

    ``text`` is split on '|'. Each piece has trailing '~', digits and
    spaces removed and is then passed to ``transliterate`` with
    'harvardkyoto' as the source scheme; the result is decoded as UTF-8.
    The pieces are re-joined with '|'.

    Args:
        text: Input string; an empty string is returned unchanged.
        scr: Target scheme name. The shorthand 'hk' is expanded to
            'harvardkyoto'.
        titled: When true, ``.title()`` is applied to each transliterated
            piece.
        fontize: When true (and ``titled`` is true), pieces prefixed with
            'ta:' are wrapped in a ``tamil{...}`` macro preceded by a
            backslash.

    Returns:
        The transliterated pieces joined with '|'.

    Raises:
        ValueError: If a piece containing 'RIGHTarrow' does not split
            into exactly four backslash-separated fields.
    """
    if text == '':
        return ''

    target = 'harvardkyoto' if scr == 'hk' else scr

    def convert(chunk):
        # transliterate() output is decoded via str(..., 'utf8').
        return str(transliterate(chunk, 'harvardkyoto', target), 'utf8')

    pieces = []
    for raw in text.split('|'):
        bit = raw.rstrip('~0123456789 ')

        if bit.startswith('ta:'):
            # Force Tamil: drop the 'ta:' marker and post-process the
            # result with dn2tam.
            # NOTE(review): this path title-cases even when titled is
            # False, and fontize is ignored unless titled is True --
            # kept as-is; confirm that this is intended.
            tamil = dn2tam(convert(bit[3:]).title())
            if titled and fontize:
                tamil = '\\tamil{%s}' % tamil
            pieces.append(tamil)
            continue

        if 'RIGHTarrow' in bit:
            # Only the first backslash-separated field is transliterated;
            # the remaining three are passed through untouched.
            txt, t1, arrow, t2 = bit.split('\\')
            head = convert(txt)
            if titled:
                head = head.title()
            pieces.append('\\'.join([head, t1, arrow, t2]))
        else:
            plain = convert(bit)
            if titled:
                plain = plain.title()
            pieces.append(plain)

    return '|'.join(pieces)
def i2d(text):
    """Transliterate ITRANS ``text`` to Devanagari, one word at a time.

    '|' characters are stripped from both ends of ``text`` and the same
    number of '|' is re-appended at the end. If the stripped text ends in
    'M', that final character is replaced by 'm' before transliteration.
    Each whitespace-separated word is passed to
    ``transliterator.transliterate(word, 'itrans', 'devanagari')``.

    Args:
        text: The ITRANS input string. Empty or pipe-only input is
            accepted (it previously raised IndexError).

    Returns:
        The converted words joined by single spaces. A word whose
        transliteration raises is reported on stderr and emitted as
        '##word##' instead.
    """
    newtext = text.strip('|')
    # endswith() is safe on an empty string, unlike newtext[-1].
    if newtext.endswith('M'):
        newtext = newtext[:-1] + 'm'
    # NOTE(review): strip('|') also removes leading pipes, so they end up
    # re-appended at the end here -- confirm that this is intended.
    text = newtext + '|' * (len(text) - len(newtext))

    out_text_parts = []
    for t in text.split():
        try:
            out_text = transliterator.transliterate(t, 'itrans', 'devanagari')
        except Exception:
            # Best effort: flag the failing word and keep going, but let
            # KeyboardInterrupt / SystemExit propagate.
            sys.stderr.write(
                'Error transliterating the string "%s"...\n' % (t))
            out_text = '##%s##' % t
        out_text_parts.append(out_text)

    return ' '.join(out_text_parts)
def i2d(text):
    """Convert ITRANS ``text`` to Devanagari word by word.

    Leading and trailing '|' characters are removed from ``text`` and as
    many '|' as were removed are appended back at the end. A final 'M' on
    the stripped text is lower-cased to 'm' first. Every
    whitespace-separated word is then handed to
    ``transliterator.transliterate(word, 'itrans', 'devanagari')``.

    Args:
        text: The ITRANS input string. Empty or pipe-only strings are
            handled without raising IndexError.

    Returns:
        The transliterated words joined with single spaces. If a word
        fails to transliterate, an error line is written to stderr and
        the word appears in the output as '##word##'.
    """
    newtext = text.strip('|')
    # Guard against empty input: indexing newtext[-1] would raise.
    if newtext.endswith('M'):
        newtext = newtext[:-1] + 'm'
    # NOTE(review): pipes stripped from the front are re-attached at the
    # back by this line -- verify that callers rely on that.
    text = newtext + '|' * (len(text) - len(newtext))

    out_text_parts = []
    for t in text.split():
        try:
            out_text = transliterator.transliterate(t, 'itrans', 'devanagari')
        except Exception:
            # Narrowed from a bare except so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            sys.stderr.write('Error transliterating the string "%s"...\n' %
                             (t))
            out_text = '##%s##' % t
        out_text_parts.append(out_text)

    return ' '.join(out_text_parts)
Exemplo n.º 4
0
 def test_sequence_darija(self):
     """transliterate() maps a multi-word Latin-script input to the expected Arabic-script string."""
     expected = "نت حامض و معقد"
     self.assertEqual(transliterate("nta 7aamed w m3e9ed"), expected)
Exemplo n.º 5
0
 def test_sequence(self):
     """transliterate() maps a two-word Latin-script input to the expected Arabic-script string."""
     expected = "يا حمار"
     self.assertEqual(transliterate("ya 7mar"), expected)
Exemplo n.º 6
0
    # loop through the files
    for f in files:
        with open(corpus_folder + f) as text:
            for line in text:

                try:
                    # exclude RN, ABB tags and words with w or q or y
                    word, pos = re.findall(
                        "(^.*?(?=\t)|(?<=\t)[A-PS-VZ][AC-NT-VZ]*(?=\t[A-Ža-ž])|(?<=[0-9]\t)[A-Ža-ž]+(?=\t<))",
                        line)
                # exclude NUM @card@, SENT, PUNCT and ? tags
                except:
                    pass

                # to single-character Cyrillic
                word = transliterate(word)
                # find letter indices
                nuclei_positions = find_syllable_nuclei(word)
                sonant_positions, fricative_positions, africate_positions, nasal_positions, plosive_positions = find_consonants(
                    word)

                # syllabify
                syllabified_word = syllabify(word, nuclei_positions,
                                             sonant_positions,
                                             fricative_positions,
                                             africate_positions,
                                             nasal_positions,
                                             plosive_positions)

                # get structure
                syllable_structure = get_syllable_structure(
Exemplo n.º 7
0
#NEED TO USE PYTHON 2.7 FOR THIS SCRIPT

import os
import re
import transliterator


# Matches file names of the form "<d>.<d>.<d> <title>.xml": group 1 is the
# numeric prefix including its trailing space, group 2 the digit-free title.
# The pattern is a raw string so the regex escapes reach `re` untouched; the
# dot before "xml" is escaped and the pattern is anchored with `$` so that
# only names really ending in ".xml" are picked up (previously something like
# "1.2.3 foo.xml.bak" matched and would have been renamed to ".xml").
name = re.compile(r'(\d\.\d\.\d )(\D*)\.xml$', re.U)

# Every matching file in the current working directory, in sorted order.
files = sorted(f for f in os.listdir(os.getcwd()) if name.match(f))

for f in files:
    # Always matches: `files` was filtered with this same pattern.
    match = name.match(f)
    try:
        trans = transliterator.transliterate(
            match.group(2), 'devanagari', 'iast')
    except Exception:
        # Best effort: report and skip files whose title cannot be
        # transliterated, without swallowing KeyboardInterrupt/SystemExit.
        print('derped!')
        continue
    # Keep the numeric prefix and replace only the title part.
    newName = match.group(1) + trans + '.xml'
    os.rename(f, newName)