def __init__(self):
    """Prepare the template path, the CMU dictionary and the response.

    The HTML template is looked up next to this module's own file and the
    response object is built from that path.
    """
    module_dir = os.path.dirname(__file__)
    self.template = os.path.join(module_dir, 'transliterate.html')
    self.cmu = CMUDict()
    self.response = SilpaResponse(self.template)
class Transliterator(SilpaModule):
    """Transliterate text between English, Indian scripts, ISO 15919 and IPA.

    Indic-to-Indic conversion works by shifting each code point by the
    difference between the two languages' entries in ``lang_bases``.
    English source words go through the CMU dictionary (``self.cmu``).
    """

    # Source languages whose word-final inherent vowel is dropped when
    # romanising (ISO 15919 / IPA) or when chaining English -> Indic.
    _SCHWA_DROPPING_LANGS = ("hi_IN", "gu_IN", "bn_IN")
    # Languages treated as schwa-deleting in Indic <-> Indic conversion.
    _SCHWA_DROPPING_INDIC_LANGS = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
    # Inclusive code point range that is offset-shifted between scripts:
    # start of the Devanagari block up to the end of the Malayalam block.
    # Anything outside it (digits, Latin letters, joiners, ...) is copied
    # through unchanged.
    _INDIC_FIRST = 0x0900
    _INDIC_LAST = 0x0D7F
    # Applied in order after shifting into the Tamil block: each code point
    # on the left is replaced with the Tamil text on the right.
    _TAMIL_REPLACEMENTS = (
        (u'\u0B96', u"க"),
        (u'\u0B97', u"க"),
        (u'\u0B98', u"க"),
        (u'\u0B9B', u"ச"),
        (u'\u0B9D', u"ச"),
        (u'\u0BA0', u"ட"),
        (u'\u0BA1', u"ட"),
        (u'\u0BA2', u"ட"),
        (u'\u0BA5', u"த"),
        (u'\u0BA6', u"த"),
        (u'\u0BA7', u"த"),
        (u'\u0BAB', u"ப"),
        (u'\u0BAC', u"ப"),
        (u'\u0BAD', u"ப"),
        (u'\u0BC3', u"ிரு"),
        (u'ஂ', u'ம்'),
    )

    def __init__(self):
        """Set up the template path, the CMU dictionary and the response."""
        self.template = os.path.join(os.path.dirname(__file__),
                                     'transliterate.html')
        self.cmu = CMUDict()
        self.response = SilpaResponse(self.template)

    def transliterate_en_ml(self, word):
        """Transliterate an English word to Malayalam via the CMU
        pronunciation dictionary.
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """Transliterate an English word to Kannada via the CMU
        pronunciation dictionary.
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_xx(self, word, target_lang):
        """Transliterate an English word to any supported Indian language.

        English targets return ``word`` unchanged.  Kannada and Malayalam
        come straight from the CMU dictionary; every other target is
        produced by converting the Malayalam result with
        ``transliterate_indic_indic``.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain through Indic -> Indic transliteration.  Drop the
        # Malayalam-specific zero width joiner first so that a trailing
        # virama is visible to the check below.
        # NOTE(review): the joiner literal was lost from the original
        # source (it had become an empty string, making the replace a
        # no-op); U+200D (ZWJ) is assumed here -- confirm.
        tx_str = tx_str.replace(u'\u200d', u'')
        if (tx_str[-1:] == u'്'
                and target_lang in self._SCHWA_DROPPING_LANGS):
            tx_str = tx_str[:-1]  # remove the last virama
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """Transliterate a word in an Indian language to English.

        English sources return ``word`` unchanged.  Kannada and Malayalam
        are romanised directly; any other source is first converted to
        Malayalam and then romanised.
        """
        if src_lang in ("en_IN", "en_US"):
            return word
        # TODO: transliterate_indic_en is generic, but its data only covers
        # kn_IN and ml_IN, hence this dispatch.  Once every Indic language
        # is available there, replace this block with a single call.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """Transliterate ``word`` from ``src_language`` to ISO 15919.

        Each character is mapped through ``charmap["ISO15919"]`` using its
        offset from ``lang_bases[src_language]``.  Returns the result
        decoded from UTF-8.
        """
        tx_str = ""
        for char in word:
            offset = ord(char) - lang_bases[src_language]
            # Offsets 61..76 (76 is the virama for all Indian languages)
            # replace the inherent 'a' of the preceding consonant.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
        # Delete the inherent 'a' at the end of a multi-letter word for
        # Hindi-like languages.  Only the state after the last character
        # matters, so the check runs once after the loop.
        if (len(word) > 1
                and src_language in self._SCHWA_DROPPING_LANGS
                and tx_str[-1:] == 'a'):
            tx_str = tx_str[:-1]
        return tx_str.decode("utf-8")

    def transliterate_ipa(self, word, src_language):
        """Transliterate ``word`` from ``src_language`` to IPA
        (International Phonetic Alphabet) notation.

        Characters with a code point below 255 are copied unchanged; the
        rest are mapped through ``charmap["IPA"]``.  Returns the result
        decoded from UTF-8.
        """
        schwa = 'ə'
        # The schwa may be wider than one element (it is two bytes in a
        # UTF-8 byte string), so always slice and compare by its length.
        schwa_width = len(schwa)
        tx_str = ""
        for char in word:
            if ord(char) < 255:  # ASCII characters + English
                tx_str += char
                continue
            offset = ord(char) - lang_bases[src_language]
            # Offsets 61..76 (76 is the virama for all Indian languages)
            # replace the inherent schwa of the preceding consonant.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-schwa_width]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["IPA"][offset]
        # Delete the inherent schwa at the end of a multi-letter word for
        # Hindi-like languages.
        if (len(word) > 1
                and src_language in self._SCHWA_DROPPING_LANGS
                and tx_str[-schwa_width:] == schwa):
            tx_str = tx_str[:-schwa_width]
        return tx_str.decode("utf-8")

    def _malayalam_fixes(self, text):
        """Apply Malayalam-specific clean-ups to transliterated ``text``.

        Best effort: if ``text`` cannot take the unicode replacements it
        is returned with whatever fixes were applied up to that point.
        """
        try:
            text = text.replace(u"മ് ", u"ം ")
            text = text.replace(u"മ്,", u"ം,")
            text = text.replace(u"മ്.", u"ം.")
            text = text.replace(u"മ്)", u"ം)")
            text = text.replace(u"ഩ", u"ന")
            # Danda by full stop.  U+0D64 is where the Devanagari danda
            # (U+0964) lands after the offset shift into the Malayalam
            # block.  An empty search string must never be used here: it
            # would insert "." between every character.
            text = text.replace(u"\u0d64", u".")
        except (UnicodeError, TypeError, AttributeError):
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """Transliterate a word from one Indian language to another.

        The word is normalised, then every character inside the Indic
        code point range is shifted by ``getOffset(src_lang, target_lang)``.
        Punctuation and characters outside that range are copied as is.
        """
        tx_str = ""
        word = normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # replace all samvruthokaram by u vowels
            word = word.replace(u"ു്", u"")

        shift = self.getOffset(src_lang, target_lang)
        word_length = len(word)
        for index, char in enumerate(word, 1):
            code_point = ord(char)
            if (char in string.punctuation
                    or code_point < self._INDIC_FIRST
                    or code_point > self._INDIC_LAST):
                tx_str = tx_str + char
                continue
            offset = code_point + shift
            if offset > 0:
                tx_str = tx_str + unichr(offset)
            # Schwa deletion: drop a word-final virama (offset 76 from the
            # language base) for languages that do not write it.
            # TODO Add more languages having schwa deletion characteristic
            baseoffset = offset - lang_bases[target_lang]
            if (index == word_length and baseoffset == 76
                    and target_lang in self._SCHWA_DROPPING_INDIC_LANGS):
                tx_str = tx_str[:-1]

        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")
        if target_lang == "ta_IN":
            for shifted, tamil in self._TAMIL_REPLACEMENTS:
                tx_str = tx_str.replace(shifted, tamil)
        # If target is Malayalam, we need to add the virama.  The slice
        # keeps an empty result from raising IndexError.
        if (target_lang == "ml_IN"
                and src_lang in self._SCHWA_DROPPING_INDIC_LANGS
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"്"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """Romanise Indic text using the per-language lookup tables.

        Arguments:
        - `word`: Word (or sentence) to be transliterated
        - `src_lang`: Language from which we need to transliterate

        Returns the English transliteration as a string.
        """
        # Get all the language related data
        dictionary = get_dictionary_for(src_lang)
        vowels = get_vowels_for(src_lang)
        vowel_signs = get_vowel_signs_for(src_lang)
        virama = get_virama_for(src_lang)
        anuswara = get_anuswara_for(src_lang)

        word_length = len(word)
        tx_string = ""
        for index, char in enumerate(word):
            # Copy punctuation as is; this avoids an extra 'a' at the
            # beginning of the word that follows a punctuation symbol.
            if char in string.punctuation:
                tx_string += char
                continue
            # Virama = conjunct marker, it has no output of its own.
            if char == virama:
                continue

            # Get the English equivalent of the character; characters
            # missing from the dictionary are appended unchanged.
            try:
                tx_string += dictionary[char]
            except KeyError:
                tx_string += char

            has_next = index + 1 < word_length
            is_last = index + 1 == word_length
            if (has_next
                    and word[index + 1] not in vowel_signs
                    and word[index + 1] in dictionary
                    and char not in vowels
                    and char not in vowel_signs):
                tx_string += 'a'
            if is_last and char not in vowel_signs and char in dictionary:
                tx_string += 'a'
            # handle am sign
            if (has_next and word[index + 1] == anuswara
                    and char not in vowel_signs):
                tx_string += 'a'
        return tx_string

    @ServiceMethod
    def transliterate(self, text, target_lang_code):
        """Transliterate ``text`` word by word into ``target_lang_code``.

        ``target_lang_code`` is a language code such as "ml_IN", or one of
        the notations "ISO15919" / "IPA".  The source language of every
        word is detected with ``detect_lang``.  Each transliterated word
        is followed by a space and each input line by a newline.
        """
        pieces = []
        for line in text.split("\n"):
            for word in line.split(" "):
                if not word.strip():
                    pieces.append(word)
                    continue
                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # FIXME: language detection failed; keep the word as is
                    pieces.append(" " + word)
                    continue
                if target_lang_code == "ISO15919":
                    converted = self.transliterate_iso15919(word,
                                                            src_lang_code)
                elif target_lang_code == "IPA":
                    converted = self.transliterate_ipa(word, src_lang_code)
                elif src_lang_code == "en_US":
                    converted = self.transliterate_en_xx(word,
                                                         target_lang_code)
                elif target_lang_code in ("en_US", "en_IN"):
                    converted = self.transliterate_xx_en(word, src_lang_code)
                else:
                    converted = self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                pieces.append(converted + " ")
            pieces.append("\n")
        tx_str = "".join(pieces)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """Return the code point distance from ``src`` to ``target``.

        Both are looked up in ``lang_bases``; 0 is returned when either
        language is missing from it.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except KeyError:
            return 0

    def get_module_name(self):
        """Return the display name of this module."""
        return "Transliterator"

    def get_info(self):
        """Return a one-line description of this module."""
        return "Transliterate the text between any Indian Language"
def __init__(self):
    """Create the CMU dictionary helper and fetch the normalizer instance.

    The dictionary is stored on ``self.cmu`` and the object returned by
    ``normalizer.getInstance()`` on ``self.normalizer``.
    """
    cmu_dictionary = CMUDict()
    self.cmu = cmu_dictionary
    normalizer_instance = normalizer.getInstance()
    self.normalizer = normalizer_instance
# -*- coding: utf-8 -*-
# Small command line check for CMUDict: when run as a script, load the
# dictionary data file and print every word followed by the entry stored
# for it (presumably its pronunciation -- confirm against CMUDict).
from cmudict import CMUDict

if __name__ == '__main__':
    cmu_entries = CMUDict('data/cmudict-0.7b')
    for entry in cmu_entries.words():
        print(entry)
        print(cmu_entries[entry])