def test_langdetect():
    '''TEST: Language Detection'''
    # Each sample is a single word paired with the code that detect_lang is
    # expected to report for it; the result is looked up by the word itself.
    samples = [
        (u'ನಮಸ್ಕಾರ', 'kn_IN'),
        (u'बॆंगलूरु', 'hi_IN'),
        (u'বাংগ্লা', 'bn_IN'),
        (u'മലയാളം', 'ml_IN'),
        (u'தமிள்', 'ta_IN'),
        (u'తెలుగు', 'te_IN'),
        (u'ରିଯା', 'or_IN'),
        (u'ਪਂਜਾਬਿ', 'pa_IN'),
        (u'ગુજરાતિ', 'gu_IN'),
        ("English", 'en_US'),
    ]
    for sample, expected_code in samples:
        assert detect_lang(sample)[sample] == expected_code
def transliterate(self, text, target_lang_code):
    """
    Transliterate text, word by word, into the target language or scheme.

    The text is split into lines on newline characters and each line into
    words on single spaces.  For every non-blank word the source language
    code is looked up with ``detect_lang(word)[word]`` and the word is
    routed to one of the ``self.transliterate_*`` helpers:

    * target ``"ISO15919"`` -> ``transliterate_iso15919``
    * target ``"IPA"`` -> ``transliterate_ipa``
    * source ``"en_US"`` -> ``transliterate_en_xx``
    * target ``"en_US"`` or ``"en_IN"`` -> ``transliterate_xx_en``
    * anything else -> ``transliterate_indic_indic``

    :param text: The text to be transliterated.
    :type text: str.
    :param target_lang_code: The language into which word has to be
        transliterated.
    :type target_lang_code: str.
    :returns: the transliterated text.  When the input has more than one
        line, every line (including the last) is followed by a newline.
    """
    tx_str = ""
    lines = text.split("\n")
    for line in lines:
        for word in line.split(" "):
            if not word.strip():
                # Empty or whitespace-only token: copy it through unchanged.
                tx_str += word
                continue

            try:
                src_lang_code = detect_lang(word)[word]
            except Exception:
                # Best effort: a word whose language cannot be looked up is
                # passed through as-is.  ``Exception`` (not a bare except)
                # so KeyboardInterrupt/SystemExit still propagate.
                # FIXME: the space goes *before* the word here, unlike the
                # other branches, which append it after.
                tx_str += " " + word
                continue

            if target_lang_code == "ISO15919":
                tx_str += self.transliterate_iso15919(word, src_lang_code) + " "
            elif target_lang_code == "IPA":
                tx_str += self.transliterate_ipa(word, src_lang_code) + " "
            elif src_lang_code == "en_US":
                tx_str += self.transliterate_en_xx(word, target_lang_code) + " "
            elif target_lang_code in ("en_US", "en_IN"):
                tx_str += self.transliterate_xx_en(word, src_lang_code) + " "
            else:
                tx_str += self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code)
                # Only this branch makes the separator conditional on the
                # length of the enclosing line.
                if len(line) > 1:
                    tx_str += " "

        if len(lines) > 1:
            tx_str += "\n"

    # Language specific fixes
    if target_lang_code == "ml_IN":
        tx_str = self._malayalam_fixes(tx_str)

    return tx_str
def guessLanguage(self, text):
    """
    Return a language name for the given text.

    The result of ``guessLanguageName(text)`` is returned unless it is
    ``'UNKNOWN'``.  In that case the first whitespace-separated word is
    looked up with ``detect_lang``; the part of that code before the first
    underscore is passed to ``_getName`` and its result is returned.

    :param text: the input text.
    :returns: whatever ``guessLanguageName`` or ``_getName`` returns.
    """
    guessed = guessLanguageName(text)
    if guessed != 'UNKNOWN':
        return guessed

    # Fallback: derive the language from the first word alone.
    head = text.split()[0]
    code = detect_lang(head)[head]
    return _getName(code.split("_")[0])
def syllabify(self, text):
    """
    Syllabify the given text using the syllabifier for its language.

    The language is taken from the first whitespace-separated word of the
    text via ``detect_lang``.  Malayalam, Hindi, Kannada, Bengali, Tamil and
    English (``en_US``) are delegated to the matching ``self.syllabify_*``
    method; for any other language code the text is split into characters.

    :param text: the input text.
    :type text: str.
    :returns: an empty list for blank text, the language-specific
        syllabifier's result for a supported language, otherwise a list of
        the individual characters of the text.
    """
    if not text.strip():
        return []

    # split() without an argument skips leading whitespace, so the word used
    # for detection is never empty (split(" ")[0] is "" for " text").
    first_word = text.split()[0]
    lang = detect_lang(first_word)[first_word]

    if lang == "ml_IN":
        return self.syllabify_ml(text)
    if lang == "hi_IN":
        return self.syllabify_hi(text)
    if lang == "kn_IN":
        return self.syllabify_kn(text)
    if lang == "bn_IN":
        return self.syllabify_bn(text)
    if lang == "ta_IN":
        return self.syllabify_ta(text)
    if lang == "en_US":
        return self.syllabify_en(text)

    # No dedicated syllabifier: fall back to one entry per character.
    return list(text)
def syllabify(self, text):
    """
    Syllabify the given text using the syllabifier for its language.

    The language is taken from the first whitespace-separated word of the
    text via ``detect_lang``.  Malayalam, Hindi, Kannada, Bengali, Tamil and
    English (``en_US``) are delegated to the matching ``self.syllabify_*``
    method; for any other language code the text is split into characters.

    :param text: the input text.
    :type text: str.
    :returns: an empty list for blank text, the language-specific
        syllabifier's result for a supported language, otherwise a list of
        the individual characters of the text.
    """
    if not text.strip():
        return []

    # split() without an argument skips leading whitespace, so the word used
    # for detection is never empty (split(" ")[0] is "" for " text").
    first_word = text.split()[0]
    lang = detect_lang(first_word)[first_word]

    if lang == "ml_IN":
        return self.syllabify_ml(text)
    if lang == "hi_IN":
        return self.syllabify_hi(text)
    if lang == "kn_IN":
        return self.syllabify_kn(text)
    if lang == "bn_IN":
        return self.syllabify_bn(text)
    if lang == "ta_IN":
        return self.syllabify_ta(text)
    if lang == "en_US":
        return self.syllabify_en(text)

    # No dedicated syllabifier: fall back to one entry per character.
    return list(text)
def transliterate(self, text, target_lang_code):
    """
    Transliterate text, word by word, into the target language or scheme.

    The text is split into lines on newline characters and each line into
    words on single spaces.  For every non-blank word the source language
    code is looked up with ``detect_lang(word)[word]`` and the word is
    routed to one of the ``self.transliterate_*`` helpers:

    * target ``"ISO15919"`` -> ``transliterate_iso15919``
    * target ``"IPA"`` -> ``transliterate_ipa``
    * source ``"en_US"`` -> ``transliterate_en_xx``
    * target ``"en_US"`` or ``"en_IN"`` -> ``transliterate_xx_en``
    * anything else -> ``transliterate_indic_indic``

    :param text: The text to be transliterated.
    :type text: str.
    :param target_lang_code: The language into which word has to be
        transliterated.
    :type target_lang_code: str.
    :returns: the transliterated text.  When the input has more than one
        line, every line (including the last) is followed by a newline.
    """
    tx_str = ""
    lines = text.split("\n")
    for line in lines:
        for word in line.split(" "):
            if not word.strip():
                # Empty or whitespace-only token: copy it through unchanged.
                tx_str += word
                continue

            try:
                src_lang_code = detect_lang(word)[word]
            except Exception:
                # Best effort: a word whose language cannot be looked up is
                # passed through as-is.  ``Exception`` (not a bare except)
                # so KeyboardInterrupt/SystemExit still propagate.
                # FIXME: the space goes *before* the word here, unlike the
                # other branches, which append it after.
                tx_str += " " + word
                continue

            if target_lang_code == "ISO15919":
                tx_str += self.transliterate_iso15919(word, src_lang_code) + " "
            elif target_lang_code == "IPA":
                tx_str += self.transliterate_ipa(word, src_lang_code) + " "
            elif src_lang_code == "en_US":
                tx_str += self.transliterate_en_xx(word, target_lang_code) + " "
            elif target_lang_code in ("en_US", "en_IN"):
                tx_str += self.transliterate_xx_en(word, src_lang_code) + " "
            else:
                tx_str += self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code)
                # Only this branch makes the separator conditional on the
                # length of the enclosing line.
                if len(line) > 1:
                    tx_str += " "

        if len(lines) > 1:
            tx_str += "\n"

    # Language specific fixes
    if target_lang_code == "ml_IN":
        tx_str = self._malayalam_fixes(tx_str)

    return tx_str
def transliterate(self, text, target_lang_code):
    """
    Transliterate text, word by word, into the target language or scheme.

    The text is split into lines on newline characters and each line into
    words on single spaces.  For every non-blank word the source language
    code is looked up with ``detect_lang(word)[word]`` and the word is
    routed to one of the ``self.transliterate_*`` helpers:

    * target ``"ISO15919"`` -> ``transliterate_iso15919``
    * target ``"IPA"`` -> ``transliterate_ipa``
    * source ``"en_US"`` -> ``transliterate_en_xx``
    * target ``"en_US"`` or ``"en_IN"`` -> ``transliterate_xx_en``
    * anything else -> ``transliterate_indic_indic``

    :param text: The text to be transliterated.
    :type text: str.
    :param target_lang_code: The language into which word has to be
        transliterated.
    :type target_lang_code: str.
    :returns: the transliterated text.  When the input has more than one
        line, every line (including the last) is followed by a newline.
    """
    tx_str = ""
    lines = text.split("\n")
    for line in lines:
        for word in line.split(" "):
            if not word.strip():
                # Empty or whitespace-only token: copy it through unchanged.
                tx_str += word
                continue

            try:
                src_lang_code = detect_lang(word)[word]
            except Exception:
                # Best effort: a word whose language cannot be looked up is
                # passed through as-is.  ``Exception`` (not a bare except)
                # so KeyboardInterrupt/SystemExit still propagate.
                # FIXME: the space goes *before* the word here, unlike the
                # other branches, which append it after.
                tx_str += " " + word
                continue

            if target_lang_code == "ISO15919":
                tx_str += self.transliterate_iso15919(word, src_lang_code) + " "
            elif target_lang_code == "IPA":
                tx_str += self.transliterate_ipa(word, src_lang_code) + " "
            elif src_lang_code == "en_US":
                tx_str += self.transliterate_en_xx(word, target_lang_code) + " "
            elif target_lang_code in ("en_US", "en_IN"):
                tx_str += self.transliterate_xx_en(word, src_lang_code) + " "
            else:
                tx_str += self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code)
                # Only this branch makes the separator conditional on the
                # length of the enclosing line.
                if len(line) > 1:
                    tx_str += " "

        if len(lines) > 1:
            tx_str += "\n"

    # Language specific fixes
    if target_lang_code == "ml_IN":
        tx_str = self._malayalam_fixes(tx_str)

    return tx_str
def getScriptName(self, text):
    """
    Return the ``detect_lang`` result for the text, serialized with ``dumps``.

    :param text: the input text, passed unchanged to ``detect_lang``.
    :returns: whatever ``dumps`` produces for the detection result
        (presumably a JSON string -- confirm against the file's imports).
    """
    detected = detect_lang(text)
    return dumps(detected)
def guessLanguageId(self, text):
    """
    Return a language identifier for the given text.

    The result of the module-level ``guessLanguage(text)`` call is returned
    unless it is ``'UNKNOWN'``; in that case the first whitespace-separated
    word is looked up with ``detect_lang`` and that value is returned.

    :param text: the input text.
    :returns: the value from ``guessLanguage`` or from ``detect_lang``.
    """
    guessed = guessLanguage(text)
    if guessed != 'UNKNOWN':
        return guessed

    # Fallback: derive the identifier from the first word alone.
    head = text.split()[0]
    return detect_lang(head)[head]
def getScriptName(self, text):
    """
    Return the raw ``detect_lang`` result for the text.

    :param text: the input text, passed unchanged to ``detect_lang``.
    :returns: the object returned by ``detect_lang`` (other code in this
        file indexes it by word, so presumably a mapping -- confirm).
    """
    detected = detect_lang(text)
    return detected