def is_chinese(self, text):
    """Classify *text* as Simplified and/or Traditional Chinese.

    The text only qualifies when characters in the range U+4E00..U+9FFF
    make up more than one third of its length; otherwise nothing is
    reported.

    Args:
        text: Unicode string to inspect.

    Returns:
        A list of ``[1, code]`` pairs, where ``code`` is ``'ZH-CHS'``
        (simplified) or ``'ZH-CHT'`` (traditional).  Both pairs are
        returned, traditional first, when hanzidentifier reports BOTH or
        MIXED.  An empty list is returned when the text has no Chinese,
        too little Chinese, or an unrecognised classification.
    """
    if not hanzidentifier.has_chinese(text):
        return []

    # Count the matched characters directly.  The previous implementation
    # measured len(str(list_of_runs)), which depends on the list's repr
    # (quotes, separators, escapes) rather than on the number of characters.
    # A plain u'' literal is used because the ur'' prefix is a syntax error
    # on Python 3.
    chinese_runs = re.findall(u'[\u4e00-\u9fff]+', text)
    chinese_chars = sum(len(run) for run in chinese_runs)

    # At least 1/3 of the sentence must be Chinese characters.  Written as
    # an integer comparison so the result does not depend on the division
    # semantics of the interpreter.
    if len(text) >= 3 * chinese_chars:
        return []

    # Identify once and compare by value; `is` on integer constants only
    # works by accident of small-int caching.
    kind = hanzidentifier.identify(text)
    if kind == hanzidentifier.SIMPLIFIED:
        return [[1, 'ZH-CHS']]
    if kind == hanzidentifier.TRADITIONAL:
        return [[1, 'ZH-CHT']]
    if kind in (hanzidentifier.BOTH, hanzidentifier.MIXED):
        return [[1, 'ZH-CHT'], [1, 'ZH-CHS']]
    return []
def checktype(s):
    """Return a numeric code describing the script type of *s*.

    The code combines the classification returned by ``ha.identify(s)``
    with whether *s* starts with an ASCII letter or digit:

        ==============  ===================  =========
        classification  starts with [0-9A-Za-z]  otherwise
        ==============  ===================  =========
        MIXED           1                    2
        TRADITIONAL     3                    4
        SIMPLIFIED      5                    6
        BOTH            7                    8
        anything else   9                    10
        ==============  ===================  =========

    Args:
        s: String to classify.

    Returns:
        An int between 1 and 10 as described above.
    """
    # re.match only looks at the start of the string.  The class is spelled
    # A-Za-z because the previous 'A-z' range also accepted the punctuation
    # between 'Z' and 'a' ([ \ ] ^ _ `).  Testing the result against None
    # replaces a search for 're.Match' in the type's name, which fails on
    # interpreters where the match class is named differently.
    starts_ascii = re.match(r'[0-9A-Za-z]', s) is not None

    # Identify once and compare by value rather than identity.
    kind = ha.identify(s)
    code = 9
    for candidate, base in (
        (ha.MIXED, 1),
        (ha.TRADITIONAL, 3),
        (ha.SIMPLIFIED, 5),
        (ha.BOTH, 7),
    ):
        if kind == candidate:
            code = base
            break

    return code if starts_ascii else code + 1
def check_text(text_input):
    """Validate that *text_input* is recognised as containing Chinese.

    The input is passed to ``hanzI.identify``; a result equal to ``0`` is
    treated as "no Chinese characters found".

    Args:
        text_input (str): The string to validate.

    Returns:
        text_input (str): The original string, unchanged.

    Raises:
        ChineseCharsNotFound: When ``hanzI.identify`` returns ``0`` for
            text_input.
    """
    if hanzI.identify(text_input) == 0:
        raise ChineseCharsNotFound(text_input)
    return text_input
def test_return_simp(self):
    # Latin text mixed with Chinese: identify() is expected to report SIMP.
    sample = u'Thomas 说:你好!'
    detected = hanzidentifier.identify(sample)
    self.assertEqual(detected, hanzidentifier.SIMP)
def languageRules(self, lang1, lang2, lang3, dLanguages, text):
    """Apply hand-written tie-break rules to pick a language for *text*.

    NOTE(review): the indentation of this method was reconstructed from a
    flattened source.  Where nesting was ambiguous the most conventional
    layout (sibling ``if`` statements, one top-level ``if``/``elif`` chain)
    was chosen -- confirm against the original file.

    Args:
        lang1, lang2, lang3: Candidate language codes, compared against
            two-letter codes such as 'en', 'pt', 'tl' (presumably the
            ranked guesses of an earlier detector -- confirm with callers).
        dLanguages: Mapping whose keys are checked for 'tl'.
        text: The text being classified; probed with ``str.find`` and the
            ``self.find_words`` / ``self.findInText`` helpers.

    Returns:
        The result of the first ``self.compareLangs`` call whose length is
        non-zero; otherwise whatever the later rules leave in ``language``
        (which may still be the empty result of the last ``compareLangs``
        call when no rule matches).
    """
    language = ''
    # Cues that select 'tl'; only evaluated when 'tl' is a key of dLanguages.
    if 'tl' in dLanguages.keys():
        if (text.find('ggong') > -1) or (text.find(' ang ') > -1) or (text.find('oong') > -1) or (text.find(' pa ') > -1):
            language = 'tl'
        if ( text.find(' na ') > -1) and ((lang1 != 'pt') and (lang2 != 'pt') ):
            language = 'tl'
        if ((lang2 != 'es') and (lang2 != 'fr') and (lang1 == 'tl') ) or ((lang1 != 'es') and (lang1 != 'fr') and (lang2 == 'tl') ):
            if (self.find_words(text, 'tayo')) or (text.find(' ni ') > -1) :
                language = 'tl'
    # NOTE(review): with this layout the 'tl' value chosen above is always
    # overwritten by the assignment below before it can be returned --
    # confirm whether an early return was intended.
    # Each compareLangs call receives a language triple, the three
    # candidates, and one word list per language; a non-empty result is
    # returned immediately.
    language = self.compareLangs(['pt', 'es','it'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['nuevo'],['ed']], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['pt', 'id','it'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['ini'],['ed']], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['pt', 'es','fr'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['es','nuevo'],['est']], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['en', 'pt', 'es'], [lang1, lang2, lang3], [['much','at','yes','day','you','this','is','access'], ['você'], []], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['en', 'id', 'it'], [lang1, lang2, lang3], [['much','about','yes','day','you','this','is','access'], ['ini'], ['ed']], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['en', 'fr', 'es'], [lang1, lang2, lang3], [['much','yes','day','you','this','is','access'], [], []], text)
    if len(language)>0:
        return language
    language = self.compareLangs(['en', 'it', 'fr'], [lang1, lang2, lang3], [['much','yes','day','you','this','is','access'], ['ed'], []], text)
    if len(language)>0:
        return language
    # Chinese: keep only the Chinese part of the text and take the code of
    # the first pair returned by is_chinese().
    if hanzidentifier.identify(text) > 0:
        txt = self.only_chinese(text)
        if len(txt) > 1:
            ret = self.is_chinese(txt)
            if len(ret) > 0:
                language = ret[0][1]
    # Pairwise tie-breaks keyed on lang1/lang2.  Only the first matching
    # branch of this chain runs, and it may replace a value set above.
    if ((lang1 == 'id') and (lang2 == 'tr') ) or ((lang2 == 'id') and (lang1 == 'tr') ) :
        if text.find('ş') > -1:
            language = 'tr'
    elif ((lang2 == 'id') and (lang1 == 'tl') ) or ((lang1 == 'id') and (lang2 == 'tl') ):
        if (self.find_words(text, 'nalang')) or (self.find_words(text, 'ngayon')) or (self.find_words(text, 'akong')) or (self.find_words(text, 'ba')) or (self.find_words(text, 'sa')) or (self.find_words(text, 'daw')) or (text.find('nasa') > -1) or (text.find(' at ') > -1) or (text.find(' na ') > -1) or (text.find('sila ') > -1) or (self.find_words(text, 'ka')):
            language = 'tl'
        else:
            language = self.SWdetect_language(text)
            # Discard the fallback's answer unless it is one of the two candidates.
            if not( language in ['tl','id'] ):
                language = ''
    elif ((lang1 == 'tr') and (lang2 == 'az') ):
        if text.find('ə') > -1:
            language = 'az'
    elif ((lang2 == 'ar') and (lang1 == 'fa') ) or ((lang1 == 'ar') and (lang2 == 'fa') ):
        if self.findInText(['ﻩو','ة','دو'], text) or (self.find_words(text, 'سعيد')) or (self.find_words(text, 'حط')) :
            language = 'ar'
        elif self.findInText(['دوازده','وﻩ','پ','چ','ژ','گ'], text):
            language = 'fa'
    elif ((lang2 == 'en') and (lang1 == 'az') ) or ((lang1 == 'en') and (lang2 == 'az') ):
        if (text.find('ç') > -1) or (text.find('ə') > -1) or (text.find('ş') > -1):
            language = 'az'
        elif (text.find(' to ') > -1) or (text.find(' day ') > -1) or (text.find(' one ') > -1):
            language = 'en'
    elif ((lang2 == 'en') and (lang1 == 'id') ) or ((lang1 == 'en') and (lang2 == 'id') ):
        if (self.find_words(text, 'day')) or (self.find_words(text, 'your')) or (self.find_words(text, 'you')) or (self.find_words(text, 'all')) or (text.find("i'm") > -1) or (self.find_words(text, 'for')) or (text.find('thy') > -1) or (text.find('ts') > -1) or (text.find(' my ') > -1) or (text.find(' are ') > -1) or (text.find("aren't") > -1):
            language = 'en'
    elif ((lang2 == 'en') and (lang1 == 'pt') ) or ((lang1 == 'en') and (lang2 == 'pt') ):
        if (text.find('you') > -1) or self.find_words(text, 'is') :
            language = 'en'
        # Checked second, so 'você' wins over the English cues above.
        if (text.find('você') > -1):
            language = 'pt'
    elif ((lang2 == 'fr') and (lang1 == 'es') ) or ((lang1 == 'fr') and (lang2 == 'es') ):
        if (text.find('dades ') > -1) :
            language = 'es'
    elif ((lang2 == 'fr') and (lang1 == 'pt') ) or ((lang1 == 'fr') and (lang2 == 'pt') ):
        if (text.find('dades ') > -1) :
            language = 'pt'
    elif ((lang2 == 'pt') and (lang1 == 'es') ) or ((lang1 == 'pt') and (lang2 == 'es') ):
        if (self.find_words(text, 'lá')) or (text.find('ç') > -1) or (self.find_words(text, '(sel')) or self.find_words(text, 'lua') or (self.find_words(text, 'ali')):
            language = 'pt'
        # Checked second, so the 'es' cues win when both sets match.
        if (self.find_words(text, 'y')) or (self.find_words(text, 'has')) or (self.find_words(text, 'del')):
            language = 'es'
    elif ((lang2 == 'en') and (lang1 == 'es') ) or ((lang1 == 'en') and (lang2 == 'es') ):
        if (text.find('lly') > -1) or (text.find('you') > -1) or (text.find(' st') > -1) or (self.find_words(text, 'him')) or (self.find_words(text, 'been')) or (self.find_words(text, 'has')) or (self.find_words(text, 'the')):
            language = 'en'
        elif (text.find('leer') > -1):
            language = 'es'
    elif ((lang2 == 'en') and (lang1 == 'tl') ) or ((lang1 == 'en') and (lang2 == 'tl') ):
        if self.find_word_ending_with(text, 'ght') or (self.find_words(text, 'morning')) or (self.find_words(text, 'much')) or (text.find('easy') > -1) or (text.find('by ') > -1) or (self.find_words(text, 'i')) or (self.find_words(text, 'my')) or (self.find_words(text, 'you')) or (self.find_words(text, 'for')) or (self.find_words(text, 'the')):
            language = 'en'
        elif (self.find_words(text, 'ka')) or (self.find_words(text, 'ako')) or (self.find_words(text, 'isa')):
            language = 'tl'
    elif ((lang2 == 'en') and (lang1 == 'fr') ) or ((lang1 == 'en') and (lang2 == 'fr') ):
        if (self.find_words(text, 'my')) or (self.find_words(text, 'yes')) or (self.find_words(text, 'be')) or (self.find_words(text, 'this')) or (self.find_words(text, 'for')) or (text.find(' you ') > -1) or (text.find(' our ') > -1) or (text.find('will') > -1) or (text.find(' one') > -1) or (text.find('ee') > -1) :
            language = 'en'
        elif (text.find('è') > -1):
            language = 'fr'
        else:
            language = self.SWdetect_language(text)
    elif ((lang2 == 'en') and (lang1 == 'it') ) or ((lang1 == 'en') and (lang2 == 'it') ):
        if (text.find('ey') > -1) or (self.find_words(text, 'call')) or (text.find('thy') > -1) or self.find_word_ending_with(text, 'ed') or (self.find_words(text, 'is')) or (self.find_words(text, 'for')) or (self.find_words(text, 'have')) or (self.find_words(text, 'to')) or (self.find_words(text, 'i')) or (self.find_words(text, 'all')) or (self.find_words(text, 'never')) or (text.find('well') > -1) or (text.find('ts') > -1) or (text.find('ing ') > -1) or (text.find(' of ') > -1):
            language = 'en'
        # Checked second, so the 'it' cues win when both sets match.
        if (text.find('è') > -1) or (self.find_words(text, 'ed')) or (self.find_words(text, 'che')) :
            language = 'it'
    # NOTE(review): unreachable in this layout -- lang1 == 'es' with
    # lang2 == 'en' is already handled by the en/es branch earlier in the chain.
    elif ((lang1 == 'es') and (lang2 == 'en') ):
        if (text.find('my') > -1) or (text.find('ck') > -1) or (self.find_words(text, 'i')) or (self.find_words(text, 'never')):
            language = 'en'
    elif ((lang1 == 'it') and (lang2 == 'es') ):
        if (text.find('nn') > -1) or (text.find('à') > -1) or (text.find('ù') > -1):
            language = 'it'
        elif (text.find('ñ') > -1) or (text.find(' ni ') > -1):
            language = 'es'
        else:
            language = self.SWdetect_language(text)
    # NOTE(review): unreachable in this layout -- the pt/es pair is already
    # handled by an earlier branch of the chain.
    elif ((lang1 == 'pt') and (lang2 == 'es') ) or ((lang2 == 'pt') and (lang1 == 'es') ):
        if (text.find('ñ') > -1) or (text.find('ll') > -1) or (text.find(' y ') > -1) or (text.find(' el ') > -1) or (text.find(' hay') > -1):
            language = 'es'
        elif (text.find('ç') > -1) or (text.find(' e ') > -1) or (text.find('à') > -1):
            language = 'pt'
        else:
            language = self.SWdetect_language(text)
    return language
def test_return_mixed(self):
    # The MIXED fixture must be classified as hanzidentifier.MIXED.
    expected = hanzidentifier.MIXED
    actual = hanzidentifier.identify(MIXED)
    self.assertEqual(expected, actual)
def filter_text(self):
    """
    Return the items of ``self.text_input`` that are identified as Chinese.

    An item is kept when ``hanzI.identify(item) > 0``; everything else is
    dropped.  The original order is preserved.
    """
    chinese_items = []
    for item in self.text_input:
        if hanzI.identify(item) > 0:
            chinese_items.append(item)
    return chinese_items
def test_return_both(self):
    # A sentence holding both spellings of the same word is expected to be
    # reported as BOTH.
    sample = u'Country in simplified: 国. Country in traditional: 國'
    detected = hanzidentifier.identify(sample)
    self.assertEqual(detected, hanzidentifier.BOTH)
def test_return_simplified(self):
    # The SIMPLIFIED fixture must be classified as hanzidentifier.SIMPLIFIED.
    expected = hanzidentifier.SIMPLIFIED
    actual = hanzidentifier.identify(SIMPLIFIED)
    self.assertEqual(expected, actual)
def test_return_unknown(self):
    # The UNKNOWN fixture must be classified as hanzidentifier.UNKNOWN.
    expected = hanzidentifier.UNKNOWN
    actual = hanzidentifier.identify(UNKNOWN)
    self.assertEqual(expected, actual)
def test_return_either(self):
    # This greeting is expected to be reported as EITHER.
    sample = u'你好!'
    detected = hanzidentifier.identify(sample)
    self.assertEqual(detected, hanzidentifier.EITHER)
def test_return_trad(self):
    # Latin text mixed with Chinese: identify() is expected to report TRAD.
    sample = u'Thomas 說:你好!'
    detected = hanzidentifier.identify(sample)
    self.assertEqual(detected, hanzidentifier.TRAD)
def test_return_none(self):
    # Text made only of Latin characters: identify() is expected to return None.
    sample = 'Hello my name is Thomas.'
    detected = hanzidentifier.identify(sample)
    self.assertIsNone(detected)
def test_return_traditional(self):
    # The TRADITIONAL fixture must be classified as hanzidentifier.TRADITIONAL.
    expected = hanzidentifier.TRADITIONAL
    actual = hanzidentifier.identify(TRADITIONAL)
    self.assertEqual(expected, actual)
def test_return_both(self):
    # The BOTH fixture must be classified as hanzidentifier.BOTH.
    expected = hanzidentifier.BOTH
    actual = hanzidentifier.identify(BOTH)
    self.assertEqual(expected, actual)