Пример #1
0
 def is_chinese(self,text):
     """Classify predominantly Chinese text as Simplified and/or Traditional.

     Args:
         text: Unicode string to inspect.

     Returns:
         A list of ``[weight, code]`` pairs:
         ``[[1, 'ZH-CHS']]`` for Simplified text,
         ``[[1, 'ZH-CHT']]`` for Traditional text,
         ``[[1, 'ZH-CHT'], [1, 'ZH-CHS']]`` when hanzidentifier reports
         BOTH or MIXED, and ``[]`` when the text has no Chinese characters,
         too few of them, or an unrecognised classification.
     """
     if not hanzidentifier.has_chinese(text):
         return []

     # Count the matched characters directly. Measuring the length of
     # str(list) also counts quotes, separators and (on Python 2) the
     # escaped form of every character, which inflates the total.
     # A plain u'' literal is used because the ur'' prefix is a syntax
     # error on Python 3.
     han_runs = re.findall(u'[\u4e00-\u9fff]+', text)
     han_count = sum(len(run) for run in han_runs)

     # Require strictly more than one third of the text to be Chinese.
     # Multiplying avoids the Python 2 / Python 3 division difference.
     if len(text) >= 3 * han_count:
         return []

     # Classify once and compare by value; the constants are plain ints.
     script = hanzidentifier.identify(text)
     if script == hanzidentifier.SIMPLIFIED:
         return [[1, 'ZH-CHS']]
     if script == hanzidentifier.TRADITIONAL:
         return [[1, 'ZH-CHT']]
     if script in (hanzidentifier.BOTH, hanzidentifier.MIXED):
         return [[1, 'ZH-CHT'], [1, 'ZH-CHS']]
     return []
Пример #2
0
def checktype(s):
    """Map a string to a code from 1 to 10.

    The code combines the hanzidentifier classification of ``s`` with
    whether ``s`` starts with an ASCII letter or digit:

        MIXED        -> 1 (ASCII start) / 2
        TRADITIONAL  -> 3 (ASCII start) / 4
        SIMPLIFIED   -> 5 (ASCII start) / 6
        BOTH         -> 7 (ASCII start) / 8
        anything else-> 9 (ASCII start) / 10

    Args:
        s: The string to classify.

    Returns:
        An int between 1 and 10 as described above.
    """
    def isEng(s):
        # re.match anchors at the start, so only the first character counts.
        # Test the match object itself instead of searching str(type(...))
        # for 're.Match', which only works on Python 3.7+.
        # The class is [0-9A-Za-z]; the former 'A-z' range also accepted
        # the punctuation between 'Z' and 'a' ([, \, ], ^, _ and `).
        return 1 if re.match('[0-9A-Za-z]', s) is not None else 0

    # Classify once and compare by value; the constants are plain ints.
    script = ha.identify(s)
    if script == ha.MIXED:
        ascii_code = 1
    elif script == ha.TRADITIONAL:
        ascii_code = 3
    elif script == ha.SIMPLIFIED:
        ascii_code = 5
    elif script == ha.BOTH:
        ascii_code = 7
    else:
        ascii_code = 9

    # Each classification owns an odd/even pair: odd when the string
    # starts with an ASCII letter or digit, the following even otherwise.
    return ascii_code if isEng(s) == 1 else ascii_code + 1
Пример #3
0
    def check_text(text_input):
        """
        Validate that the input contains Chinese characters.

        The input is passed to hanzI.identify(); a result of 0 is treated as
        "no Chinese characters found".

        Args:
            text_input (str): The string to validate.

        Returns:
            text_input (str): The same string, unchanged, when validation passes.

        Raises:
            ChineseCharsNotFound: When identify() returns 0 for text_input.
        """
        if hanzI.identify(text_input) != 0:
            return text_input
        raise ChineseCharsNotFound(text_input)
Пример #4
0
 def test_return_simp(self):
     """Text containing a simplified-only character is identified as SIMP."""
     detected = hanzidentifier.identify(u'Thomas 说:你好!')
     self.assertEqual(detected, hanzidentifier.SIMP)
Пример #5
0
 def languageRules(self, lang1, lang2, lang3, dLanguages, text):
   """Pick a language code for ``text`` using hand-written heuristics.

   The rules run in this order:
     1. Tagalog cues, only when 'tl' is a key of ``dLanguages``.
     2. A series of ``self.compareLangs`` calls; the first non-empty
        result is returned immediately.
     3. A Chinese check via ``hanzidentifier`` / ``self.is_chinese``.
     4. An if/elif chain keyed on the (lang1, lang2) pair, using
        substring and word cues, with ``self.SWdetect_language`` as a
        fallback in some branches.

   Args:
     lang1, lang2, lang3: Language codes compared against literals such
       as 'pt', 'en', 'tl' (presumably the detector's ranked guesses --
       TODO confirm with the caller).
     dLanguages: Mapping; only the presence of the 'tl' key is checked.
     text: The string searched for cue words and characters.

   Returns:
     The selected language code. When no rule assigns one, the empty
     result of the last ``compareLangs`` call is returned.
   """
   language = ''
   # Tagalog cues: substrings and words in the text.
   if 'tl' in dLanguages.keys():
     if (text.find('ggong') > -1) or (text.find(' ang ') > -1) or (text.find('oong') > -1) or (text.find(' pa ') > -1):
       language = 'tl'  
     if ( text.find(' na ') > -1) and ((lang1 != 'pt')  and  (lang2 != 'pt') ):
       language = 'tl'
     if ((lang2 != 'es')  and  (lang2 != 'fr')  and  (lang1 == 'tl') ) or ((lang1 != 'es')  and (lang1 != 'fr')  and  (lang2 == 'tl') ):     
       if (self.find_words(text, 'tayo')) or (text.find(' ni ') > -1) :
          language = 'tl' 
   # NOTE(review): the next assignment runs unconditionally, so any 'tl'
   # chosen above is overwritten before it can be returned -- confirm
   # whether an early return was intended here.
   #
   # Each compareLangs call receives a candidate triple, the three codes
   # passed to this method, per-candidate cue-word lists, and the text.
   # A non-empty result short-circuits the rest of the method.
   language = self.compareLangs(['pt', 'es','it'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['nuevo'],['ed']], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['pt', 'id','it'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['ini'],['ed']], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['pt', 'es','fr'], [lang1, lang2, lang3], [['aqui','você','com','daqui'], ['es','nuevo'],['est']], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['en', 'pt', 'es'], [lang1, lang2, lang3], [['much','at','yes','day','you','this','is','access'], ['você'], []], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['en', 'id', 'it'], [lang1, lang2, lang3], [['much','about','yes','day','you','this','is','access'], ['ini'], ['ed']], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['en', 'fr', 'es'], [lang1, lang2, lang3], [['much','yes','day','you','this','is','access'], [], []], text)
   if len(language)>0:
     return language
   language = self.compareLangs(['en', 'it', 'fr'], [lang1, lang2, lang3], [['much','yes','day','you','this','is','access'], ['ed'], []], text)
   if len(language)>0:
     return language
   # Chinese: when identify() is positive, reduce the text with
   # only_chinese() and take the code of the first is_chinese() entry.
   # This is a standalone `if`, so the pair rules below can still
   # overwrite the value chosen here.
   if hanzidentifier.identify(text) > 0:
        txt = self.only_chinese(text)
        if len(txt) > 1:
          ret = self.is_chinese(txt)
          if len(ret) > 0:
            language = ret[0][1]         
   # Pairwise rules: only the first branch whose (lang1, lang2) condition
   # matches is evaluated.
   # Indonesian vs Turkish
   if ((lang1 == 'id')  and  (lang2 == 'tr') )  or  ((lang2 == 'id')  and  (lang1 == 'tr') ) :     
     if text.find('ş') > -1:
       language = 'tr'
   # Indonesian vs Tagalog
   elif ((lang2 == 'id')  and  (lang1 == 'tl') ) or ((lang1 == 'id')  and  (lang2 == 'tl') ):     
     if (self.find_words(text, 'nalang')) or (self.find_words(text, 'ngayon')) or (self.find_words(text, 'akong')) or (self.find_words(text, 'ba')) or (self.find_words(text, 'sa')) or (self.find_words(text, 'daw')) or (text.find('nasa') > -1) or (text.find(' at ') > -1)  or (text.find(' na ') > -1)  or (text.find('sila ') > -1) or (self.find_words(text, 'ka')):
        language = 'tl' 
     else:
        # Fallback detector; its answer is kept only if it is 'tl' or 'id'.
        language = self.SWdetect_language(text)         
        if not( language in ['tl','id'] ):
           language = ''
   # Turkish vs Azerbaijani (this order only)
   elif ((lang1 == 'tr')  and  (lang2 == 'az') ):     
     if text.find('ə') > -1:
        language = 'az'
   # Arabic vs Persian
   elif ((lang2 == 'ar')  and  (lang1 == 'fa') ) or ((lang1 == 'ar')  and  (lang2 == 'fa') ):     
     if self.findInText(['ﻩو','ة','دو'], text) or (self.find_words(text, 'سعيد')) or (self.find_words(text, 'حط')) :
       language = 'ar'
     elif self.findInText(['دوازده','وﻩ','پ','چ','ژ','گ'], text):       
       language = 'fa'  
   # English vs Azerbaijani
   elif ((lang2 == 'en')  and  (lang1 == 'az') ) or ((lang1 == 'en')  and  (lang2 == 'az') ):     
     if (text.find('ç') > -1) or  (text.find('ə') > -1) or (text.find('ş') > -1):
        language = 'az'
     elif (text.find(' to ') > -1) or (text.find(' day ') > -1) or (text.find(' one ') > -1):
        language = 'en'  
   # English vs Indonesian
   elif ((lang2 == 'en')  and  (lang1 == 'id') ) or ((lang1 == 'en')  and  (lang2 == 'id') ):     
     if (self.find_words(text, 'day')) or (self.find_words(text, 'your')) or (self.find_words(text, 'you')) or (self.find_words(text, 'all')) or (text.find("i'm") > -1) or (self.find_words(text, 'for')) or  (text.find('thy') > -1) or (text.find('ts') > -1) or (text.find(' my ') > -1) or (text.find(' are ') > -1) or (text.find("aren't") > -1):
        language = 'en' 
   # English vs Portuguese: two independent ifs, so 'você' wins over the
   # English cues when both are present.
   elif ((lang2 == 'en')  and  (lang1 == 'pt') ) or ((lang1 == 'en')  and  (lang2 == 'pt') ):     
     if (text.find('you') > -1) or self.find_words(text, 'is') :
        language = 'en' 
     if (text.find('você') > -1):
        language = 'pt' 
   # French vs Spanish
   elif ((lang2 == 'fr')  and  (lang1 == 'es') ) or ((lang1 == 'fr')  and  (lang2 == 'es') ):     
     if (text.find('dades ') > -1) :
        language = 'es' 
   # French vs Portuguese
   elif ((lang2 == 'fr')  and  (lang1 == 'pt') ) or ((lang1 == 'fr')  and  (lang2 == 'pt') ):     
     if (text.find('dades ') > -1) :
        language = 'pt' 
   # Portuguese vs Spanish: two independent ifs, so the Spanish cues win
   # when both sets are present.
   elif ((lang2 == 'pt')  and  (lang1 == 'es') ) or ((lang1 == 'pt')  and  (lang2 == 'es') ):     
     if (self.find_words(text, 'lá')) or (text.find('ç') > -1) or (self.find_words(text, '(sel')) or self.find_words(text, 'lua') or (self.find_words(text, 'ali')):
        language = 'pt' 
     if (self.find_words(text, 'y')) or (self.find_words(text, 'has')) or (self.find_words(text, 'del')):
        language = 'es' 
   # English vs Spanish
   elif ((lang2 == 'en')  and  (lang1 == 'es') ) or ((lang1 == 'en')  and  (lang2 == 'es') ):     
     if (text.find('lly') > -1) or (text.find('you') > -1) or (text.find(' st') > -1) or (self.find_words(text, 'him'))  or (self.find_words(text, 'been'))  or (self.find_words(text, 'has'))  or (self.find_words(text, 'the')):
        language = 'en' 
     elif (text.find('leer') > -1):
        language = 'es' 
   # English vs Tagalog
   elif ((lang2 == 'en')  and  (lang1 == 'tl') ) or ((lang1 == 'en')  and  (lang2 == 'tl') ):     
     if self.find_word_ending_with(text, 'ght') or (self.find_words(text, 'morning')) or (self.find_words(text, 'much')) or (text.find('easy') > -1) or (text.find('by ') > -1) or (self.find_words(text, 'i')) or (self.find_words(text, 'my')) or (self.find_words(text, 'you')) or (self.find_words(text, 'for')) or (self.find_words(text, 'the')):
        language = 'en'  
     elif (self.find_words(text, 'ka')) or (self.find_words(text, 'ako')) or (self.find_words(text, 'isa')):
        language = 'tl' 
   # English vs French
   elif ((lang2 == 'en')  and  (lang1 == 'fr') ) or ((lang1 == 'en')  and  (lang2 == 'fr') ):     
     if (self.find_words(text, 'my')) or (self.find_words(text, 'yes')) or (self.find_words(text, 'be')) or (self.find_words(text, 'this')) or (self.find_words(text, 'for')) or  (text.find(' you ') > -1) or (text.find(' our ') > -1) or (text.find('will') > -1) or (text.find(' one') > -1) or (text.find('ee') > -1) :
        language = 'en' 
     elif (text.find('è') > -1):
        language = 'fr' 
     else:
        language = self.SWdetect_language(text)
   # English vs Italian: two independent ifs, so the Italian cues win
   # when both sets are present.
   elif ((lang2 == 'en')  and  (lang1 == 'it') ) or ((lang1 == 'en')  and  (lang2 == 'it') ):   
     if (text.find('ey') > -1) or (self.find_words(text, 'call'))  or (text.find('thy') > -1) or self.find_word_ending_with(text, 'ed') or (self.find_words(text, 'is')) or (self.find_words(text, 'for'))  or (self.find_words(text, 'have')) or (self.find_words(text, 'to')) or (self.find_words(text, 'i')) or (self.find_words(text, 'all')) or (self.find_words(text, 'never')) or (text.find('well') > -1) or (text.find('ts') > -1) or (text.find('ing ') > -1) or (text.find(' of ') > -1):
        language = 'en' 
     if (text.find('è') > -1) or (self.find_words(text, 'ed'))  or (self.find_words(text, 'che')) :
        language = 'it' 
   # NOTE(review): (lang1 == 'es', lang2 == 'en') is already matched by the
   # English vs Spanish branch above, so this branch looks unreachable.
   elif ((lang1 == 'es')  and  (lang2 == 'en') ):     
     if (text.find('my') > -1) or (text.find('ck') > -1) or (self.find_words(text, 'i')) or (self.find_words(text, 'never')):
        language = 'en' 
   # Italian vs Spanish (this order only)
   elif ((lang1 == 'it')  and  (lang2 == 'es') ):     
     if (text.find('nn') > -1) or (text.find('à') > -1)  or (text.find('ù') > -1):
        language = 'it' 
     elif (text.find('ñ') > -1) or (text.find(' ni ') > -1):
        language = 'es'
     else:
        language = self.SWdetect_language(text)
   # NOTE(review): this pt/es condition repeats the Portuguese vs Spanish
   # branch above, so this branch looks unreachable.
   elif ((lang1 == 'pt')  and  (lang2 == 'es') ) or ((lang2 == 'pt')  and  (lang1 == 'es') ): 
     if (text.find('ñ') > -1) or (text.find('ll') > -1) or (text.find(' y ') > -1)  or (text.find(' el ') > -1)  or (text.find(' hay') > -1):
        language = 'es'
     elif (text.find('ç') > -1) or (text.find(' e ') > -1)  or (text.find('à') > -1):
        language = 'pt'
     else:
        language = self.SWdetect_language(text)
   return language
 def test_return_mixed(self):
     """identify() reports MIXED for the MIXED sample."""
     expected = hanzidentifier.MIXED
     actual = hanzidentifier.identify(MIXED)
     self.assertEqual(expected, actual)
Пример #7
0
 def filter_text(self):
     """
     Return the items of self.text_input for which hanzI.identify() is
     greater than 0, in their original order.
     """
     kept = []
     for item in self.text_input:
         if hanzI.identify(item) > 0:
             kept.append(item)
     return kept
Пример #8
0
 def test_return_both(self):
     """Text holding a simplified and a traditional form is identified as BOTH."""
     detected = hanzidentifier.identify(
         u'Country in simplified: 国. Country in traditional: 國')
     self.assertEqual(detected, hanzidentifier.BOTH)
 def test_return_simplified(self):
     """identify() reports SIMPLIFIED for the SIMPLIFIED sample."""
     expected = hanzidentifier.SIMPLIFIED
     actual = hanzidentifier.identify(SIMPLIFIED)
     self.assertEqual(expected, actual)
 def test_return_unknown(self):
     """identify() reports UNKNOWN for the UNKNOWN sample."""
     expected = hanzidentifier.UNKNOWN
     actual = hanzidentifier.identify(UNKNOWN)
     self.assertEqual(expected, actual)
Пример #11
0
 def test_return_both(self):
     """Text holding a simplified and a traditional form is identified as BOTH."""
     detected = hanzidentifier.identify(
         u'Country in simplified: 国. Country in traditional: 國')
     self.assertEqual(detected, hanzidentifier.BOTH)
Пример #12
0
 def test_return_either(self):
     """Characters shared by both scripts are identified as EITHER."""
     detected = hanzidentifier.identify(u'你好!')
     self.assertEqual(detected, hanzidentifier.EITHER)
Пример #13
0
 def test_return_trad(self):
     """Text containing a traditional-only character is identified as TRAD."""
     detected = hanzidentifier.identify(u'Thomas 說:你好!')
     self.assertEqual(detected, hanzidentifier.TRAD)
Пример #14
0
 def test_return_simp(self):
     """Text containing a simplified-only character is identified as SIMP."""
     detected = hanzidentifier.identify(u'Thomas 说:你好!')
     self.assertEqual(detected, hanzidentifier.SIMP)
Пример #15
0
 def test_return_none(self):
     """Text without Chinese characters makes identify() return None."""
     detected = hanzidentifier.identify('Hello my name is Thomas.')
     self.assertIsNone(detected)
Пример #16
0
 def test_return_trad(self):
     """Text containing a traditional-only character is identified as TRAD."""
     detected = hanzidentifier.identify(u'Thomas 說:你好!')
     self.assertEqual(detected, hanzidentifier.TRAD)
Пример #17
0
 def test_return_either(self):
     """Characters shared by both scripts are identified as EITHER."""
     detected = hanzidentifier.identify(u'你好!')
     self.assertEqual(detected, hanzidentifier.EITHER)
 def test_return_traditional(self):
     """identify() reports TRADITIONAL for the TRADITIONAL sample."""
     expected = hanzidentifier.TRADITIONAL
     actual = hanzidentifier.identify(TRADITIONAL)
     self.assertEqual(expected, actual)
Пример #19
0
 def test_return_none(self):
     """Text without Chinese characters makes identify() return None."""
     detected = hanzidentifier.identify('Hello my name is Thomas.')
     self.assertIsNone(detected)
 def test_return_both(self):
     """identify() reports BOTH for the BOTH sample."""
     expected = hanzidentifier.BOTH
     actual = hanzidentifier.identify(BOTH)
     self.assertEqual(expected, actual)