def is_valid(self, word):
    """Tell whether ``word`` is acceptable as a root.

    A root is accepted when it is non-empty, is recognised by
    ``araby.is_arabicword``, has three or four characters and does not
    contain ``araby.ALEF``.
    """
    # An empty or missing word is never a root.
    if not word:
        return False
    # Reject whatever araby does not consider an Arabic word.
    if not araby.is_arabicword(word):
        return False
    # Only tri- and quadri-literal roots are accepted.
    if not 3 <= len(word) <= 4:
        return False
    # A root must not contain the ALEF letter.
    return araby.ALEF not in word
def cleanFile(oldFile, newFile, rawFilesEncoding):
    """Filter and tokenize the words of ``oldFile`` and write them to ``newFile``.

    The input is split on whitespace; a word is kept only when
    ``has_double_shadda(word)`` is false and ``is_arabicword(word)`` is
    true. Each kept word is passed through ``tokenize`` and the resulting
    list is joined with ``service.twoDJoin`` before being written.

    :param oldFile: path of the file to read (always decoded as UTF-8).
    :param newFile: path of the file to write (encoded as UTF-8).
    :param rawFilesEncoding: kept for interface compatibility; this
        function does not use it.
    """
    with open(oldFile, 'r', encoding='utf-8') as source_file:
        raw_words = source_file.read().split()

    tokens = [
        tokenize(word)
        for word in raw_words
        if not has_double_shadda(word) and is_arabicword(word)
    ]
    joined = service.twoDJoin(tokens)
    print(oldFile, newFile)

    # The output used to be opened with encoding="u", which is not a
    # registered codec name and made open() raise LookupError. Write the
    # result with the same encoding the input is read with.
    with open(newFile, 'w', encoding='utf-8') as target_file:
        target_file.write(joined)
def test_word_text(self):
    """Check the word and text predicates exposed by ``Araby``."""
    # is_vocalized(word): bare word first, then a word carrying harakat
    bare_word = u'العربية'
    self.assertFalse(Araby.is_vocalized(bare_word))
    self.assertTrue(Araby.is_vocalized(u'الْعَرَبِيّةُ'))

    # is_vocalizedtext(text): same idea on whole sentences
    bare_text = u"العربية لغة جميلة"
    self.assertFalse(Araby.is_vocalizedtext(bare_text))
    self.assertTrue(Araby.is_vocalizedtext(u'الْعَرَبيَّة لُغَةٌ جَمِيلَةٌ'))

    # is_arabicstring TODO: add more examples
    self.assertTrue(Araby.is_arabicstring(u'العربية'))

    # is_arabicrange TODO: add test

    # is_arabicword TODO: test other cases
    rejected_words = (
        u"",  # empty string
        u"ْلاندخل",  # starts with sukun
        u'ؤكل',  # starts with waw with hamza above
        u'ئكل',  # starts with yeh with hamza above
        u'ةدخل',  # starts with teh marbuta
    )
    for candidate in rejected_words:
        self.assertFalse(Araby.is_arabicword(candidate))
    self.assertTrue(Araby.is_arabicword(u"العربية"))
def is_valid(self, stem):
    """Tell whether ``stem`` is acceptable as a stem entry.

    A single stem must be recognised by ``araby.is_arabicword``. An entry
    holding several stems separated by ';' is accepted as soon as one of
    its parts is non-empty; the parts themselves are not checked.
    """
    # An empty or missing stem is rejected.
    if not stem:
        return False
    # Single stem: it has to be an Arabic word.
    if ";" not in stem:
        return bool(araby.is_arabicword(stem))
    # Multiple stems: rejected only when every part is empty.
    return any(piece for piece in stem.split(';'))
def check_fields(self, fields):
    """Validate the 'vocalized' entry of ``fields`` and register it.

    Returns "ok" when every check passes, otherwise the message of the
    first failing check. The entry is appended to ``self.index`` once it
    is known to be a new Arabic word, before the vocalization checks run.
    """
    vocalized = fields.get('vocalized', '')
    if not vocalized:
        status = "Error: Empty vocalized"
    elif not ar.is_arabicword(vocalized):
        status = "Error: Invalid Arabic word "
    elif vocalized in self.index:
        # already registered by an earlier entry
        status = "Error: Duplicated Entry "
    else:
        self.index.append(vocalized)
        if not ar.is_vocalized(vocalized):
            # the word must carry vocalization marks
            status = "Error: Not Vocalized"
        elif not verify_tashkeel(vocalized):
            # the vocalization itself must be consistent
            status = "Error: Error in Vocalization "
        else:
            status = "ok"
    return status
def mainly(): """ main test """ words =u"""ضلام ألام ضلال لام ظلام ضام غلام إلام نلام هلام ضخام سلام ملام ضلا ضمام تلام علام يلام ضلان كلام ضلتم""".split(" ") source = u"ضلام" normsource = normalize(source) normlist = [normalize(word) for word in words] for word in words: print u"\t".join([word, normalize(word)]).encode("utf8") condidates = filter(lambda w: normalize(w) == normsource, words) print "condidates", u"\t".join(condidates).encode('utf8') editlist = edits1(source) print "len(editlist)", len(editlist) validwords = [word for word in editlist if araby.is_arabicword(word)] print "len(validwords)", len(validwords) print u"\n".join(editlist).encode('utf8')
def mainly(): """ main test """ words = u"""ضلام ألام ضلال لام ظلام ضام غلام إلام نلام هلام ضخام سلام ملام ضلا ضمام تلام علام يلام ضلان كلام ضلتم""".split( " ") source = u"ضلام" normsource = normalize(source) normlist = [normalize(word) for word in words] for word in words: print u"\t".join([word, normalize(word)]).encode("utf8") condidates = filter(lambda w: normalize(w) == normsource, words) print "condidates", u"\t".join(condidates).encode('utf8') editlist = edits1(source) print "len(editlist)", len(editlist) validwords = [word for word in editlist if araby.is_arabicword(word)] print "len(validwords)", len(validwords) print u"\n".join(editlist).encode('utf8')
def check_fields(self, fields):
    """Validate the 'vocalized' verb entry of ``fields`` and register it.

    Returns "ok" when every check passes, otherwise the message of the
    first failing check. A duplicated entry is reported as a warning when
    its 'unvocalized' form has at most three characters and as an error
    otherwise. The entry is appended to ``self.index`` before the
    vocalization checks run.
    """
    vocalized = fields.get('vocalized', '')
    unvocalized = fields.get('unvocalized', '')
    if not vocalized:
        status = "Error: Empty vocalized"
    elif not ar.is_arabicword(vocalized):
        status = "Error: Invalid Arabic word "
    elif not is_valid_infinitive_verb(vocalized):
        status = "Error: Invalid Arabic infinitive verb "
    elif vocalized in self.index:
        # already registered by an earlier entry
        if len(unvocalized) <= 3:
            status = "Warning: Duplicated Entry "
        else:
            status = "Error: Duplicated Entry "
    else:
        self.index.append(vocalized)
        if not ar.is_vocalized(vocalized):
            # the verb must carry vocalization marks
            status = "Error: Not Vocalized"
        elif not verify_tashkeel(vocalized):
            # the vocalization itself must be consistent
            status = "Error: Error in Vocalization "
        else:
            status = "ok"
    return status
print(araby.order(c), end=" ") print() word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى",
if araby.is_weak(c): print ('weak'), if araby.is_moon(c): print ('moon'), if araby.is_sun(c):print ('sun'), print (araby.order(c)), print (); word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", u"سئل لأنه يؤم الإمام" ] word1=u"" for word in word_list: print (word) if araby.is_vocalized(word): print (' is vocalized') if araby.is_vocalizedtext(word): print (' is vocalized text') if araby.is_arabicword(word): print (' is valid word') else: print ("invalid arabic word") print (' strip harakat', araby.strip_harakat(word)) print (' strip tashkeel', araby.strip_tashkeel(word)) print (' strip tatweel',araby.strip_tatweel(word)) print (' normalize ligature ', araby.normalize_ligature(word)) print (' normalize hamza', araby.normalize_hamza(word)) if araby.vocalizedlike(word, word1): print ("vocalized_like") word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")
def accepted(self, word): """ test if word is accecpted word (correct) @param word: input text. @type word: unicode. @return: True if word is accepted rtype: boolean. """ result = self.analyzer.check_word(word) if result: # result has many cases if len(result) > 1: return True #one only case else: return not result[0].is_unknown() return False if __name__ == "__main__": print "test" myrepr = arabRepr.ArabicRepr() speller = SpellcheckClass() text = u" اللغه العربيه" voc = speller.spellcheck(text, True) # print myrepr.repr(voc).encode('utf8') for itemd in voc: if itemd.get('suggest', '') != '': for sug in itemd.get('suggest', '').split(';'): print sug.encode('utf8'), '\t', araby.is_arabicword(sug)
@param word: input text. @type word: unicode. @return: True if word is accepted rtype: boolean. """ result = self.analyzer.check_word(word); if result: # result has many cases if len(result)>1: return True; #one only case else : return not result[0].is_unknown(); return False; if __name__=="__main__": print "test"; myrepr=arabRepr.ArabicRepr(); speller=SpellcheckClass(); text=u" اللغه العربيه" voc = speller.spellcheck(text, True); # print myrepr.repr(voc).encode('utf8') for itemd in voc: if itemd.get('suggest','') !='': for sug in itemd.get('suggest','').split(';'): print sug.encode('utf8'),'\t', araby.is_arabicword(sug)
word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول",
def check_word(self, word, guessedtag=""):
    """
    Analyze one word morphologically and return its candidate analyses.

    Tatweel is stripped from the word first. The form with tashkeel also
    stripped (word_nm) is the cache key and the input of the punctuation,
    stop word, verb, noun and unknown checks; the form that still carries
    its tashkeel (word_vocalised) is used by the normalization, shadda and
    partial vocalization filters.

    @param word: the input word.
    @type word: unicode.
    @param guessedtag: not read by the active code; it only appears in
        the disabled tagger conditions mentioned below.
    @type guessedtag: unicode.
    @return: list of analyses of the word; when nothing was found a single
        wordcase.WordCase of type 'unknown' is appended before returning.
    @rtype: list.
    """
    # NOTE(review): the nesting below (verb/noun checks under the
    # is_arabicword test, filtering and cache update under the `else`)
    # is inferred -- confirm against the intended layout.
    word = araby.strip_tatweel(word)
    word_vocalised = word
    word_nm = araby.strip_tashkeel(word)
    # get analysed details from cache if used
    if self.allow_cache_use and self.cache.is_already_checked(word_nm):
        resulted_data = self.cache.get_checked(word_nm)
    else:
        resulted_data = []
        # if word is a punctuation
        resulted_data += self.check_word_as_pounct(word_nm)
        # A stop word can also be a regular word (verb or noun), so the
        # stop word analysis does not exclude the verb and noun analyses.
        if araby.is_arabicword(word_nm):
            resulted_data += self.check_word_as_stopword(word_nm)
            # Verb analysis. It used to be conditioned on the guessed tag
            # (self.tagger.has_verb_tag / is_stopword_tag); that condition
            # is disabled because some excluded words are in fact verbs
            # or nouns.
            resulted_data += self.check_word_as_verb(word_nm)
            # Noun analysis; the matching guessed-tag condition
            # (self.tagger.has_noun_tag / is_stopword_tag) is disabled too.
            resulted_data += self.check_word_as_noun(word_nm)
        if len(resulted_data) == 0:
            # nothing matched: check the word as unknown
            resulted_data += self.check_word_as_unknown(word_nm)
        # check if the word is normalized and solutions are equivalent
        resulted_data = self.check_normalized(word_vocalised, resulted_data)
        # check if the word is shadda like
        resulted_data = self.check_shadda(word_vocalised, resulted_data,
                                          self.fully_vocalized_input)
        # add word frequency information in tags
        resulted_data = self.add_word_frequency(resulted_data)
        # add the stemmed words details into the cache; the attribute
        # dictionaries of the results are what gets stored
        data_list_to_serialize = [w.__dict__ for w in resulted_data]
        if self.allow_cache_use:
            self.cache.add_checked(word_nm, data_list_to_serialize)
    # check if the word is vocalized like results
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(
            word_vocalised, resulted_data)
    if len(resulted_data) == 0:
        # Still nothing: return a single placeholder case built from the
        # tatweel-stripped word, tagged with the current error code.
        error_code = self.get_error_code()
        resulted_data.append(
            wordcase.WordCase({
                'word': word,
                'affix': ('', '', '', ''),
                'stem': word,
                'original': word,
                'vocalized': word,
                'semivocalized': word,
                'tags': u'%s' % error_code,
                'type': 'unknown',
                'root': '',
                'template': '',
                'freq': self.wordfreq.get_freq(word, 'unknown'),
                'syntax': '',
            }))
    return resulted_data
def is_valid_infinitive_verb(word, vocalized=True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected (False is returned) when one of these holds:
        - araby.is_arabicword(word) is false.
        - its length is below 3 or above 6; the length is measured after
          araby.strip_harakat (when vocalized is true) and after replacing
          ALEF_MADDA by HAMZA + ALEF.
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN, KASRATAN
          or FATHATAN.
        - it contains one of the SHADDA/ALEF sequences, initial YEH pairs,
          TEH pairs or phonetically close pairs tested in the body.
        - it does not match one of the patterns accepted for its length.

    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is an arabic valid word,
    if not araby.is_arabicword(word):
        return False
    if vocalized:
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters
    word_nm = word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
    length = len(word_nm)

    # length with shadda must be between 3 and 6
    if length < 3 or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False
    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False
    # contains some invalid letters in verb; this test runs on the
    # original `word`, not on the stripped `word_nm`
    elif re.search(
            u"[%s%s%s%s%s]" %
        (ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN),
            word):
        return False
    # contains some invalid SHADDA / ALEF sequences, in pattern order:
    #   SHADDA right after ALEF, ALEF_MAKSURA or SHADDA
    #   SHADDA as first character
    #   initial ALEF, two characters, then SHADDA
    #   SHADDA as second character
    #   ALEF, any character, ALEF
    #   ALEF followed by ALEF
    #   final ALEF followed by ALEF_MAKSURA or YEH
    elif re.search(
            u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)" %
        (ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
         ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False
    # Invalid root form for some letters:
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #  TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(
            u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]" %
        (YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
         GHAIN, KAF, HEH, YEH), word_nm):
        return False
    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s" % (DAL, THAL, DAD, TAH, ZAH, TEH),
                   word_nm):
        return False
    # Contains an invalid root sequence of phonetically close letters:
    # LAM-REH, REH-LAM, FEH-BEH, BEH-FEH, NOON-LAM, HEH-HAH, HAH-HEH
    elif re.search(
            u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s" %
        (LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH,
         HEH), word_nm):
        return False
    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
            TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word starts by the same letter doubled
    # NOTE(review): the TEH exception reads `word[0]`, not `word_nm[0]`
    # -- confirm this is intended.
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False
    # verify the wazn (morphological pattern) of the verb
    elif length == 3:
        # accepted: first character is not ALEF and second is not SHADDA
        if re.match("^[^%s][^%s].$" % (ALEF, SHADDA), word_nm):
            return True
        # anything else (ALEF first, SHADDA second) is rejected
        else:
            return False
    elif length == 4:
        # four alternatives, in pattern order:
        # 1- ALEF_HAMZA_ABOVE or HAMZA first, no SHADDA in positions 2-3
        # 2- ALEF in second position
        # 3- SHADDA in third position
        # 4- four characters with neither ALEF nor SHADDA
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
          ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):
            return True
        # rejected otherwise: a bare initial ALEF (the hamza must be
        # written), or SHADDA / ALEF outside the positions allowed above
        else:
            return False
    elif length == 5:
        if word_nm.startswith(ALEF):
            # ALEF, three characters, final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH, THAL or TAH carrying a SHADDA, then two
            # characters
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
            word_nm):
                return True
            # ALEF NOON followed by three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ALEF followed by ZAIN-DAL, SAD-TAH or DAD-TAH, then two
            # characters
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            # ALEF, a character outside the excluded set, TEH, then two
            # characters
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # NOTE(review): same pattern as the first test of this
            # branch, so this test cannot succeed here.
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, a character carrying a SHADDA, then two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF, three characters, final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        # every remaining 5 length word starting by TEH is accepted
        elif word_nm.startswith(TEH):
            return True
        else:
            return False
    elif length == 6:
        # NOTE(review): 6 length words not starting by ALEF were already
        # rejected above, so the TEH alternative is never true here.
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        # rejected otherwise
        else:
            return False
    # NOTE(review): every branch above returns, so this line looks
    # unreachable.
    return True
def is_valid_infinitive_verb(word, vocalized = True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected (False is returned) when one of these holds:
        - araby.is_arabicword(word) is false.
        - its length is below 3 or above 6; the length is measured after
          araby.strip_harakat (when vocalized is true) and after replacing
          ALEF_MADDA by HAMZA + ALEF.
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN, KASRATAN
          or FATHATAN.
        - it contains one of the SHADDA/ALEF sequences, initial YEH pairs,
          TEH pairs or phonetically close pairs tested in the body.
        - it does not match one of the patterns accepted for its length.

    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is an arabic valid word,
    if not araby.is_arabicword(word):
        return False
    if vocalized :
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters
    word_nm = word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
    length = len(word_nm)

    # length with shadda must be between 3 and 6
    if length < 3  or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False
    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False
    # contains some invalid letters in verb; this test runs on the
    # original `word`, not on the stripped `word_nm`
    elif re.search(u"[%s%s%s%s%s]"%(ALEF_HAMZA_BELOW, TEH_MARBUTA,
    DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some invalid SHADDA / ALEF sequences, in pattern order:
    #   SHADDA right after ALEF, ALEF_MAKSURA or SHADDA
    #   SHADDA as first character
    #   initial ALEF, two characters, then SHADDA
    #   SHADDA as second character
    #   ALEF, any character, ALEF
    #   ALEF followed by ALEF
    #   final ALEF followed by ALEF_MAKSURA or YEH
    elif re.search(u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)"%(
    ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
     ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False
    # Invalid root form for some letters:
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #  TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]"%(
    YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
     TAH, ZAH, GHAIN, KAF, HEH, YEH), word_nm):
        return False
    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s"%(DAL, THAL, DAD, TAH, ZAH, TEH),
     word_nm):
        return False
    # Contains an invalid root sequence of phonetically close letters:
    # LAM-REH, REH-LAM, FEH-BEH, BEH-FEH, NOON-LAM, HEH-HAH, HAH-HEH
    elif re.search(u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s"%(
    LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH, HEH),
     word_nm):
        return False
    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
    TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word starts by the same letter doubled
    # NOTE(review): the TEH exception reads `word[0]`, not `word_nm[0]`
    # -- confirm this is intended.
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False
    # verify the wazn (morphological pattern) of the verb
    elif length == 3:
        # accepted: first character is not ALEF and second is not SHADDA
        if re.match("^[^%s][^%s].$"%(ALEF, SHADDA), word_nm):
            return True
        # anything else (ALEF first, SHADDA second) is rejected
        else:
            return False
    elif length == 4:
        # four alternatives, in pattern order:
        # 1- ALEF_HAMZA_ABOVE or HAMZA first, no SHADDA in positions 2-3
        # 2- ALEF in second position
        # 3- SHADDA in third position
        # 4- four characters with neither ALEF nor SHADDA
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
          ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):
            return True
        # rejected otherwise: a bare initial ALEF (the hamza must be
        # written), or SHADDA / ALEF outside the positions allowed above
        else:
            return False
    elif length == 5:
        if word_nm.startswith(ALEF):
            # ALEF, three characters, final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH, THAL or TAH carrying a SHADDA, then two
            # characters
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
            word_nm):
                return True
            # ALEF NOON followed by three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ALEF followed by ZAIN-DAL, SAD-TAH or DAD-TAH, then two
            # characters
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            # ALEF, a character outside the excluded set, TEH, then two
            # characters
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # NOTE(review): same pattern as the first test of this
            # branch, so this test cannot succeed here.
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, a character carrying a SHADDA, then two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF, three characters, final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        # every remaining 5 length word starting by TEH is accepted
        elif word_nm.startswith(TEH):
            return True
        else:
            return False
    elif length == 6:
        # NOTE(review): 6 length words not starting by ALEF were already
        # rejected above, so the TEH alternative is never true here.
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        # rejected otherwise
        else:
            return False
    # NOTE(review): every branch above returns, so this line looks
    # unreachable.
    return True