def is_valid(self, word):
    """Tell whether ``word`` can be accepted as a root.

    A root is accepted when it is non-empty, passes
    ``araby.is_arabicword``, is 3 or 4 characters long and does not
    contain ``araby.ALEF``.

    @param word: candidate root.
    @return: True if every check passes, False otherwise.
    @rtype: boolean.
    """
    # Empty/missing input is rejected, as is anything araby does not
    # recognise as an Arabic word.
    if not word or not araby.is_arabicword(word):
        return False
    # Only 3- and 4-character roots are accepted.
    size = len(word)
    if size < 3 or size > 4:
        return False
    # An ALEF anywhere in the candidate disqualifies it.
    return araby.ALEF not in word
def cleanFile(oldFile, newFile, rawFilesEncoding):
    """Filter and tokenize the words of ``oldFile`` and write them to ``newFile``.

    The input is read as UTF-8 and split on whitespace.  Words for which
    ``has_double_shadda`` is true, or ``is_arabicword`` is false, are
    dropped; the remaining ones are passed through ``tokenize`` and joined
    with ``service.twoDJoin`` before being written out as UTF-8.

    @param oldFile: path of the file to read.
    @param newFile: path of the file to write (overwritten).
    @param rawFilesEncoding: accepted for interface compatibility; it is
        not used, both files are handled as UTF-8.
    """
    # Read everything up front so the input handle is closed before any
    # processing or writing happens.
    with open(oldFile, 'r', encoding='utf-8') as source:
        raw_words = source.read().split()

    tokens = [
        tokenize(word) for word in raw_words
        if not has_double_shadda(word) and is_arabicword(word)
    ]
    text = service.twoDJoin(tokens)
    print(oldFile, newFile)

    # The original passed encoding="u", which is not a registered codec
    # name and makes open() raise LookupError; write UTF-8 like the input.
    with open(newFile, 'w', encoding='utf-8') as wF:
        wF.write(text)
Пример #3
0
    def test_word_text(self):
        """Exercise the word/text predicates of Araby on sample strings."""
        bare_word = u'العربية'

        # is_vocalized(word)
        self.assertFalse(Araby.is_vocalized(bare_word))
        self.assertTrue(Araby.is_vocalized(u'الْعَرَبِيّةُ'))

        # is_vocalizedtext(text)
        self.assertFalse(Araby.is_vocalizedtext(u"العربية لغة جميلة"))
        self.assertTrue(Araby.is_vocalizedtext(u'الْعَرَبيَّة لُغَةٌ جَمِيلَةٌ'))

        # is_arabicstring TODO: add more examples
        self.assertTrue(Araby.is_arabicstring(bare_word))

        # is_arabicrange TODO: add test

        # is_arabicword TODO: test other cases
        rejected_words = (
            u"",  # empty string
            u"ْلاندخل",  # starts with sukun
            u'ؤكل',  # starts with waw hamza above
            u'ئكل',  # starts with yeh hamza above
            u'ةدخل',  # starts with teh marbuta
        )
        for candidate in rejected_words:
            self.assertFalse(Araby.is_arabicword(candidate))

        self.assertTrue(Araby.is_arabicword(bare_word))
 def is_valid(self, stem):
     """Tell whether ``stem`` can be accepted as a stem entry.

     A value containing ';' is treated as a list of stems and is accepted
     as soon as one of its parts is non-empty.  Any other value must pass
     ``araby.is_arabicword``.

     @param stem: stem, or several stems separated by ';'.
     @return: True if the stem is accepted, False otherwise.
     @rtype: boolean.
     """
     # Empty or missing value
     if not stem:
         return False
     # Multiple stems: valid unless every part is empty
     if ";" in stem:
         return any(stem.split(';'))
     # Single stem: must be recognised as an Arabic word
     return bool(araby.is_arabicword(stem))
Пример #5
0
 def check_fields(self, fields):
     """Validate the 'vocalized' field of an entry.

     The checks run in order and the first failure decides the result:
     empty value, not an Arabic word, already present in ``self.index``,
     not vocalized, rejected by ``verify_tashkeel``.  The value is
     appended to ``self.index`` once it has passed the duplicate check,
     i.e. before the two vocalization checks.

     @param fields: mapping of field names to values.
     @return: "ok" or an error message.
     @rtype: string.
     """
     vocalized = fields.get('vocalized', '')
     if not vocalized:
         return "Error: Empty vocalized"
     if not ar.is_arabicword(vocalized):
         return "Error: Invalid Arabic word "
     # Reject entries that were already seen, remember new ones.
     if vocalized in self.index:
         return "Error: Duplicated Entry "
     self.index.append(vocalized)
     # Vocalization must be present and well formed.
     if not ar.is_vocalized(vocalized):
         status = "Error: Not Vocalized"
     elif not verify_tashkeel(vocalized):
         status = "Error: Error in Vocalization "
     else:
         status = "ok"
     return status
Пример #6
0
def mainly():
    """
    main test

    Print, for a fixed sample word list, each word next to its normalized
    form, then the words whose normalized form equals that of the source
    word, then statistics and the full content of ``edits1(source)``.
    """
    words = u"""ضلام ألام ضلال لام ظلام ضام غلام إلام نلام هلام ضخام سلام ملام ضلا ضمام تلام علام يلام ضلان كلام ضلتم""".split(" ")

    source = u"ضلام"
    normsource = normalize(source)
    # print() is called as a function and text is no longer encoded by
    # hand: the Python 2 print statement is a SyntaxError on Python 3 and
    # printing encoded bytes there would show their repr.
    for word in words:
        print(u"\t".join([word, normalize(word)]))
    # A list (not a lazy filter object) of the words normalizing like source.
    condidates = [w for w in words if normalize(w) == normsource]
    print("condidates", u"\t".join(condidates))
    editlist = edits1(source)
    print("len(editlist)", len(editlist))
    validwords = [word for word in editlist if araby.is_arabicword(word)]
    print("len(validwords)", len(validwords))
    print(u"\n".join(editlist))
Пример #7
0
def mainly():
    """
    main test

    Print, for a fixed sample word list, each word next to its normalized
    form, then the words whose normalized form equals that of the source
    word, then statistics and the full content of ``edits1(source)``.
    """
    words = u"""ضلام ألام ضلال لام ظلام ضام غلام إلام نلام هلام ضخام سلام ملام ضلا ضمام تلام علام يلام ضلان كلام ضلتم""".split(
        " ")

    source = u"ضلام"
    normsource = normalize(source)
    # print() is called as a function and text is no longer encoded by
    # hand: the Python 2 print statement is a SyntaxError on Python 3 and
    # printing encoded bytes there would show their repr.
    for word in words:
        print(u"\t".join([word, normalize(word)]))
    # A list (not a lazy filter object) of the words normalizing like source.
    condidates = [w for w in words if normalize(w) == normsource]
    print("condidates", u"\t".join(condidates))
    editlist = edits1(source)
    print("len(editlist)", len(editlist))
    validwords = [word for word in editlist if araby.is_arabicword(word)]
    print("len(validwords)", len(validwords))
    print(u"\n".join(editlist))
Пример #8
0
 def check_fields(self, fields):
     """Validate the 'vocalized' field of a verb entry.

     The checks run in order and the first failure decides the result:
     empty value, not an Arabic word, not a valid infinitive verb, already
     present in ``self.index`` (a warning when the 'unvocalized' field has
     at most 3 characters, an error otherwise), not vocalized, rejected by
     ``verify_tashkeel``.  The value is appended to ``self.index`` once it
     has passed the duplicate check.

     @param fields: mapping of field names to values.
     @return: "ok", or a warning/error message.
     @rtype: string.
     """
     vocalized = fields.get('vocalized', '')
     unvocalized = fields.get('unvocalized', '')
     if not vocalized:
         return "Error: Empty vocalized"
     if not ar.is_arabicword(vocalized):
         return "Error: Invalid Arabic word "
     if not is_valid_infinitive_verb(vocalized):
         return "Error: Invalid Arabic infinitive verb "
     # Duplicates: short unvocalized forms only raise a warning.
     if vocalized in self.index:
         if len(unvocalized) > 3:
             return "Error: Duplicated Entry "
         return "Warning: Duplicated Entry "
     self.index.append(vocalized)
     # Vocalization must be present and well formed.
     if not ar.is_vocalized(vocalized):
         status = "Error: Not Vocalized"
     elif not verify_tashkeel(vocalized):
         status = "Error: Error in Vocalization "
     else:
         status = "ok"
     return status
Пример #9
0
    print(araby.order(c), end=" ")
    print()
# Sample inputs: vocalized word, bare word, two-word texts, Latin word.
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# Sample handled on the previous iteration, compared with the current one.
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    # Predicates
    if araby.is_vocalized(word):
        print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word):
        print(' is vocalized text', end=" ")
    print(
        ' is valid word' if araby.is_arabicword(word) else "invalid arabic word",
        end=" ")
    # Transformations
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    # Comparison with the previous sample
    if araby.vocalizedlike(word, word1):
        print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
Пример #10
0
    if araby.is_weak(c): print ('weak'),
    if araby.is_moon(c): print ('moon'),
    if araby.is_sun(c):print ('sun'),
    print (araby.order(c)),
    print ();
# Sample inputs: vocalized word, bare word, short texts, Latin word.
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
    u"سئل لأنه يؤم الإمام",
]
# Sample handled on the previous iteration, compared with the current one.
word1 = u""
for word in word_list:
    print(word)
    # Predicates
    if araby.is_vocalized(word):
        print(' is vocalized')
    if araby.is_vocalizedtext(word):
        print(' is vocalized text')
    if araby.is_arabicword(word):
        print(' is valid word')
    else:
        print("invalid arabic word")
    # Transformations
    print(' strip harakat', araby.strip_harakat(word))
    print(' strip tashkeel', araby.strip_tashkeel(word))
    print(' strip tatweel', araby.strip_tatweel(word))
    print(' normalize ligature ', araby.normalize_ligature(word))
    print(' normalize hamza', araby.normalize_hamza(word))
    # Comparison with the previous sample
    if araby.vocalizedlike(word, word1):
        print("vocalized_like")
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like")

Пример #11
0
    def accepted(self, word):
        """
        Tell whether the analyzer accepts word as correct.

        The word is accepted when the analyzer returns more than one
        case, or a single case that does not report itself as unknown.
        @param word: input text.
        @type word: unicode.
        @return: True if word is accepted
        @rtype: boolean.
        """
        analyses = self.analyzer.check_word(word)
        # nothing came back from the analyzer
        if not analyses:
            return False
        # several cases: accepted without further inspection
        if len(analyses) > 1:
            return True
        # exactly one case: accepted unless it is the "unknown" case
        return not analyses[0].is_unknown()


if __name__ == "__main__":
    # Manual smoke test: spell-check a short text and print every
    # suggestion together with the result of araby.is_arabicword on it.
    # print() is used as a function; the Python 2 print statement is a
    # SyntaxError on Python 3.
    print("test")
    myrepr = arabRepr.ArabicRepr()
    speller = SpellcheckClass()
    text = u" اللغه العربيه"
    voc = speller.spellcheck(text, True)
    # Debug aid: print(myrepr.repr(voc))
    for itemd in voc:
        suggestions = itemd.get('suggest', '')
        if suggestions != '':
            # suggestions are stored as one ';'-separated string
            for sug in suggestions.split(';'):
                print(sug, '\t', araby.is_arabicword(sug))
Пример #12
0
        @param word: input text.
        @type word: unicode.
        @return: True if word is accepted
        rtype: boolean.
        """
        result = self.analyzer.check_word(word);
        if result:
            # result has many cases
            if len(result)>1:
                return True;
            #one only case
            else :
                return not result[0].is_unknown();
        return False;





if __name__ == "__main__":
    # Manual smoke test: spell-check a short text and print every
    # suggestion together with the result of araby.is_arabicword on it.
    # print() is used as a function; the Python 2 print statement is a
    # SyntaxError on Python 3.
    print("test")
    myrepr = arabRepr.ArabicRepr()
    speller = SpellcheckClass()
    text = u" اللغه العربيه"
    voc = speller.spellcheck(text, True)
    # Debug aid: print(myrepr.repr(voc))
    for itemd in voc:
        suggestions = itemd.get('suggest', '')
        if suggestions != '':
            # suggestions are stored as one ';'-separated string
            for sug in suggestions.split(';'):
                print(sug, '\t', araby.is_arabicword(sug))
Пример #13
0
# Sample inputs: vocalized word, bare word, two-word texts, Latin word.
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# Sample handled on the previous iteration, compared with the current one.
word1 = u""
for word in word_list:
    # Everything about one sample goes on a single output line.  The
    # Python 2 "print x," statements are SyntaxErrors on Python 3, so
    # print() with end=" " is used and text is no longer encoded by hand.
    print(word, '\t', end=" ")
    if araby.is_vocalized(word):
        print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word):
        print(' is vocalized text', end=" ")
    if araby.is_arabicword(word):
        print(' is valid word', end=" ")
    else:
        print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1):
        print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
Пример #14
0
    def check_word(self, word, guessedtag=""):
        """
        Analyze one word morphologically.

        The word is stripped of tatweel, then looked up in the cache under
        its tashkeel-free form.  On a cache miss (or when the cache is
        disabled) it is analyzed as punctuation and, if it is an Arabic
        word, as stop word, verb and noun; when nothing matches it is
        analyzed as unknown.  The results are then passed through the
        normalization, shadda and frequency steps, and stored in the cache
        when caching is enabled.  If the final list is empty, one WordCase
        of type 'unknown' carrying the current error code is appended.
        @param word: the input word.
        @type word: unicode.
        @param guessedtag: not used by the active code; it is only
            referenced in the commented-out tag tests below.
        @type guessedtag: unicode.
        @return: list of analyzed cases for the word.
        @rtype: list.
        """

        word = araby.strip_tatweel(word)
        # keep the (possibly vocalized) form for the vocalization checks
        word_vocalised = word
        # tashkeel-free form, used for the analysis and as cache key
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.is_already_checked(word_nm):
            #~ print (u"'%s'"%word).encode('utf8'), 'found'
            resulted_data = self.cache.get_checked(word_nm)
        else:
            resulted_data = []
            # if word is a punctuation
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have some problems,
            # the stop word can also be another normal word (verb or noun),
            # we must consider it in future works
            # if word is stopword allow stop words analysis
            if araby.is_arabicword(word_nm):
                resulted_data += self.check_word_as_stopword(word_nm)

                # if word is verb
                # Problem: some excluded words are considered verbs or nouns
                #~if  self.tagger.has_verb_tag(guessedtag) or \
                #~self.tagger.is_stopword_tag(guessedtag):
                #~resulted_data += self.check_word_as_verb(word_nm)
                resulted_data += self.check_word_as_verb(word_nm)
                #print "is verb", rabti, len(resulted_data)
                # if word is noun
                #~if self.tagger.has_noun_tag(guessedtag) or \
                #~self.tagger.is_stopword_tag(guessedtag):
                #~resulted_data += self.check_word_as_noun(word_nm)
                resulted_data += self.check_word_as_noun(word_nm)
            if len(resulted_data) == 0:
                #print (u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8')
                # no analysis matched: check the word as unknown
                resulted_data += self.check_word_as_unknown(word_nm)
            # check if the word is normalized and solutions are equivalent
            resulted_data = self.check_normalized(word_vocalised,
                                                  resulted_data)
            # check if the word is shadda like

            resulted_data = self.check_shadda(word_vocalised, resulted_data,
                                              self.fully_vocalized_input)

            # add word frequency information in tags
            resulted_data = self.add_word_frequency(resulted_data)

            # add the stemmed words details into Cache;
            # the cases are stored as their attribute dictionaries
            # (built even when the cache is disabled)
            data_list_to_serialize = [w.__dict__ for w in resulted_data]
            if self.allow_cache_use:
                self.cache.add_checked(word_nm, data_list_to_serialize)

        # check if the word is vocalized like results
        # (runs for cached and freshly computed results alike)
        if self.partial_vocalization_support:
            resulted_data = self.check_partial_vocalized(
                word_vocalised, resulted_data)

        # nothing left: return a single 'unknown' case built from the
        # tatweel-stripped word, tagged with the current error code
        if len(resulted_data) == 0:
            error_code = self.get_error_code()
            resulted_data.append(
                wordcase.WordCase({
                    'word':
                    word,
                    'affix': ('', '', '', ''),
                    'stem':
                    word,
                    'original':
                    word,
                    'vocalized':
                    word,
                    'semivocalized':
                    word,
                    'tags':
                    u'%s' % error_code,
                    'type':
                    'unknown',
                    'root':
                    '',
                    'template':
                    '',
                    'freq':
                    self.wordfreq.get_freq(word, 'unknown'),
                    'syntax':
                    '',
                }))
        return resulted_data
Пример #15
0
def is_valid_infinitive_verb(word, vocalized=True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected (False is returned) when any of these holds:
        - araby.is_arabicword(word) is false;
        - its length is below 3 or above 6, counted after stripping the
          harakat (when vocalized is true) and expanding ALEF_MADDA into
          HAMZA + ALEF;
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA or a tanwin mark
          (DAMMATAN, KASRATAN, FATHATAN);
        - it contains one of the SHADDA / ALEF sequences or letter pairs
          rejected by the regular expressions below;
        - it does not match one of the accepted patterns for its length.
    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized; when true the
        harakat are stripped before the length and pattern checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is a valid arabic word
    if not araby.is_arabicword(word):
        return False
    # word_nm is the form used for the length and pattern checks
    if vocalized:
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters

    word_nm = word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
    length = len(word_nm)

    # length (SHADDA is tested for in word_nm, so it is presumably kept by
    # strip_harakat and counted) must be between 3 and 6
    if length < 3 or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False

    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False

    # contains some invalid letters in verb
    # (searched in the original word, since tanwin marks may have been
    # stripped from word_nm)
    elif re.search(
            u"[%s%s%s%s%s]" %
        (ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some invalid SHADDA / ALEF sequences, in pattern order:
    #  - SHADDA after ALEF, ALEF_MAKSURA or another SHADDA
    #  - starts with SHADDA
    #  - starts with ALEF, then two characters, then SHADDA
    #  - SHADDA in second position
    #  - ALEF, one character, ALEF
    #  - ALEF followed by ALEF
    #  - ends with ALEF followed by ALEF_MAKSURA or YEH
    elif re.search(
            u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)" %
        (ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
         ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False

    # Invalid root form for some letters:
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    # TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(
            u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]" %
        (YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
         GHAIN, KAF, HEH, YEH), word_nm):
        return False

    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s" % (DAL, THAL, DAD, TAH, ZAH, TEH),
                   word_nm):
        return False
    # Contains a letter pair rejected as an invalid root sequence:
    # LAM-REH, REH-LAM, FEH-BEH, BEH-FEH, NOON-LAM, HEH-HAH, HAH-HEH
    elif re.search(
            u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s" %
        (LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH,
         HEH), word_nm):
        return False

    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
            TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word starts by the same letter doubled (except TEH; note that the
    # TEH test reads the first character of the original word)
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False

    # verify the wazn (pattern) of the verb
    elif length == 3:
        # first character is not ALEF, second is not SHADDA
        if re.match("^[^%s][^%s].$" % (ALEF, SHADDA), word_nm):
            return True
    # Accepted patterns: fa'ala, fa''a (final shadda)
    # Rejected patterns: initial alef, shadda in second position
        else:
            return False
    elif length == 4:
        # 1- af'ala, 2- faa'ala, 3- fa''ala, 4- fa'lala
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
         ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):

            return True
    # Rejected patterns:
    #  - initial bare alef: the hamza (hamzat al-qat') must be written
    #  - shadda elsewhere than its specific position
    #  - alef elsewhere than its specific position
        else:
            return False
    elif length == 5:

        if word_nm.startswith(ALEF):
            # ALEF, three characters, final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH/THAL/TAH carrying a SHADDA, then two characters
            # (cases like ittakhadha, idhdhakara, ittala'a)
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
             word_nm):
                return True

            # infa'ala pattern: ALEF, NOON, three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ifta'ala pattern, including the ZAIN-DAL, SAD-TAH and DAD-TAH
            # openings
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # same pattern as the first test of this branch
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, one character, SHADDA, two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF, three characters, final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        elif word_nm.startswith(TEH):
            return True
        else:
            return False

    # 6 characters: must match the precompiled 6-letter pattern
    # (shadda and alef are only accepted at specific positions)
    elif length == 6:
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
    # any other 6-character form is rejected
        else:
            return False
    # every length from 3 to 6 returns above
    return True
Пример #16
0
def is_valid_infinitive_verb(word, vocalized = True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected (False is returned) when any of these holds:
        - araby.is_arabicword(word) is false;
        - its length is below 3 or above 6, counted after stripping the
          harakat (when vocalized is true) and expanding ALEF_MADDA into
          HAMZA + ALEF;
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA or a tanwin mark
          (DAMMATAN, KASRATAN, FATHATAN);
        - it contains one of the SHADDA / ALEF sequences or letter pairs
          rejected by the regular expressions below;
        - it does not match one of the accepted patterns for its length.
    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized; when true the
        harakat are stripped before the length and pattern checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is a valid arabic word
    if not  araby.is_arabicword(word):
        return False
    # word_nm is the form used for the length and pattern checks
    if vocalized :
        word_nm  =  araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters

    word_nm = word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
    length = len(word_nm)

    # length (SHADDA is tested for in word_nm, so it is presumably kept by
    # strip_harakat and counted) must be between 3 and 6
    if length < 3  or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False

    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] !=  ALEF:
        return False

    # contains some invalid letters in verb
    # (searched in the original word, since tanwin marks may have been
    # stripped from word_nm)
    elif re.search(u"[%s%s%s%s%s]"%(ALEF_HAMZA_BELOW, TEH_MARBUTA,
    DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some invalid SHADDA / ALEF sequences, in pattern order:
    #  - SHADDA after ALEF, ALEF_MAKSURA or another SHADDA
    #  - starts with SHADDA
    #  - starts with ALEF, then two characters, then SHADDA
    #  - SHADDA in second position
    #  - ALEF, one character, ALEF
    #  - ALEF followed by ALEF
    #  - ends with ALEF followed by ALEF_MAKSURA or YEH
    elif re.search(u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)"%(
    ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
    ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False


    # Invalid root form for some letters:
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    # TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]"%(
    YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    TAH, ZAH, GHAIN, KAF, HEH, YEH), word_nm):
        return False


    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s"%(DAL, THAL, DAD, TAH, ZAH, TEH), word_nm):
        return False
    # Contains a letter pair rejected as an invalid root sequence:
    # LAM-REH, REH-LAM, FEH-BEH, BEH-FEH, NOON-LAM, HEH-HAH, HAH-HEH
    elif re.search(u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s"%(
    LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON,
     LAM, HEH, HAH, HAH, HEH), word_nm):
        return False


    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length !=  5 and word_nm.startswith(TEH) and word_nm[1] in (
    TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word starts by the same letter doubled (except TEH; note that the
    # TEH test reads the first character of the original word)
    elif word_nm[0] == word_nm[1] and word[0] !=  TEH:
        return False

    # verify the wazn (pattern) of the verb
    elif length == 3:
        # first character is not ALEF, second is not SHADDA
        if re.match("^[^%s][^%s].$"%(ALEF, SHADDA), word_nm):
            return True
    # Accepted patterns: fa'ala, fa''a (final shadda)
    # Rejected patterns: initial alef, shadda in second position
        else: return False
    elif length == 4:
    # 1- af'ala, 2- faa'ala, 3- fa''ala, 4- fa'lala
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
         ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):

            return True
    # Rejected patterns:
    #  - initial bare alef: the hamza (hamzat al-qat') must be written
    #  - shadda elsewhere than its specific position
    #  - alef elsewhere than its specific position
        else: return False
    elif length == 5:

        if  word_nm.startswith(ALEF):
            # ALEF, three characters, final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH/THAL/TAH carrying a SHADDA, then two characters
            # (cases like ittakhadha, idhdhakara, ittala'a)
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
             word_nm):
                return True

            # infa'ala pattern: ALEF, NOON, three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ifta'ala pattern, including the ZAIN-DAL, SAD-TAH and DAD-TAH
            # openings
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # same pattern as the first test of this branch
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, one character, SHADDA, two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF, three characters, final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else: return False
        elif word_nm.startswith(TEH):
            return True
        else:
            return False

    # 6 characters: must match the precompiled 6-letter pattern
    # (shadda and alef are only accepted at specific positions)
    elif length == 6:
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
    # any other 6-character form is rejected
        else: return False
    # every length from 3 to 6 returns above
    return True