Пример #1
0
    def make_suggestions(self):
        """
        Build the suggestion markup for this rule according to its category.

        Reads ``self.category``, ``self.suggestions`` and ``self.pattern``:
          - adjective category: a ``<match no="1"/>`` prefix followed by one
            ``<suggestion>`` per entry, rewriting the second pattern token
            into the second token of the suggestion;
          - single-word / verb categories: the suggestions are tokenized
            but nothing is emitted (see the review note below);
          - transitive-verb categories: three commented-out template
            suggestions built around the first element of the pattern;
          - any other category: every suggestion wrapped verbatim in
            ``<suggestion>`` tags.

        @return: the suggestion markup, possibly empty.
        @rtype: unicode.
        """
        category = self.category

        if category == u"صفة":
            parts = ["""<match no="1"/>&nbsp;"""]
            for candidate in self.suggestions:
                candidate_words = [araby.strip_lastharaka(word)
                                   for word in araby.tokenize(candidate)]
                pattern_words = [araby.strip_tashkeel(item)
                                 for item in self.pattern]
                # "TODO" is the placeholder used when the pattern has no
                # second token to match against.
                matched = pattern_words[1] if len(pattern_words) >= 2 else "TODO"
                # Fall back to the whole suggestion when it is a single token.
                replacement = (candidate_words[1]
                               if len(candidate_words) >= 2 else candidate)
                parts.append(
                    u"""   <suggestion><match no="2" regexp_match="%s" regexp_replace="%s"/></suggestion>\n""" % (
                        matched, replacement))
            return "".join(parts)

        if category in (u"كلمة واحدة", u"فعل"):
            # NOTE(review): this branch works out a match/replacement pair
            # for every suggestion but never appends anything, so the result
            # is always empty. It looks unfinished -- confirm the intended
            # output format before emitting markup here.
            for candidate in self.suggestions:
                candidate_words = [araby.strip_lastharaka(word)
                                   for word in araby.tokenize(candidate)]
                pattern_words = [araby.strip_tashkeel(item)
                                 for item in self.pattern]
                matched = pattern_words[0] if len(pattern_words) >= 1 else "TODO"
                replacement = (candidate_words[0]
                               if len(candidate_words) >= 1 else candidate)
            return ""

        if category in (u"متعدي بحرف", u"متعدي إلى مفعولين"):
            verb = self.pattern[0]
            comment_end = u"\n-->\n"
            # Every template is wrapped in an XML comment, so the output
            # carries them as disabled examples only.
            return "".join([
                u"<!-- Verb Intransitive to transitive\n",
                u"""<suggestion><match no="1" postag="(V.*a.;..)(-)" postag_replace="$1H">%s</match><match no="2" regexp_match="ل" regexp_replace=""/></suggestion>""" % verb,
                comment_end,
                u"<!-- Verb Intransitive to transitive, when the preposition letter is attached to a noun\n",
                u"""<suggestion><match no="1" postag="(V.*a.;..)(-)" postag_replace="$1H">%s</match>&nbsp;<match no="2" regexp_match="^ب" regexp_replace=""/></suggestion>""" % verb,
                comment_end,
                u"<!-- Verb Transitive to intransitive\n",
                u"""  <suggestion><match no="1"  postag="(V-1.*)(.)" postag_replace="$1-" postag_regexp="yes">%s</match>&nbsp;في<match no="1"  regexp_match="(.*)((&verb_encletics;)$)|(.*)((&verb_encletics;)?$)" regexp_replace="$2"/></suggestion>""" % verb,
                comment_end,
            ])

        # Default: plain suggestions, one element per entry.
        return "".join(u"   <suggestion>%s</suggestion>\n" % candidate
                       for candidate in self.suggestions)
Пример #2
0
def vocalize(verb, proclitic, enclitic):
    """
    Attach a proclitic and an enclitic to a verb and return both spellings.

    The affixes are looked up by subscript in the svconst tables and the
    first "vocalized" form of each is used, so an unknown affix propagates
    the lookup error.
    @param verb: verb found in dictionary.
    @type verb: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, semivocalized word); the semivocalized word
    is built from the verb with its last haraka stripped.
    @rtype: (unicode, unicode).
    """
    suffix_forms = svconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"]
    # The enclitic variant is chosen against the verb as it was passed in,
    # before the adjustments below.
    enclitic_voc = get_enclitic_variant(verb, suffix_forms[0])
    proclitic_voc = svconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    if enclitic:
        # Separating alef after waw: drop the alef before an enclitic.
        if verb.endswith(araby.WAW + araby.ALEF):
            verb = verb[:-1]
        # A final alef maksura becomes a plain alef before an enclitic.
        if verb.endswith(araby.ALEF_MAKSURA):
            verb = verb[:-1] + araby.ALEF

    vocalized = ''.join((proclitic_voc, verb, enclitic_voc))
    verb_without_last_haraka = araby.strip_lastharaka(verb)
    semivocalized = ''.join(
        (proclitic_voc, verb_without_last_haraka, enclitic_voc))
    return (vocalized, semivocalized)
Пример #3
0
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    When a suffix is given, the last haraka of the word is stripped. If the
    suffix still has content after removing its tashkeel, a final alef
    maksura is turned into yeh, and a final hamza is replaced by waw-hamza
    or yeh-hamza when the suffix starts with damma or kasra respectively.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    stem = araby.strip_lastharaka(word) if suffix else word
    suffix_has_letters = bare_suffix != u""

    if stem.endswith(araby.ALEF_MAKSURA) and suffix_has_letters:
        return stem[:-1] + araby.YEH

    if stem.endswith(araby.HAMZA) and suffix_has_letters:
        # The hamza seat follows the first mark of the vocalized suffix.
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA

    return stem
Пример #4
0
def vocalize(verb, proclitic, enclitic):
    """
    Join a verb with its proclitic and enclitic and return both spellings.

    Each affix is looked up by subscript in the SVC tables and its first
    "vocalized" form is used, so an unknown affix propagates the lookup
    error.
    @param verb: verb found in dictionary.
    @type verb: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, semivocalized word); the semivocalized word
    uses the verb with its last haraka stripped.
    @rtype: (unicode, unicode).
    """
    # The enclitic variant is picked against the verb as received, before
    # the spelling adjustments below.
    enclitic_voc = get_enclitic_variant(
        verb, SVC.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0])
    proclitic_voc = SVC.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    has_enclitic = bool(enclitic)
    # Separating alef after waw is dropped before an enclitic.
    if has_enclitic and verb.endswith(ar.WAW + ar.ALEF):
        verb = verb[:-1]
    # Final alef maksura becomes a plain alef before an enclitic.
    if has_enclitic and verb.endswith(ar.ALEF_MAKSURA):
        verb = verb[:-1] + ar.ALEF

    full_form = ''.join((proclitic_voc, verb, enclitic_voc))
    semi_form = ''.join(
        (proclitic_voc, ar.strip_lastharaka(verb), enclitic_voc))
    return (full_form, semi_form)
Пример #5
0
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    The last haraka of the word is always stripped. Then, depending on the
    final letter of the word and on the suffix without its tashkeel:
      - teh marbuta is dropped before the suffixes alef+teh,
        yeh+teh marbuta, yeh and yeh+alef+teh, and turned into teh before
        any other non-empty suffix;
      - alef maksura becomes yeh before a non-empty suffix;
      - hamza is replaced by waw-hamza / yeh-hamza according to the first
        mark of the vocalized suffix.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    stem = araby.strip_lastharaka(word)

    if stem.endswith(araby.TEH_MARBUTA):
        dropping_suffixes = (
            araby.ALEF + araby.TEH,
            araby.YEH + araby.TEH_MARBUTA,
            araby.YEH,
            araby.YEH + araby.ALEF + araby.TEH,
        )
        if bare_suffix in dropping_suffixes:
            return stem[:-1]
        if bare_suffix != u"":
            return stem[:-1] + araby.TEH
        return stem

    if stem.endswith(araby.ALEF_MAKSURA):
        if bare_suffix != u"":
            return stem[:-1] + araby.YEH
        return stem

    if stem.endswith(araby.HAMZA) and bare_suffix != u"":
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA
        # Hamza after yeh (with or without sukun) takes the yeh seat before
        # a fathatan suffix.
        after_yeh = (stem.endswith(araby.YEH + araby.HAMZA)
                     or stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA))
        if after_yeh and suffix.startswith(araby.FATHATAN):
            return stem[:-1] + araby.YEH_HAMZA

    return stem
Пример #6
0
def get_suffix_variants(word, suffix, enclitic):
    """
    Return the suffix as it should be attached to the word, in two forms.

    The suffix is emptied when there is no enclitic, the word ends with
    alef maksura, yeh or alef, and the suffix is a haraka. Otherwise it is
    returned unchanged.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)

    drop_mark = (not bare_enclitic
                 and word[-1:] in weak_endings
                 and araby.is_haraka(suffix))
    attached_suffix = u"" if drop_mark else suffix

    # The tag table is keyed by the suffix as given: the adjusted suffix may
    # not exist in the table.
    suffix_tags = ssconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in suffix_tags:
        without_irab_mark = araby.strip_lastharaka(attached_suffix)
    else:
        without_irab_mark = attached_suffix

    return attached_suffix, without_irab_mark
Пример #7
0
def get_suffix_variants(word, suffix, enclitic):
    """
    Return the suffix as it should be attached to the word, in two forms.

    A teh marbuta inside the suffix is turned into teh when an enclitic
    follows. Without an enclitic, a haraka suffix is emptied when the word
    ends with yeh or alef. Otherwise the suffix is returned unchanged.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    attached_suffix = suffix  # default: keep the suffix as given

    if araby.TEH_MARBUTA in suffix and bare_enclitic:
        attached_suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif (not bare_enclitic
          and word[-1:] in (araby.YEH, araby.ALEF)
          and araby.is_haraka(suffix)):
        attached_suffix = u""

    # The tag table is keyed by the suffix as given: the adjusted suffix may
    # not exist in the table.
    suffix_tags = snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in suffix_tags:
        without_irab_mark = araby.strip_lastharaka(attached_suffix)
    else:
        without_irab_mark = attached_suffix
    return attached_suffix, without_irab_mark
Пример #8
0
def get_word_variant(word, suffix, encletic):
    """
    Return the form of the word to which suffix and enclitic can be attached.

    The last haraka of the word is always stripped; the remaining changes
    depend on the final letters of the word and on the suffix and enclitic
    without their tashkeel (see the comments on each case).
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    bare_enclitic = araby.strip_tashkeel(encletic)
    bare_affixes = bare_suffix + bare_enclitic
    stem = araby.strip_lastharaka(word)

    # Feminine noun ending in teh marbuta.
    if stem.endswith(araby.TEH_MARBUTA):
        dropping_suffixes = (
            araby.ALEF + araby.TEH,
            araby.YEH + araby.TEH_MARBUTA,
            araby.YEH,
            araby.YEH + araby.ALEF + araby.TEH,
        )
        # The teh marbuta is removed before these suffixes.
        if bare_suffix in dropping_suffixes:
            return stem[:-1]
        # Before anything else it is opened into teh,
        # e.g. مدرسة + ين = مدرستين.
        if bare_affixes != u"":
            return stem[:-1] + araby.TEH
        return stem

    # Noun ending in alef maksura.
    if stem.endswith(araby.ALEF_MAKSURA):
        # A suffix with letters turns it into yeh, e.g. مستوى + ان = مستويان.
        if bare_suffix != u"":
            return stem[:-1] + araby.YEH
        # A marks-only suffix followed by an enclitic turns it into alef.
        if bare_enclitic != u"":
            return stem[:-1] + araby.ALEF
        return stem

    # Noun ending in yeh preceded by kasra: with no enclitic and a
    # marks-only suffix, the kasra and the yeh are removed.
    if stem.endswith(araby.KASRA + araby.YEH):
        if not bare_enclitic and not bare_suffix:
            return stem[:-2]
        return stem

    # Final hamza: its seat follows the first mark of the vocalized suffix.
    if stem.endswith(araby.HAMZA) and bare_suffix != u"":
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA
        after_yeh = (stem.endswith(araby.YEH + araby.HAMZA)
                     or stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA))
        if after_yeh and suffix.startswith(araby.FATHATAN):
            return stem[:-1] + araby.YEH_HAMZA

    return stem
Пример #9
0
def get_word_variant(word, suffix, encletic):
    """
    Return the form of the word to which suffix and enclitic can be attached.

    The word always loses its last haraka. Further changes are decided by
    the final letters of the word and by the suffix and enclitic once their
    tashkeel is removed; each case is described next to its code.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    suffix_letters = araby.strip_tashkeel(suffix)
    enclitic_letters = araby.strip_tashkeel(encletic)
    all_letters = suffix_letters + enclitic_letters
    base = araby.strip_lastharaka(word)

    if base.endswith(araby.TEH_MARBUTA):
        # Feminine noun: the teh marbuta is removed before these suffixes
        # and opened into teh before any other affix
        # (e.g. مدرسة + ين = مدرستين).
        removing = (araby.ALEF + araby.TEH,
                    araby.YEH + araby.TEH_MARBUTA,
                    araby.YEH,
                    araby.YEH + araby.ALEF + araby.TEH)
        if suffix_letters in removing:
            base = base[:-1]
        elif all_letters != u"":
            base = base[:-1] + araby.TEH
        return base

    if base.endswith(araby.ALEF_MAKSURA):
        # Alef maksura becomes yeh before a suffix with letters
        # (e.g. مستوى + ان = مستويان), or alef when only an enclitic follows
        # a marks-only suffix.
        if suffix_letters != u"":
            base = base[:-1] + araby.YEH
        elif enclitic_letters != u"":
            base = base[:-1] + araby.ALEF
        return base

    if base.endswith(araby.KASRA + araby.YEH):
        # Noun ending in yeh after kasra: with no enclitic and a marks-only
        # suffix, both the kasra and the yeh are removed.
        if not enclitic_letters and not suffix_letters:
            base = base[:-2]
        return base

    if base.endswith(araby.HAMZA) and suffix_letters != u"":
        # The hamza seat is chosen from the first mark of the vocalized
        # suffix.
        yeh_before_hamza = (
            base.endswith(araby.YEH + araby.HAMZA)
            or base.endswith(araby.YEH + araby.SUKUN + araby.HAMZA))
        if suffix.startswith(araby.DAMMA):
            base = base[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            base = base[:-1] + araby.YEH_HAMZA
        elif yeh_before_hamza and suffix.startswith(araby.FATHATAN):
            base = base[:-1] + araby.YEH_HAMZA

    return base
Пример #10
0
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    Three adjustments are applied in sequence:
      1. final alef maksura or hamza is rewritten when the suffix still has
         content after removing its tashkeel;
      2. the last haraka of the word is stripped when the suffix starts
         with a haraka;
      3. the last haraka is stripped again for the noon / yeh assimilation
         cases.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    stem = word

    # Alef maksura turns into yeh + sukun (e.g. إلى => إليك); the one listed
    # exception takes a plain alef instead.
    if stem.endswith(araby.ALEF_MAKSURA) and bare_suffix:
        ending = araby.ALEF if stem == u"سِوَى" else araby.YEH + araby.SUKUN
        stem = stem[:-1] + ending
    # The hamza seat follows the first mark of the vocalized suffix.
    elif stem.endswith(araby.HAMZA) and bare_suffix:
        if suffix.startswith(araby.DAMMA):
            stem = stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            stem = stem[:-1] + araby.YEH_HAMZA

    # Drop the last haraka only when the suffix brings its own leading
    # haraka; most stop words are not inflected.
    if suffix and suffix[0] in araby.HARAKAT:
        stem = araby.strip_lastharaka(stem)

    # Assimilation of noon and yeh (e.g. فيّ، إليّ، عنّا، منّا): tested against
    # the original word, applied to the adjusted stem.
    if (suffix.startswith(araby.NOON)
            and word.endswith(araby.NOON + araby.SUKUN)) or (
            suffix.startswith(araby.KASRA + araby.YEH)
            and word.endswith(araby.YEH + araby.SUKUN)):
        stem = araby.strip_lastharaka(stem)

    return stem
Пример #11
0
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    The word goes through three steps: rewriting of a final alef maksura or
    hamza when the suffix has content besides tashkeel, stripping of the
    last haraka when the suffix starts with a haraka, and a further
    stripping for the noon / yeh assimilation cases.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    suffix_letters = araby.strip_tashkeel(suffix)
    result = word

    if result.endswith(araby.ALEF_MAKSURA) and suffix_letters:
        # Alef maksura becomes yeh + sukun (e.g. إلى => إليك), except for the
        # one word below, which takes a plain alef.
        head = result[:-1]
        if result == u"سِوَى":
            result = head + araby.ALEF
        else:
            result = head + araby.YEH + araby.SUKUN
    elif result.endswith(araby.HAMZA) and suffix_letters:
        # The hamza seat is chosen from the first mark of the suffix.
        if suffix.startswith(araby.DAMMA):
            result = result[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            result = result[:-1] + araby.YEH_HAMZA

    # Most stop words are not inflected, so the last haraka is removed only
    # when the suffix itself starts with a haraka.
    starts_with_haraka = suffix and suffix[0] in araby.HARAKAT
    if starts_with_haraka:
        result = araby.strip_lastharaka(result)

    # Assimilation of noon and yeh (e.g. فيّ، إليّ، عنّا، منّا): the test looks
    # at the original word, the stripping applies to the adjusted result.
    if suffix.startswith(araby.NOON) and word.endswith(
            araby.NOON + araby.SUKUN):
        result = araby.strip_lastharaka(result)
    elif suffix.startswith(araby.KASRA + araby.YEH) and word.endswith(
            araby.YEH + araby.SUKUN):
        result = araby.strip_lastharaka(result)

    return result
Пример #12
0
    def test_strip(self):
        """Check each Araby strip_* helper against one known input/output pair."""
        expectations = (
            # (helper name, input text, expected result)
            ("strip_harakat", u"الْعَرَبِيّةُ", u'العربيّة'),
            ("strip_lastharaka", u"الْعَرَبِيّةُ", u'الْعَرَبِيّة'),
            ("strip_tashkeel", u"الْعَرَبِيّةُ", u'العربية'),
            ("strip_tatweel", u"العـــــربية", u'العربية'),
            ("strip_shadda", u"الشّمسيّة", u'الشمسية'),
        )
        for helper_name, text, expected in expectations:
            helper = getattr(Araby, helper_name)
            assert helper(text) == expected
Пример #13
0
    def get_suffix_variants(word, suffix, enclitic, mankous=False):
        """
        Return the suffix as it should be attached to the word, in two forms.

        A teh marbuta inside the suffix is turned into teh when an enclitic
        follows. Without an enclitic, a haraka suffix is emptied when the
        word ends with yeh or alef, or replaced by kasratan when the noun
        is flagged as mankous.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix; used as given, without
        stripping tashkeel.
        @type enclitic: unicode.
        @param mankous: if the noun is mankous (ends with yeh).
        @type mankous: boolean.
        @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        """
        attached_suffix = suffix  # default: keep the suffix as given

        if ar.TEH_MARBUTA in suffix and enclitic:
            attached_suffix = re.sub(ar.TEH_MARBUTA, ar.TEH, suffix)
        elif not enclitic and ar.is_haraka(suffix):
            if word[-1:] in (ar.YEH, ar.ALEF):
                attached_suffix = u""
            elif mankous:
                # The yeh of the mankous noun has already been removed by
                # the caller; its mark becomes kasratan.
                attached_suffix = ar.KASRATAN

        # The tag table is keyed by the suffix as given: the adjusted suffix
        # may not exist in the table.
        suffix_tags = SNC.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
        if u'متحرك' in suffix_tags:
            without_irab_mark = ar.strip_lastharaka(attached_suffix)
        else:
            without_irab_mark = attached_suffix

        return attached_suffix, without_irab_mark
Пример #14
0
    def compare(self, baseline, vocalized_output):
        """
        Compare a reference line with automatically vocalized output and
        update the running statistics kept on this object.

        @param baseline: reference vocalized line; it is split into words
        with araby.tokenize.
        @type baseline: unicode.
        @param vocalized_output: either a list of dicts, from which the
        "chosen" and "semi" entries are read (missing entries default to
        ''), or a vocalized string, which is tokenized and whose words are
        stripped of their last haraka to get the semi-vocalized forms.
        @type vocalized_output: list or str.

        Counters updated: total, lineTotal and counter on every call. When
        both lines have the same number of words, each word updates either
        correct / lineCorrect or incorrect / LettersError, plus
        WLMIncorrect / lineWLMIncorrect when the word is still wrong
        without its last haraka. When the lengths differ, both lines are
        printed and no per-word counter changes.

        Any other type of vocalized_output prints a message and terminates
        the program through sys.exit().
        """
        # The per-line counters are reset on self: they are incremented on
        # self below, so resetting them on another object would leave them
        # accumulating across lines.
        self.lineCorrect = 0
        self.lineWLMIncorrect = 0

        inputlist = araby.tokenize(baseline)
        if isinstance(vocalized_output, list):
            outputlist = [x.get("chosen", '') for x in vocalized_output]
            outputlistsemi = [x.get("semi", '') for x in vocalized_output]
        elif isinstance(vocalized_output, str):
            outputlist = araby.tokenize(vocalized_output)
            outputlistsemi = [araby.strip_lastharaka(x) for x in outputlist]
        else:
            print("Incompatible vocalized output, must be list or string",
                  type(vocalized_output), vocalized_output)
            sys.exit()

        self.total += len(inputlist)
        self.lineTotal = len(inputlist)
        if len(inputlist) != len(outputlist):
            print("lists haven't the same length")
            print(len(inputlist), len(outputlist))
            print(u"# ".join(inputlist).encode('utf8'))
            print(u"# ".join(outputlist).encode('utf8'))
        else:
            for inword, outword, outsemiword in zip(inputlist, outputlist,
                                                    outputlistsemi):
                simi = araby.vocalized_similarity(inword, outword)
                if simi < 0:
                    # A negative similarity is added, sign flipped, to the
                    # letters-error total.
                    self.LettersError += -simi
                    self.incorrect += 1
                    # evaluation without last haraka
                    simi2 = araby.vocalized_similarity(inword, outsemiword)
                    if simi2 < 0:
                        self.WLMIncorrect += 1
                        self.lineWLMIncorrect += 1
                else:
                    self.correct += 1
                    self.lineCorrect += 1
        self.counter += 1
def preprocess(sentences, stopwords, isStopword = False):
    """
    Normalize and tokenize a sequence of Arabic sentences.

    Each sentence goes through, in order:
        1. araby.strip_harakat
        2. araby.strip_tashkeel
        3. araby.strip_lastharaka
        4. araby.strip_tatweel
        5. araby.strip_shadda
        6. araby.normalize_ligature
        7. araby.normalize_hamza
        8. clean_str
        9. extraction of the leading run matched by the regular expression
        10. araby.tokenize, optional stop-word removal, and removal of
            newline tokens

    Sentences whose text has no leading match, or whose tokenization step
    raises, are skipped, so the result may have fewer rows than the input.

    @param sentences: iterable of sentences to process.
    @param stopwords: stop words handed to remove_stopwords.
    @param isStopword: when true, stop-word removal is skipped.
    @return: list of token lists, one per sentence that was kept.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # NOTE(review): in this raw string the doubled backslashes make
            # the class exclude the literal characters \ n s p { L a t i }
            # rather than newline, whitespace and Latin script (the re
            # module has no \p{...} support). The pattern is kept unchanged
            # because correcting it would alter which part of the text is
            # kept -- confirm the intent before touching it.
            leading = re.match(r'[^\\n\\s\\p{Latin}]+', text)
            if leading is None:
                # Nothing usable at the start of the text: skip the sentence.
                continue
            tokens = araby.tokenize(leading.group())
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            output.append([t for t in tokens if t != '\n'])
        except Exception:
            # Best effort: a sentence that cannot be tokenized is skipped.
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            continue

    return output
Пример #16
0
 def vocalize(self, verb, proclitic, enclitic):
     """
     Join a verb with every vocalized form of its proclitic and enclitic.

     The verb ending is first adjusted when an enclitic is present; then
     each "vocalized" prefix form is combined with each "vocalized"
     enclitic form found in the SVC tables. An affix missing from its table
     produces no combination.
     @param verb: verb found in dictionary.
     @type verb: unicode.
     @param proclitic: first level prefix.
     @type proclitic: unicode.
     @param enclitic: first level suffix.
     @type enclitic: unicode.
     @return: list of (vocalized word, semivocalized word); the
     semivocalized word uses the verb with its last haraka stripped.
     @rtype: list of (unicode, unicode).
     """
     if enclitic:
         # Separating alef after waw is dropped.
         if verb.endswith(ar.WAW + ar.ALEF):
             verb = verb[:-1]
         # Same when the waw carries a sukun (the case of مشَوْا).
         if verb.endswith(ar.WAW + ar.SUKUN + ar.ALEF):
             verb = verb[:-1]
         # Final alef maksura becomes a plain alef.
         if verb.endswith(ar.ALEF_MAKSURA):
             verb = verb[:-1] + ar.ALEF
         # Ending teh + damma + meem (+ sukun): the sukun, if any, is
         # replaced and damma + waw is appended before the enclitic.
         if verb.endswith(ar.TEH + ar.DAMMA + ar.MEEM + ar.SUKUN):
             verb = verb[:-1] + ar.DAMMA + ar.WAW
         if verb.endswith(ar.TEH + ar.DAMMA + ar.MEEM):
             verb = verb + ar.DAMMA + ar.WAW

     combinations = []
     prefix_forms = SVC.COMP_PREFIX_LIST_TAGS.get(proclitic, {}).get(
         "vocalized", '')
     for proclitic_voc in prefix_forms:
         enclitic_forms = SVC.COMP_SUFFIX_LIST_TAGS.get(enclitic, {}).get(
             "vocalized", '')
         for enclitic_form in enclitic_forms:
             enclitic_voc = self.get_enclitic_variant(verb, enclitic_form)
             full_form = ''.join((proclitic_voc, verb, enclitic_voc))
             semi_form = ''.join(
                 (proclitic_voc, ar.strip_lastharaka(verb), enclitic_voc))
             combinations.append((full_form, semi_form))
     return combinations
Пример #17
0
def get_suffix_variants(word, suffix, enclitic, mankous = False):
    """
    Return the suffix as it should be attached to the word, in two forms.

    A teh marbuta inside the suffix is turned into teh when an enclitic
    follows. Without an enclitic, a haraka suffix is emptied when the word
    ends with yeh or alef, or replaced by kasratan when the noun is flagged
    as mankous.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @param mankous: if the noun is mankous (ends with yeh).
    @type mankous: boolean.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    attached_suffix = suffix  # default: keep the suffix as given

    if araby.TEH_MARBUTA in suffix and bare_enclitic:
        attached_suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not bare_enclitic and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            attached_suffix = u""
        elif mankous:
            # The yeh of the mankous noun has already been removed by the
            # caller; its mark becomes kasratan.
            attached_suffix = araby.KASRATAN

    # The tag table is keyed by the suffix as given: the adjusted suffix may
    # not exist in the table.
    suffix_tags = snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in suffix_tags:
        without_irab_mark = araby.strip_lastharaka(attached_suffix)
    else:
        without_irab_mark = attached_suffix
    return attached_suffix, without_irab_mark