Example #1
    def __init__(self, result_dict = None, order = -1):
        # ToDo
        # copy the superclass attributes to the current class
        #~ stemmedword.stemmedWord.__init__(self, result_dict.get_dict())
        
        if result_dict: 
            self.__dict__ = result_dict.__dict__.copy()
            self.unvocalized =  araby.strip_tashkeel(self.vocalized)
            self.unvoriginal =  araby.strip_tashkeel(self.original)
        self.tag_verbal_factor  =   0
        self.tag_nominal_factor =   0
        self.tag_kana_rafe3     =   False 
        if self.is_verb():
            self.tag_kana_rafe3 =   self._is_kana_rafe3() 
        if self.is_stopword():
            self.tag_kana_rafe3 =   self._is_kana_rafe3()  
            self.tag_nominal_factor = self.__get_nominal_factor()
            #verbal factor
            self.tag_verbal_factor  = self.__get_verbal_factor()

        self.tag_addition =  self._is_addition()                

        self.tag_break =  self._is_break() 
        self.forced_word_case = False
        self.syntax = u""    # used for syntactic analysis purposes
        self.semantic = u""  # used for semantic analysis purposes
        self.forced_wordtype = False        
        self.order =  order
        self.next =  {}
        self.previous =  {}
        self.sem_next =  {}
        self.sem_previous =  {}
        self.score =  0
        self.rule = 0  # rule used to select the current case in vocalization
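All of the snippets on this page revolve around araby.strip_tashkeel from pyarabic, which removes diacritical marks. As a quick orientation, a minimal sketch (assuming only that pyarabic is installed):

import pyarabic.araby as araby

word = u"الْعَرَبِيَّةُ"
# strip_tashkeel removes all tashkeel marks, including the shadda;
# strip_harakat removes only the short vowels and keeps the shadda.
print(araby.strip_tashkeel(word))   # العربية
print(araby.strip_harakat(word))    # العربيّة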
Example #2
def check_normalized(word_vocalised, resulted_data):
    """
    Filter the dictionary results according to the input word,
    to handle normalization cases.
    If the entered word matches a found word after normalization,
    the analyzer returns all similarly vocalized words.
    For example, if the word is ذئب, the normalized form is ذءب,
    which can match both ذئب and ذؤب in the dictionary.
    This function filters the normalized results according to
    the given word, and returns ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtered_data = []
    inputword = araby.strip_tashkeel(word_vocalised)
    for item in resulted_data:
        if 'vocalized' in item.__dict__:
            outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
            if inputword == outputword:
                filtered_data.append(item)
    return filtered_data
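The filtering idea above does not depend on the analyzer's result type: it only compares unvocalized skeletons. A standalone sketch (the Result class is a hypothetical stand-in for the analyzer's result objects):

import pyarabic.araby as araby

class Result(object):
    # hypothetical stand-in for an analyzer result object
    def __init__(self, vocalized):
        self.vocalized = vocalized

def filter_like(word, results):
    # keep results whose unvocalized skeleton equals the input's skeleton
    target = araby.strip_tashkeel(word)
    return [r for r in results
            if araby.strip_tashkeel(r.vocalized) == target]

candidates = [Result(u"ذِئْب"), Result(u"ذُؤْب")]
print(len(filter_like(u"ذئب", candidates)))  # 1: only ذِئْب survives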
Example #3
def get_word_variant(word, suffix, enclitic):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param enclitic: enclitic (second level).
    @type enclitic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    
    suffix_nm = araby.strip_tashkeel(suffix)

    enclitic_nm = araby.strip_tashkeel(enclitic)
    long_suffix_nm = suffix_nm + enclitic_nm
    #if the word ends by a haraka
    word_stem = araby.strip_lastharaka(word_stem)
    
    # a feminine noun ending with teh marbuta: drop it before
    # suffixes like ات and ية
    if word_stem.endswith(araby.TEH_MARBUTA):
        if suffix_nm in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,
                         araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
            word_stem = word_stem[:-1]
        # otherwise open the teh marbuta into teh before joining,
        # e.g. مدرسة + ين = مدرستين
        elif long_suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.TEH

    elif word_stem.endswith(araby.ALEF_MAKSURA):
        # a defective noun (مقصور): the alef maksura becomes yeh before
        # a suffix with letters, e.g. مستوى + ان = مستويان
        if suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.YEH
        # if the suffix is only harakat plus an attached pronoun,
        # the alef maksura becomes alef
        elif enclitic_nm != u"":
            word_stem = word_stem[:-1] + araby.ALEF
    elif word_stem.endswith(araby.KASRA + araby.YEH):
        # a defective noun (منقوص) ending with yeh preceded by kasra:
        # if there is no pronoun and the suffix is only harakat,
        # drop the final kasra + yeh
        if not enclitic_nm and not suffix_nm:
            word_stem = word_stem[:-2]

    # adjust a final hamza according to the suffix's short vowel
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
        elif (word_stem.endswith(araby.YEH + araby.HAMZA)
              or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
                and suffix.startswith(araby.FATHATAN):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
Example #4
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.

    Example:
        >>> txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
        >>> wordlist = araby.tokenize(txt)
        >>> positions_phrases = detect_number_phrases_position(wordlist)
        >>> print positions_phrases
        [(1, 3), (6, 7)]

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start,end),(start2,end2),]
    @rtype: list of tuple
    """
    phrases = []
    startnumber = -1
    endnumber = -1
    for i, word in enumerate(wordlist):
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else: nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
             u'اثنتا')  or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
            # phrase.append(word)
        else:
            if startnumber >= 0:  # there is a previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrases
    if startnumber >= 0:  # there is a previous number phrase
        phrases.append((startnumber, endnumber))

    return phrases
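A quick driver for the function above, reusing the docstring's own example (assuming pyarabic and the surrounding number module are importable):

import pyarabic.araby as araby

txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
wordlist = araby.tokenize(txt)
print(detect_number_phrases_position(wordlist))  # [(1, 3), (6, 7)]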
Example #5
    def is_possible_collocation(self, list2, context = "", lenght = 2):
        """
        Guess if the given list is a possible collocation
        This is used to collect unkown collocations, from user input
        return True oor false
        @param wordlist: word of list, 2 or more words.
        @type wordlist: list of unicode.
        @param lenght: minimum number of words in the collocation
        @type lenght: integer.        
        @return : the rule of found collocation, 100 default.
        @rtype: interger.
        """        
        if len(list2) < length:
            return 0
        else:
            item_v1 = list2[0]
            item_v2 = list2[1]
            item1 = araby.strip_tashkeel(item_v1)
            item2 = araby.strip_tashkeel(item_v2)        
            #if item1[-1:] in (u".", u"?", u", ", u'[', u']', u'(', ')'):
            #    return 0
            if  not cconst.token_pat.search(item1) or not \
            cconst.token_pat.search(item2) :
                return -1
            #else: return 100
            elif item1 in cconst.ADDITIONAL_WORDS :
                return 10
            elif item1 in cconst.NAMED_PRIOR :
                return 15            
            elif (item2 not in cconst.SPECIAL_DEFINED):
                if  item2.startswith(u'ال') and  item1.startswith(u'ال'):
                    return 20
                elif item1.endswith(u'ة') and item2.startswith(u'ال'):
                    return 30

                # words beginning with لل (lam of jarr + definite article)
                # need no rule: they are always majrour
                #if  item2.startswith(u'لل'):
                #    return 40
                elif item1.endswith(u'ة') and item2.endswith(u'ة')  :
                    return 40
                #if item1.endswith(u'ي') and item2.endswith(u'ي'):
                #    return 60

                elif  context != u"" and context in cconst.tab_noun_context \
                and item2.startswith(u'ال') :
                    return 50
                #return True

                elif item1.endswith(u'ات') and item2.startswith(u'ال') :
                    return 60
            return 100
Example #6
def detect_numbers(wordlist):
    """
    Detect number words in a text and return a taglist as BIO.

    Example:
        >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        >>> detect_numbers(wordlist)
        ['DO', 'DB', 'DI', 'DI', 'DO', 'DO', 'DB', 'DI', 'DO']

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of tags BIO
    @rtype: list of unicode
    """
    starts = False
    taglist = []

    for i, word in enumerate(wordlist):
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else:
            nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if word_nm and not starts and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
             u'اثنتا')  or nextword in (u'عشر', u'عشرة'):
                if not starts:
                    taglist.append("DB")
                    starts = True
                else:
                    taglist.append("DI")
            else:
                starts = False
                taglist.append("O")
        else:
            starts = False
            taglist.append("O")
    return taglist
Example #7
def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    #if the word ends by a haraka strip the haraka if the suffix is not null
    if suffix:
        word_stem = araby.strip_lastharaka(word_stem)

    if word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.YEH            
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
            
    return word_stem
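A short illustration of the two rewrite rules above (a sketch; the expected outputs follow from reading the code, not from a verified run):

import pyarabic.araby as araby

# alef maksura becomes yeh before a letter-bearing suffix:
print(get_word_variant(u"مستشفى", u"ان"))            # مستشفي
# a final hamza follows the first short vowel of the suffix:
print(get_word_variant(u"شيء", araby.KASRA + u"ه"))  # شيئ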
Example #8
 def search_arabic(self, q, fetch_subgraph = True, limit = DEFAULT_LIMIT,
                   fetchplan = DEFAULT_FETCHPLAN):
     """
     Searches for given label intelligently handling vocalization.
     (This does not make much sense without a fetchplan as you will get
     index nodes only.)
     
     """
     # If the query is not vocalized, search the unvocalized index and
     # possibly return the subtree
     if not araby.is_vocalized(q):
         return self.search_index(q, fetch_subgraph,
                                  "ArabicNode.unvocalized_label", limit,
                                  fetchplan)
         
     # If it is vocalized, search unvocalized index and check for
     # "compatibility" of vocalization
     matches = self.search_index(araby.strip_tashkeel(q), False,
                                 "ArabicNode.unvocalized_label", limit)
     rids = [n.rid for n in matches.primary_results
             if Tools.is_vocalized_like(q, n.data["label"])]
     # Ignore vocalization if there is no compatible one
     if not rids:
         rids = [n.rid for n in matches.primary_results]
     return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)
Example #9
File: adaat.py Project: linuxscout/mishkal
def wordtag(text):
    """
    word tagginginto noun, verb, tool
    """
    import naftawayh.wordtag
    tagger = naftawayh.wordtag.WordTagger()
    word_list = token_text(text)

    if len(word_list) == 0:
        return []
    else:
        list_result = []
        second_previous = u""
        previous = u""
        for word in word_list:
            word_nm = araby.strip_tashkeel(word)
            tag = ''
            if tagger.is_stopword(word):
                tag = 't'
            else:
                if tagger.is_noun(word):
                    tag += 'n'
                if tagger.is_verb(word):
                    tag += 'v'
                if tag in ("", "nv"):
                    tag = tagger.context_analyse(previous, word)+"1"
                    if tag in ("", "nv1", "vn1"):
                        tag = tagger.context_analyse(u" ".join([second_previous, previous]), word)+"2"                    
            list_result.append({'word':word, 'tag': tag})
            second_previous = previous
            previous = word_nm
            #~previous_tag  =  tag
        return list_result
Example #10
def detect_number_words(text):
    """
    Detect number words in a text.
    @param text: input text
    @type text: unicode
    @return: number words extracted from text
    @rtype: integer
    >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
    خمسمئة وثلاثة وعشرين
    """

    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            vocalized = vocalize_number(araby.strip_tashkeel(\
            numberedwords).split(' '), tags)                
            #calcul  vocalization similarity : 
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, \
                nextword)                    
            if sim < 0:
                print u'\t'.join([str(sim), numberedwords, vocalized, \
                 str(numeric), u' '.join([previous, phrase, nextword]), \
                  nextword, voc_unit, str(sim_unit)]).encode('utf8')
Example #11
File: stem_noun.py Project: tazjel/mishkal
def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    #if the word ends by a haraka
    word_stem = araby.strip_lastharaka(word_stem)

    if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (
    araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA, 
    araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
        word_stem = word_stem[:-1]
    elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.TEH
    elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.YEH            
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
        elif (word_stem.endswith(araby.YEH + araby.HAMZA)
              or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
                and suffix.startswith(araby.FATHATAN):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA            
    return word_stem
Example #12
File: stem_noun.py Project: tazjel/mishkal
def get_suffix_variants(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix #default value
    #if the word ends by a haraka
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)

    elif not enclitic_nm and word[-1:] in (araby.YEH, araby.ALEF) \
            and araby.is_haraka(suffix):
        newsuffix = u""
    # generate the suffix without the I'rab short mark
    # here we lookup with given suffix because the new suffix is 
    # changed and can be not found in table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark 
Example #13
File: named.py Project: assem-ch/mishkal
def vocalize_named(wordlist, syn_tags = ""):
    """ Vocalize a number words
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist = []
    # tags about the clause can be passed in via syn_tags
    tags = syn_tags
    bin_count = 0
    for i in range(len(wordlist)):
        #save the original word with possible harakat if exist
        word = wordlist[i]    
        word_nm = araby.strip_tashkeel(word)    
        # the first word can have prefixes 
        if i == 0 and word_nm:  
            # word to get majrour tag
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"    
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"    
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"    
Example #14
 def set_vocalized(self, newvocalized):
     """
     Set the vocalized word
     @param newvocalized: the new given vocalized.
     @type newvocalized: unicode string
     """
     self.vocalized = newvocalized
     self.unvocalized = araby.strip_tashkeel(newvocalized)
Example #15
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
    >>> detect_number_phrases_position(wordlist)
    [(1, 3), (6, 7)]
    """
    phrases = []
    startnumber = -1
    endnumber = -1
    for i, word in enumerate(wordlist):
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else: nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes 
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي',
                           u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
            # phrase.append(word)
        else:
            if startnumber >= 0:  # there is a previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrases 
    if startnumber >= 0:  # there is a previous number phrase
        phrases.append((startnumber, endnumber))

    return phrases
Example #16
def detect_numbers(wordlist):
    """
    Detect number words in a text and return a taglist as BIO.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of BIO tags
    @rtype: list of unicode
    >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
    >>> detect_numbers(wordlist)
    ['DO', 'DB', 'DI', 'DI', 'DO', 'DO', 'DB', 'DI', 'DO']
    """
    starts = False
    taglist = []
       
    for i, word in enumerate(wordlist):
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else: 
            nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes 
        if word_nm and not starts and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي',
                           u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if not starts:
                    taglist.append("DB")
                    starts = True
                else:
                    taglist.append("DI")
            else:
                starts = False
                taglist.append("DO")       
        else:
            starts = False
            taglist.append("DO")
    return taglist
Example #17
File: wordtag.py Project: assem-ch/mishkal
    def is_stopword(self, word):
        """
        Return True if the word is a stopword, according to a predefined list.
        @param word: the previous word.
        @type word: unicode.

        @return: is the word a stop word
        @rtype: Boolean
        """
        return word in stopwords.STOPWORDS or \
            araby.strip_tashkeel(word) in stopwords.STOPWORDS
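A short driver (assuming the naftawayh package, whose WordTagger defines this method in the project above):

import naftawayh.wordtag

tagger = naftawayh.wordtag.WordTagger()
# a vocalized form still matches, because the lookup also tries
# the strip_tashkeel'ed form
print(tagger.is_stopword(u'أَوْ'))  # True, like its bare form أو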
Example #18
 def lookup(self, word):
     """
      Look up the vocalizations given by the user for an unrecognized word.
     @return: vocalized word
     @rtype: list of unicode
     """
     word = araby.strip_tashkeel(word)
     if word in self.dictio:
         return self.dictio[word]
     else:
         return [word, ]
Example #19
 def add_entry(self, entry):
     """
     Add lexicon entry to lexicon. Updates "lookup table" 
     entries_by_surface_form.
     
     """
     self.entries.append(entry)
     self.roots.add(entry.root)
     for s in entry.get_surface_forms():
         # Add vocalized and non-vocalized forms
         self.entries_by_surface_form[s].append(entry)
         self.entries_by_surface_form[araby.strip_tashkeel(s)].append(entry)
Example #20
 def add(self, word, suggestList):
     if word != u"" and suggestList and isinstance(suggestList, list):
         # ToDo: merge different suggestions into one list;
         # NB: this is a time eater when the word is frequent.
         # if word in self.dict:
         #     # if the dict already has suggestions for the word,
         #     # add the new ones and remove duplicates:
         #     suggestList += self.dict[word]
         #     suggestList = set(suggestList)
         #     self.dict[word] = suggestList
         # else:
         self.dict[araby.strip_tashkeel(word)] = suggestList
Example #21
def validate_tags(stop_tuple, affix_tags, procletic, encletic_nm,
                  suffix_nm):
    """
    Test if the given word from the dictionary is compatible with the affix tags.
    @param stop_tuple: the input word attributes given from dictionary.
    @type stop_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags:list.
    @param procletic: first level prefix (vocalized).
    @type procletic: unicode.
    @param encletic_nm: second level suffix (unvocalized).
    @type encletic_nm: unicode.
    @param suffix_nm: first level suffix (unvocalized).
    @type suffix_nm: unicode.
    @return: True if the tags are compatible.
    @rtype: Boolean.
    """
    procletic = araby.strip_tashkeel(procletic)
    encletic = encletic_nm
    suffix = suffix_nm

    if u"تعريف" in affix_tags and not stop_tuple['definition']:
        return False;
    if u"تعريف" in affix_tags and stop_tuple['defined']:
        return False;        
        #~preposition 
    if  u':جر:'in affix_tags and not stop_tuple['preposition']:
        return False 
    if  u':جر:'in affix_tags and not stop_tuple['preposition']:
        return False 
    if u"متحرك" in affix_tags  and  not stop_tuple['is_inflected']:
        return False  
    if u"مضاف" in affix_tags and not stop_tuple['pronoun']:
        return False 
    if u"مضاف" in affix_tags and stop_tuple['defined']:
        return False 
        #~interrog
    if u"استفهام" in affix_tags and not stop_tuple['interrog']:
        return False          
    # qasam (oath)
    if u"قسم" in affix_tags and not stop_tuple['qasam']:
        return False
    # tanwin
    if u"تنوين" in affix_tags and not stop_tuple['tanwin']:
        return False
    return True
Example #22
    def search(self, citation_form):
        """
        Search for the given citation form. If not found, try again without diacritics.
        
        Returns a list of ArabicDictionaryEntries. 
        """
        if citation_form not in self.entries_by_surface_form:
            citation_form = araby.strip_tashkeel(citation_form)

        try:
            return self.entries_by_surface_form[citation_form]
        except KeyError:  # dict lookups raise KeyError, not IndexError
            return []
Example #23
def main():
    for table in tags.keys():
        for word in eval(table):
            word_nm = araby.strip_tashkeel(word)
            if word_nm not in factor_table:
                factor_table[word_nm] = {word: [tags.get(table, ""), ], }
            else:
                if word not in factor_table[word_nm]:
                    factor_table[word_nm][word] = [tags.get(table, ""), ]
                else:
                    factor_table[word_nm][word].append(tags.get(table, ""))
                    
    for item in factor_table:
        print (u"u'%s':"%item).encode("utf8"), utf8repr.repr(factor_table[item]).encode('utf8'), ','
Example #24
def vocalize_named(wordlist, syn_tags = ""):
    """ Vocalize a number words
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist = []
    # tags about the clause can be passed in via syn_tags
    tags = syn_tags
    bin_count = 0
    for i in range(len(wordlist)):
        #save the original word with possible harakat if exist
        word = wordlist[i]    
        word_nm = araby.strip_tashkeel(word)    
        # the first word can have prefixes 
        if i == 0 and word_nm:  
            # word to get majrour tag
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"    
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"    
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"    
        # select vocalization
        if word_nm == u'بن':
            bin_count += 1    
            #treat first bin according to tags
            if bin_count == 1:
                if u'مجرور' in tags:
                    voc = u'بْنِ'
                elif u'مرفوع' in tags:
                    voc = u'بْنُ'
                elif u'منصوب' in tags:
                    voc = u'بْنَ'
                else:
                    voc = u'بْن'
            else:
                # subsequent بن are always majrour (مجرور)
                voc = u'بْنِ'
        #Todo Vocalize names
        else:
            voc = word    
        newlist.append(voc)    
    return newlist    
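A small driver for the function above (a sketch; the output follows the code path for a clause already tagged majrour, assuming pyarabic is available):

import pyarabic.araby as araby

words = araby.tokenize(u"عبد الله بن عمر")
print(u' '.join(vocalize_named(words, syn_tags=u"مجرور")))
# -> عبد الله بْنِ عمر : only بن receives a vocalization here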
Example #25
 def get_unvocalized(self, ):
     """
     Get the unvocalized form of the input word
     @return: the given unvocalized.
     @rtype: unicode string
     """
     if self.unvocalized:
         return self.unvocalized
     else:
         if self.vocalized:
             self.unvocalized = araby.strip_tashkeel(self.vocalized)
         else :
             return u""
     return self.unvocalized
Example #26
 def get_unvoriginal(self, ):
     """
     Get the unvocalized  original form of the input word
     @return: the given unvocalized original.
     @rtype: unicode string
     """
     if self.unvoriginal:
         return self.unvoriginal            
     else :
         if self.original:
             self.unvoriginal =  araby.strip_tashkeel(self.original)
         else:
             return u""
         return self.unvoriginal
Example #27
def get_stem_variants(stem, prefix, suffix):
    """
    Generate the Noun stem variants according to the affixes.
    For example مدرستي = >مدرست+ي  = > مدرسة +ي.
    Return a list of possible cases.
    @param stem: the input stem.
    @type stem: unicode.
    @param prefix: prefix.
    @type prefix: unicode.
    @param suffix: suffix.
    @type suffix: unicode.
    @return: list of stem variants.
    @rtype: list of unicode.
    """
    #some cases must have some correction
    #determinate the prefix and suffix types
    # create a list, the first item is the verb without changes
    prefix_possible_noun_list = set([stem])
    # Prefix
    prefix = araby.strip_tashkeel(prefix) 
    suffix = araby.strip_tashkeel(suffix) 
    possible_noun_list = prefix_possible_noun_list 
    if suffix in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,
        araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
        possible_noun = stem+araby.TEH_MARBUTA 
        possible_noun_list.add(possible_noun)
    if suffix in (u"", araby.YEH + araby.NOON, araby.WAW + araby.NOON):
        possible_noun = stem+araby.YEH 
        possible_noun_list.add(possible_noun)
    if stem.endswith(araby.YEH):
        possible_noun = stem[:-1]+araby.ALEF_MAKSURA 
        possible_noun_list.add(possible_noun)
    #to be validated
    validated_list = possible_noun_list 
    return validated_list
Example #28
def get_future_type(word):
    """ Guess the future-tense haraka of a triliteral verb by looking it
    up in TriVerbTable; defaults to FATHA. """
    word_nm = araby.strip_tashkeel(word)
    v = word
    if len(word_nm) != 3:
        return araby.FATHA
    elif word_nm[1] == araby.ALEF:
        v = word_nm[0]+araby.FATHA + araby.ALEF + word_nm[2] +araby.FATHA
    elif word_nm.startswith(araby.ALEF_MAKSURA):
        v = word_nm[0]+araby.FATHA + word_nm[1] + araby.FATHA +word_nm[2]

    for i in (1,2,3,4,5,6):
        v2 = v+str(i)
        if v2 in triverbtable.TriVerbTable:
            return triverbtable.TriVerbTable[v2]['haraka']
    return araby.FATHA
Example #29
def lookup(s):
    """
    Looks up a unicode string s in Langenscheidt online dict
    Returns a list of unicode tuples (lemma, transcription, translations)
    
    """
    # Langenscheidt cannot handle tashkeel ...
    s = araby.strip_tashkeel(s).encode("utf-8")
    url = "http://de.langenscheidt.com/arabisch-deutsch/%s" % s
    #print "Fetching %s" % url
    doc = urllib.urlopen(url).read()
    root = soupparser.fromstring(doc)
    children = get_children_by_class(root, RESULT_BLOCK_CLASS)
    results = [handle_result_block(e) for e in children]
    return [r for r in results if r]
Example #30
 def create_arabic_node(self, cluster_name, label, **kwargs):
     """
      Checks that label is an Arabic string, removes tatweel and normalizes
      ligatures. Adds unvocalized_label.
     
     """
     label = araby.normalize_ligature(araby.strip_tatweel(label))
     label = label.replace(araby.SMALL_ALEF, "")
     if not araby.is_arabicstring(label):
         raise RuntimeError("'%s' is not an Arabic string" % label)
     
     if "unvocalized_label" not in kwargs:
         kwargs["unvocalized_label"] = araby.strip_tashkeel(label)
     
     return self.create_node(cluster_name, label, **kwargs)
Example #31
File: named.py Project: d7eame/pyarabic
def get_previous_tag(word):
    """Get the word tags
    @param word: given word
    @type word: unicode
    @return: word tag
    @rtype: unicode
    """
    word = araby.strip_tashkeel(word)
    #~ tags = u''
    if word in named_const.NOUN_NASEB_LIST:
        return u'منصوب'
    elif word in named_const.JAR_LIST:
        return u'مجرور'
    elif word in named_const.RAFE3_LIST:
        return u'مرفوع'
    else:
        return u''
Example #32
File: number.py Project: Guibod/pyarabic
def vocalize_unit(numeric, unit):
    """ Vocalize a number words
    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit, or the unit unchanged if it isn't a unit word.
    @rtype: unicode
    """
    #detect tags
    # The given word is not a unit
    unit_nm = araby.strip_tashkeel(unit)
    if not is_unit(unit_nm):
        return unit
    tags = ""
    vocalizedunit = unit

    # the numbers one and two take the unit as an adjective placed
    # after it; this case is not handled here

    if numeric >= 0 and numeric <= 2:
        return unit
    # the tamyiz of مئة/ألف/مليون/مليار takes a singular majrour
    # (annexed) unit, e.g. ألف رجل
    elif numeric % 100 == 0 or numeric % 1000 == 0:
        tags = 'SingleMajrour'
        vocalizedunit = nbconst.UnitWords[unit_nm]['a']
    # a simple number takes a plural (annexed) unit
    elif numeric % 100 <= 10:
        tags += "Plural"
        vocalizedunit = nbconst.UnitWords[unit_nm]['p']

    elif numeric % 100 < 100:
        tags += 'SingleMansoub'
        vocalizedunit = nbconst.UnitWords[unit_nm]['n']
    else:
        tags = ''
        vocalizedunit = nbconst.UnitWords[unit_nm]['i']
    if not vocalizedunit:
        return 'Error' + tags
    else:
        return vocalizedunit
Example #33
def clean_str(text):  # Option 1: normalizing
    search = [u"أ",u"إ",u"آ",u"ة",u"_",u"-",u"/",u".",u"،",u" و ",u" يا ",u'"',u"ـ",u"'",u"ى",u"\\",u'\n', u'\t',u'&quot;',u'?',u'؟',u'!']
    replace = [u"ا",u"ا",u"ا",u"ه",u" ",u" ",u"",u"",u"",u" و",u" يا",u' " ',u"",u"",u"ي",u"",u' ', u' ',u' ',u' ',u' ',u' ! ']
    text=araby.normalize_ligature(text)
    text=unicodedata.normalize('NFKD',text)
    text=araby.strip_tashkeel(text)#remove tashkeel
    p_longation = re.compile(r'(.)\1+')#remove longation
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace(u'وو', u'و')
    text = text.replace(u'يي', u'ي')
    text = text.replace(u'اا', u'ا')
    for i in range(0, len(search)):
        text = text.replace(unicodedata.normalize('NFKD',search[i]), unicodedata.normalize('NFKD',replace[i]))
    # likely re-composing yeh-hamza: the NFKD step above decomposes
    # ئ into yeh + combining hamza
    text = text.replace(u'ئ', u'ئ')
    # trim
    text = text.strip()
    return text
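A quick driver for clean_str (a sketch; it assumes the imports the snippet itself relies on but does not show):

import re
import unicodedata
import pyarabic.araby as araby

print(clean_str(u"مدرســـــة جمييييلة"))  # -> مدرسه جميله
# tatweel removed, repeated letters collapsed, ة normalized to ه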
Example #34
    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(REGEX_URL_STEP1, "[رابط]", text)
            text = re.sub(REGEX_URL_STEP2, "[رابط]", text)
            text = re.sub(REGEX_URL, "[رابط]", text)
            text = re.sub(REGEX_EMAIL, "[بريد]", text)
            text = re.sub(REGEX_MENTION, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]",
                          " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        text = " ".join(text.split())

        return text
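In the released arabert package this method sits behind a preprocessor class; a usage sketch, assuming the package's ArabertPreprocessor entry point and a v1 model name (both worth checking against the package docs):

from arabert.preprocess import ArabertPreprocessor

# the model_name selects v1 behaviour, including Farasa segmentation
prep = ArabertPreprocessor(model_name="aubmindlab/bert-base-arabert")
print(prep.preprocess(u"ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"))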
Example #35
    def generate_affix_list(self, vocalized=True):
        """ generate all affixes """
        word = u"قصد"
        # generate all possible word forms
        forms = self.generate_forms(word)
        # remove diacritics
        if not vocalized:
            list_affixes = [araby.strip_tashkeel(d[0]) for d in forms]
        else:
            list_affixes = [d[0] for d in forms]
        # remove duplicates
        list_affixes = list(set(list_affixes))
        # remove the stem and keep only the affixes;
        # these variants represent the verb vocalizations used in conjugation
        variants = [u'قَصَد', u'قْصَد', u"قصد"]
        for word in variants:
            list_affixes = [x.replace(word, '-') for x in list_affixes]

        return list_affixes
Example #36
    def is_stopword(self, word):
        """
        Return True if the word is a stopword, according to a predefined list.
        @param word: the previous word.
        @type word: unicode.

        Example:
            >>> import naftawayh.wordtag
            >>> word_list = (u'بالبلاد', u'بينما', u'أو', u'انسحاب', u'انعدام',
            ...              u'انفجار', u'البرنامج', u'بانفعالاتها', u'العربي',
            ...              u'الصرفي', u'التطرف', u'اقتصادي', )
            >>> tagger = naftawayh.wordtag.WordTagger()
            >>> # test word by word
            >>> for word in word_list:
            ...     if tagger.is_noun(word):
            ...         print(u'%s is noun' % word)
            ...     if tagger.is_verb(word):
            ...         print(u'%s is verb' % word)
            ...     if tagger.is_stopword(word):
            ...         print(u'%s is stopword' % word)
            بالبلاد is noun
            بينما is noun
            بينما is verb
            أو is noun
            أو is verb
            أو is stopword
            انسحاب is noun
            انعدام is noun
            انفجار is noun
            البرنامج is noun
            بانفعالاتها is noun
            العربي is noun
            الصرفي is noun
            التطرف is noun
            اقتصادي is noun

        
        @return: is the word a stop word
        @rtype: Boolean
        """
        word_nm = araby.strip_tashkeel(word)
        return word in stopwords.STOPWORDS or word_nm in stopwords.STOPWORDS
Example #37
def preprocess(sentences, stopwords, isStopword = False):
  """
    This takes in an array of complete Arabic sentences, and performs the
    following operations on all of them:
        1.) strips tashkeel
        2.) strips harakat
        3.) strips lastharaka
        4.) strips tatweel
        5.) strips shadda
        6.) normalizes lam-alef ligatures
        7.) normalizes hamza
        8.) tokenizes

    Returns a 2D matrix, where each row holds the normalized tokens of one sentence.
  """
  #print("SENTENCE INDEX!!!", sentences[0])
  output = []
  for sentence in sentences:
    #print("Before Preprocessing:"+ sentence)
    #print(sentence)
    text = araby.strip_harakat(sentence)
    #print("TEXT!!!!", text)
    text = araby.strip_tashkeel(text)
    text = araby.strip_lastharaka(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_hamza(text)
    text = clean_str(text)
    #print("After Preprocessing:"+ text)
    #print("----")
    #print(text)
    try:
      # NB: the stdlib re module does not support \p{Latin}, and the doubled
      # backslashes make this character class exclude a handful of literal
      # characters rather than "newlines, whitespace and Latin letters";
      # the intended match would need the third-party regex module.
      text = re.match(r'[^\\n\\s\\p{Latin}]+', text).group()
      tokens = araby.tokenize(text)
      if not isStopword:
        tokens = remove_stopwords(stopwords, tokens)
      tokens = [t for t in tokens if t != '\n']
      output.append(tokens)
    except:
      pass
  
  return output
Example #38
    def segment(self, word):
        """ generate  a list of  all possible segmentation positions
        (lef, right)  of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'فتضربين'
            >>> print ArListem.segment(word)
            set(([(1, 5), (2, 5), (0, 7)])

        @return: List of segmentation
        @rtype: set of tuple of integer.
        """
        self.word = word
        self.unvocalized = araby.strip_tashkeel(word)
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        # get all lefts position of prefixes
        lefts = self.lookup_prefixes(word)
        # get all rights position of suffixes
        rights = self.lookup_suffixes(word)
        if lefts:
            self.left = max(lefts)
        else:
            self.left = -1
        if rights:
            self.right = min(rights)
        else:
            self.right = -1
        self.segment_list = set([(0, len(word))])
        for i in lefts:
            for j in rights:
                if j >= i + 2:
                    self.segment_list.add((i, j))
        # filter segment according to valid affixes list

        self.left, self.right = self.get_left_right(self.segment_list)
        return self.segment_list
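The doctest names ArListem; a runnable sketch, assuming this segment method belongs to Tashaphyne's light stemmer (where an ArabicLightStemmer class with this API exists):

from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
# returns all (left, right) stem boundaries found for the word
print(ArListem.segment(u'فتضربين'))  # e.g. set([(1, 5), (2, 5), (0, 7)])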
Example #39
def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)

    # convert alef maksura to yeh, as in إلى => إليك
    if word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm:
        if word_stem == u"سِوَى":
            word_stem = word_stem[:-1] + araby.ALEF
        else:
            word_stem = word_stem[:-1] + araby.YEH + araby.SUKUN
    # convert a final hamza according to its position
    elif word_stem.endswith(araby.HAMZA) and suffix_nm:
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA

    # this option is not used with stop words, because most of them
    # are not inflected (مبني);
    # if the word ends with a haraka, strip it when the suffix
    # starts with a haraka
    if suffix and suffix[0] in araby.HARAKAT:
        word_stem = araby.strip_lastharaka(word_stem)

    # assimilation (إدغام) of noon and yeh, as in فيّ، إليّ، عنّا، منّا
    if suffix.startswith(
            araby.NOON) and word.endswith(araby.NOON + araby.SUKUN):
        word_stem = araby.strip_lastharaka(word_stem)
    elif suffix.startswith(araby.KASRA +
                           araby.YEH) and word.endswith(araby.YEH +
                                                        araby.SUKUN):
        word_stem = araby.strip_lastharaka(word_stem)

    return word_stem
Example #40
    def get_noun_attributes(self, word):
        """
        return vocalized form
        """
        vocalized = word

        word_nm = araby.strip_tashkeel(word)
        foundlist = self.noun_dict.lookup(word_nm)
        word_tuple_res = None
        for word_tuple in foundlist:
            word_tuple = dict(word_tuple)
            # if found the same vocalization
            word_tuple_res = word_tuple
            break
        else:  # no vocalization, try the first one
            if foundlist:
                word_tuple_res = dict(foundlist[0])
            else:
                word_tuple_res = {"vocalized": word}
        return word_tuple_res
Example #41
def process(text):
	text = araby.strip_tashkeel(text)  # delete tashkeel
	
	text = re.sub(u'ـ+', ' ', text)  # delete tatweel (kashida)
	
	text = re.sub(u'ر+', u'ر', text)  # collapse repeated ra
	text = re.sub(u'اا+', u'ا', text)  # collapse repeated alef
	text = re.sub(u'ووو+', u'و', text)  # 3 or more waw become one
	text = re.sub(u'ههه+', u'ههه', text)  # more than 3 ha become three
	text = re.sub(u'ةة+', u'ة', text)  # collapse repeated teh marbuta
	text = re.sub(u'ييي+', u'ي', text)  # 3 or more yeh become one
	text = re.sub(u'أ', u'ا', text)  # after the above, to avoid mixing
	text = re.sub(u'آ', u'ا', text)  # after the above, to avoid mixing
	text = re.sub(u'إ', u'ا', text)  # after the above, to avoid mixing
	text = re.sub(u'ة', u'ه', text)  # after ةة+, to avoid mixing with ههه
	text = re.sub(u'ى', u'ي', text)
	
	text = " ".join(text.split()) #delete multispace
	
	return text
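A quick check of the normalization above (assuming re is imported and pyarabic.araby is imported as araby):

print(process(u"أهلاً وسهلاً بالقرّاء الكرام"))
# -> اهلا وسهلا بالقراء الكرام : tashkeel stripped, أ normalized to ا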
Example #42
File: number.py Project: Guibod/pyarabic
def detect_number_words(text):
    """
    Detect number words in a text.
    
    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين 
    
    @param text: input text
    @type text: unicode
    @return: number words extracted from text
    @rtype: integer
    """

    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            wordlist = araby.strip_tashkeel(numberedwords).split(' ')
            vocalized = vocalize_number(wordlist, tags)
            #calcul  vocalization similarity :
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, nextword)

            if sim < 0:
                print('\t'.join(
                    [str(sim), ' '.join(numberedwords),
                     ' '.join(vocalized)]).encode('utf8'))
                print(str(numeric), ' '.join([previous, phrase,
                                              nextword]).encode('utf8'))
                print('\t'.join([nextword, voc_unit,
                                 str(sim_unit)]).encode('utf8'))
Example #43
File: number.py Project: Guibod/pyarabic
def text2number(text):
    """
    Convert Arabic text into a number, for example convert تسعة وعشرون => 29.

    Example:
        >>> text2number(u"خمسمئة وثلاث وعشرون")
        523    
    
    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    #the result total is 0
    total = 0
    # the partial total for the three number
    partial = 0
    text = araby.strip_tashkeel(text)
    words = text.split(' ')
    for word in words:
        if word and word != 'واحد' and \
           word[0] in ('و', 'ف', 'ل', 'ب', 'ك'):
            word = word[1:]
        if word != 'واحد' and word.startswith('و'):
            word = word[1:]

        if word in nbconst.NumberWords:
            actualnumber = nbconst.NumberWords[word]
            if actualnumber % 1000 == 0:
                # the case of 1000 or 1 million
                if partial == 0:
                    partial = 1
                total += partial * actualnumber
                #re-initiate the partial total
                partial = 0
            else:
                partial += nbconst.NumberWords[word]
    # add the final partial to total
    total += partial
    return total
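Because of the strip_tashkeel call at the top, the conversion accepts vocalized input as well; a short driver reusing the doctest's example:

print(text2number(u"خمسمئة وثلاث وعشرون"))         # 523
print(text2number(u"خَمْسمِئة وثَلاثٌ وعِشْرون"))  # 523: tashkeel is ignored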
Example #44
    def get_verb_info(self, verb_tuple):
        """
        Get verb information
        """

        # get verb subclass
        verb_nm = araby.strip_tashkeel(verb_tuple['vocalized'])
        verb_class = ""
        if verb_nm.startswith(araby.WAW):
            verb_class = "W1W"  # Mithal (assimilated), waw
        elif verb_nm[-2:-1] == araby.ALEF:  # before last char
            if verb_tuple['future_type'] in (araby.DAMMA, u"ضمة"):
                verb_class = "W2W"  # Ajwaf (hollow), waw
            elif verb_tuple['future_type'] in (araby.KASRA, u"كسرة"):
                verb_class = "W2Y"  # Ajwaf (hollow), yeh
        elif verb_nm[-1:] in (araby.YEH, araby.ALEF_MAKSURA):
            verb_class = "W3Y"  # Naqis (defective), yeh
        elif verb_nm[-1:] == araby.ALEF:
            verb_class = "W3W"  # Naqis (defective), waw
        elif araby.SHADDA in (verb_tuple['vocalized']):
            verb_class = "Dbl"  # doubled
        else:
            verb_class = "-"

        # the passive tenses don't take an object suffix, except with doubly transitive verbs
        tags = "V." + verb_class + "."
        if verb_tuple['transitive']:
            tags += "T"
        else:
            tags += "I"

        if verb_tuple['double_trans']:
            tags += "D"
        elif verb_tuple['think_trans']:
            tags += "T"
        elif verb_tuple['reflexive_trans']:
            tags += "R"
        # tags pronouns
        else:
            tags += '-'
        return tags
Example #45
    def _old_preprocess(self, text, do_farasa_tokenization):
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]",
                          " [مستخدم] ", text)

        if self.remove_elongation:
            text = self._remove_elongation(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return text.strip()
Example #46
def display_word_seg(xmldoc, keep_tashkeel=False):
    """
    extract all possible segmentations
    return a dict of lists, the key is the word
    """
    word_dict = {}
    # get the list of <w> (word) elements
    words = xmldoc.getElementsByTagName('w')
    cpt = 0
    # display a word
    for word  in words:
        # every word contains choices
        word_value = word.getAttribute("rend")
        word_dict[word_value] = []
        choices = word.getElementsByTagName('choice')
        for choice in choices:
            #~ print choice.toxml()
            segs = choice.getElementsByTagName('seg')
            for seg in segs:
                #~ print seg.toxml()
                members = choice.getElementsByTagName('m')
                #~ segment={"word":word_value, }
                segment={}
                for mmbr in members:
                    mmbr_type = mmbr.getAttribute('type')
                    try:
                        mmbr_value = mmbr.firstChild.data
                    except AttributeError:  # empty <m> element
                        mmbr_value = ""
                    if keep_tashkeel:
                        segment[mmbr_type] = mmbr_value
                    else:
                        segment[mmbr_type] = ar.strip_tashkeel(mmbr_value)

                word_dict[word_value].append(segment)
        # stripping tashkeel generates duplicate segments
        if not keep_tashkeel:
            word_dict[word_value] = remove_duplicate(word_dict[word_value])
    return word_dict
Example #47
def preprocess(text, do_farasa_tokenization=False):
	text=str(text)
	processing_tweet = araby.strip_tashkeel(text)
	processing_tweet = re.sub(r'\d+\/[ء-ي]+\/\d+\]', '', processing_tweet)
	#processing_tweet = re.sub(r'\d+([,\d]+)?', '[رقم]', processing_tweet)
	processing_tweet = re.sub('ـ', '', processing_tweet)
	processing_tweet = re.sub(regex_url, '[رابط]', processing_tweet)
	processing_tweet = re.sub(regex_email, '[بريد]', processing_tweet)
	processing_tweet = re.sub(regex_mention, '[مستخدم]', processing_tweet)
	processing_tweet = re.sub('…', r'\.', processing_tweet).strip()
	processing_tweet = remove_redundant_punct(processing_tweet)

	#processing_tweet = re.sub(r'\[ رقم \]|\[رقم \]|\[ رقم\]', ' [رقم] ', processing_tweet)
	processing_tweet = re.sub(r'\[ رابط \]|\[ رابط\]|\[رابط \]', ' [رابط] ', processing_tweet)
	processing_tweet = re.sub(r'\[ بريد \]|\[ بريد\]|\[بريد \]', ' [بريد] ', processing_tweet)
	processing_tweet = re.sub(r'\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]', ' [مستخدم] ', processing_tweet)

	processing_tweet = remove_elongation(processing_tweet)
	if do_farasa_tokenization:
		processing_tweet = tokenize_arabic_words_farasa(processing_tweet)
	return processing_tweet.strip()
Example #48
 def add(self, word):
     """
      add a new vocalization given by the user for an unrecognized word
      @return: None
      @rtype: None
     """
     word_nm = araby.strip_tashkeel(word)
     if word_nm not in self.dictio:
         self.dictio[word_nm] = [
             word,
         ]
     else:
         if word not in self.dictio[word_nm]:
             self.dictio[word_nm].append(word)
     try:
         self.cdfile = open(self.filename, "a+")
         text = u"%s\t%s\n" % (word_nm, u':'.join(self.dictio[word_nm]))
         self.cdfile.write(text.encode('utf8'))
         self.cdfile.close()
      except (IOError, OSError):
          print("updating: can't update custom dictionary")
Example #49
 def one_word_tagging(self, word, previous=u"", second_previous=u""):
     """
     Guess word classification, into verb, noun, stopwords.
     return a guessed tag
     @param word: the given word.
     @type word: unicode.
      @return: a tag : 't': tool, 'v': verb,
      'n': noun, 'nv' or 'vn' unidentified.
      @rtype: unicode
     """
     if not word:
         return ""
     else:
         word_nm = araby.strip_tashkeel(word)
         tag = ''
          if word in self.cache:
             tag = self.cache.get(word, '')
         else:
             if self.is_stopword(word):
                 tag = 't'
             else:
                 if self.is_noun(word):
                     tag += 'n'
                 if self.is_verb(word):
                     tag += 'v'
             # add the found tag to Cache.
             self.cache[word] = tag
         # if the tagging gives an ambiguous tag,
         # we can do a contextual analysis.
         # the contextual tag is not saved in the cache,
         # because it can be ambiguous.
         # for example
         # في ضرب : is a noun
         # قد ضرب : is a verb
         if tag in ("", "vn", "nv"):
             tag = self.context_analyse(previous, word) + "2"
             if tag in ("", "1", "vn1", "nv1"):
                 tag = self.context_analyse(
                     u" ".join([second_previous, previous]), word) + "3"
     return tag
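An illustration of the contextual fallback, assuming a tagger instance named tagger and that context_analyse() resolves 'v' after قد and 'n' after في (hypothetical values):

# tagger.one_word_tagging(u"ضرب", previous=u"قد")  # -> 'v2' (verb by context)
# tagger.one_word_tagging(u"ضرب", previous=u"في")  # -> 'n2' (noun by context)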
Example #50
    def prepare_dataset(self):
        data = {}
        counter = 0
        with open(self.file, encoding='utf8') as file:
            for line in file:
                letter_ids = []
                diacritic_ids = []
                word_ids = []
                letters, diacritics = araby.separate(line)
                # drop the trailing newline slot from each sequence
                letters = letters[0:-1]
                words = araby.tokenize(line)[0:-1]
                diacritics = diacritics[0:-1]
                for letter in letters:
                    if (letter == '\n') or (letter == '\u200f'):
                        continue

                    letter_ids.append(self.letter_to_id[letter])

                for index, diacritic in enumerate(diacritics):
                    if letters[index] == " ":
                        diacritic_ids.append(self.diacritic_to_id['space'])
                    else:
                        diacritic_ids.append(self.diacritic_to_id[diacritic])

                for word in words:
                    word_ids.append(
                        self.word_to_id[araby.strip_tashkeel(word)])

                instance = (torch.tensor(letter_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(diacritic_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(word_ids,
                                         dtype=torch.long,
                                         requires_grad=False))
                data[counter] = instance
                counter += 1
        return data
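For reference, araby.separate() splits a vocalized string into aligned letter and diacritic sequences; a small sketch (the exact filler used for unmarked letters depends on the pyarabic version):

import pyarabic.araby as araby

letters, marks = araby.separate(u"كَتَبَ")
# letters -> u"كتب"; marks holds one diacritic slot per letter (three fathas here)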
Example #51
def preprocess(text, do_farasa_tokenization=True, farasa=None):
    """
	Preprocess takes an input text line an applies the same preprocessing used in araBERT 
				pretraining
	Args:
		text (string): inout text string
		farasa (JavaGateway): pass a py4j gateway to the FarasaSegmenter.jar file 
	Example: 
		from py4j.java_gateway import JavaGateway
		gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
		farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa()
		processed_text = preprocess("Some_Text",do_farasa_tokenization=True , farasa=farasa)

	"""
    text = str(text)
    processing_tweet = araby.strip_tashkeel(text)
    processing_tweet = re.sub(r'\d+\/[ء-ي]+\/\d+\]', '', processing_tweet)
    processing_tweet = re.sub('ـ', '', processing_tweet)
    processing_tweet = re.sub('[«»]', ' " ', processing_tweet)
    #replace the [رابط] token with space if you want to clean links
    processing_tweet = re.sub(regex_url_step1, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url_step2, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_email, '[بريد]', processing_tweet)
    processing_tweet = re.sub(regex_mention, '[مستخدم]', processing_tweet)
    processing_tweet = re.sub('…', '.', processing_tweet).strip()
    processing_tweet = remove_redundant_punct(processing_tweet)

    processing_tweet = re.sub(r'\[ رابط \]|\[ رابط\]|\[رابط \]', ' [رابط] ',
                              processing_tweet)
    processing_tweet = re.sub(r'\[ بريد \]|\[ بريد\]|\[بريد \]', ' [بريد] ',
                              processing_tweet)
    processing_tweet = re.sub(r'\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]',
                              ' [مستخدم] ', processing_tweet)

    processing_tweet = remove_elongation(processing_tweet)
    if do_farasa_tokenization and farasa is not None:
        processing_tweet = tokenize_arabic_words_farasa(
            processing_tweet, farasa)
    return processing_tweet.strip()
Example #52
 def verb_stamp(self, word):
     """
     Generate a stamp for a verb.
     The verb stamp differs from the word stamp by hamza normalization;
     it removes all letters which can change form in the word:
     - ALEF,
     - YEH,
     - WAW,
     - ALEF_MAKSURA
     - SHADDA
     @return: stamped word
     """
     word = ar.strip_tashkeel(word)
     # the vowels are stripped in the stamp function
     word = ar.normalize_hamza(word)
     if word.startswith(ar.HAMZA):
         #strip The first hamza
         word = word[1:]
     # strip the last letter if it is doubled
     if word[-1:] == word[-2:-1]:
         word = word[:-1]
     return self.verb_stamp_pat.sub('', word)
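The verb_stamp_pat attribute is not shown in this snippet; a plausible definition matching the docstring (strip ALEF, YEH, WAW, ALEF_MAKSURA and SHADDA) would be:

import re
import pyarabic.araby as ar

verb_stamp_pat = re.compile(u"[%s]" % u"".join(
    [ar.ALEF, ar.YEH, ar.WAW, ar.ALEF_MAKSURA, ar.SHADDA]))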
Example #53
 def generateSuggest(self, word):
     """
     Generate word suggestions.
     @param word: input text.
     @type word: unicode.
     @return: generated suggestion.
     @rtype: list of words.
     """
     wordlist = [word, araby.strip_tashkeel(word)]
     candidates = self.edits1(word)
     for candidate in candidates:
         if True:  #self.accepted(candidate):
             wordlist.append(candidate)
     # common letter error replacement
     for tup in spellcheck_const.TabReplacment:
         sug = word.replace(tup[0], tup[1])
         if sug != word:
             # evaluate generated suggestion
             if self.accepted(sug):
                 wordlist.append(sug)
     wordlist = list(set(wordlist))
     return wordlist
Example #54
    def ARPosTag(self, List):        
        patterns = [
            ('^(الله|لله|ربنا|رب|إله)$','لفظ جلالة'),
            ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$','حرف'),
            ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
            ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
            ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
            ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
            ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
            ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$','كان وأخواتها'),
            ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$','إن وأخواتها'),
            ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
            ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
            ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
            ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
            ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق','صفة'),
            #('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$','صفة مشبهه باسم فاعل'),
            ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$','اسم مفعول'),
            ('^(م[ا-ي]{3})$','اسمي الزمان والمكان'),
            ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$','فعل'),
            ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
            ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
            ('','اسم'),
        ]
        reg = RegexpTagger(patterns)

        tmpList = []
        for k in List:
            tmp = araby.strip_tashkeel(k)
            # default: keep the word as is; strip a known two-letter suffix
            # if one matches (the original loop reset tmp2 on every
            # iteration, so only the last suffix in self.s2 had any effect)
            tmp2 = tmp
            for i in self.s2:
                if tmp.endswith(i):
                    tmp2 = tmp[:-2]
                    break
            tmpList.append(tmp2)
        return reg.tag(tmpList)
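RegexpTagger here is presumably nltk.tag.RegexpTagger, which tries the patterns in order and keeps the first match; a hypothetical call:

# tagger.ARPosTag([u"ذهب", u"إلى", u"المدرسة"])
# -> [(u"ذهب", u"فعل"), (u"إلى", u"حرف جر"), (u"المدرسة", u"اسم")]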
Example #55
File: stem_noun.py  Project: reedy/mishkal
def get_suffix_variants(word, suffix, enclitic, mankous=False):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, encletic = ي. 
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @param mankous: if the noun is mankous ends with Yeh منقوص.
    @type mankous: boolean.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix  #default value
    # if the suffix contains TEH MARBUTA and an enclitic follows
    if suffix.find(araby.TEH_MARBUTA) >= 0 and enclitic_nm:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)

    elif not enclitic_nm and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            newsuffix = u""
        elif mankous:
            # the mankous word was stripped of its YEH just above;
            # its haraka becomes a kasra tanween (KASRATAN)
            newsuffix = araby.KASRATAN
    # generate the suffix without I'rab short mark
    # here we look up with the given suffix, because the new suffix
    # has changed and may not be found in the table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark
Example #56
def get_suffix_variant(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word  = مدرس, suffix = ة, encletic = ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffix.
    @rtype: unicode.
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    # if the suffix contains TEH MARBUTA and an enclitic follows
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
        suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    if enclitic_nm == u"" and word[-1:] in (
            araby.ALEF_MAKSURA, araby.YEH,
            araby.ALEF) and suffix in araby.HARAKAT:
        suffix = u""
    return suffix
Example #57
def validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
    """
    Test if the given word from the dictionary is compatible with the affix tags.
    @param noun_tuple: the input word attributes given from dictionary.
    @type noun_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags:list.
    @param procletic: first level prefix vocalized.
    @type procletic: unicode.        
    @param encletic_nm: second level suffix unvocalized.
    @type encletic_nm: unicode.
    @param suffix_nm: first level suffix unvocalized.
    @type suffix_nm: unicode.
    @return: if the tags are compatible.
    @rtype: Boolean.
    """
    procletic = araby.strip_tashkeel(procletic)
    encletic = encletic_nm
    suffix = suffix_nm

    if u'تنوين' in affix_tags and noun_tuple['word_type'] == "noun_prop":
        return False
    return True
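A sketch of the single rule in action: a proper noun (noun_prop) may not take tanween, so that affix combination is rejected, while everything else falls through to True:

# validate_tags({'word_type': 'noun_prop'}, [u'تنوين'], u'', u'', u'')  # -> False
# validate_tags({'word_type': 'noun'}, [u'تنوين'], u'', u'', u'')       # -> True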
Example #58
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
Example #59
def test():
    options = grabargs()

    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']

    # filename = "samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename, encoding='utf8')
            print("input file:", filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file:", outfilename)
            outfile = open(outfilename, "w")
        except:
            print(" Can't Open the given File ", filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all is well, import the libraries
    import core.adaat
    import pyarabic.araby as araby

    counter = 1
    if not limit:
        limit = 100000000
    if not strip_tashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            # print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()
            # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled

    # vocalizer.disableShowCollocationMark()
    # print "show delimiter", vocalizer.collo.showDelimiter
    # nolimit = True
    nolimit = False
    if not text:
        line = myfile.readline()
    else:
        if len(lines) > 0:
            line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        # display the stats table header
        print(
            "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
        )

    while line and (nolimit or counter <= limit):
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:  # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)

                    # stemmer = tashaphyne.stemming.ArabicLightStemmer()
                    # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    # ~inputlist = []
                    # ~for txt in texts:
                    # ~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        print("lists haven't the same length")
                        print(len(inputlist), len(outputlist))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

            # compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduce_tashkeel(result)
            # print result.encode('utf8')
            counter += 1

            # display stat for every line
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                if lineTotal:
                    print("%0.2f%%\t" %
                          round(lineCorrect * 100.00 / lineTotal, 2)
                          )  # line Fully correct
                    print("%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2)
                          )  # line Strip correct

            # ~ print result.strip('\n').encode('utf8'),
            if text:
                print(result.strip('\n'), end=' ')
            else:
                result_line = result
                print(result_line)
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")

        if progress and not nolimit:
            # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
            sys.stderr.write(
                "\r[%d%%]%d/%d lines    Full %0.2f Strip %0.2f     " % (
                    counter * 100 / limit,
                    counter,
                    limit,
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2)  # Strip Correct
                ))
            # ~sys.stderr.write("treatment of "+line.encode('utf8'))
            sys.stderr.flush()

        # get the next line
        if not text:
            line = myfile.readline()
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None
    else:
        print("Done")
Example #60
    def detect_chunks(self, wordlist):
        """
        Detect named entity words in a text and tag each word.

        Example:
            >>> detect_chunks(u"قال خالد بن رافع  حدثني أحمد بن عنبر عن خاله".split())
            [tag, tag, ...]   # one begin/inter/"0" tag per word

        @param wordlist: wordlist
        @type wordlist: unicode list
        @return: list of tags, one per word (begin tag, inter tag, or "0")
        @rtype: list of unicode

        """
        started = False
        taglist = []
        previous = ""
        wordlist, wordtag_list = self.preprocess(wordlist)
        for i, word_voc in enumerate(wordlist):
            # get previous tag and next
            prev_tag = wordtag_list[i-1] if i>0 else ""
            next_tag = wordtag_list[i+1] if i<len(wordtag_list)-1 else ""
                
            # work on the unvocalized form; word_voc keeps the harakat if any
            word = araby.strip_tashkeel(word_voc)
            if not started:
                # if the word is a start-tag word,
                # i.e. a word which opens a chunk, like a proper noun or an indicator
                # the previous word was untagged, but the current word retags it
                if self.is_middle_tuple_tag(word, previous):
                    taglist.pop()
                    taglist.append(self.begintag)
                    taglist.append(self.intertag)
                    started = True

                # a word that tags itself
                elif self.is_wordtag(word):
                    taglist.append(self.begintag)
                    started = True
                elif self.is_starttag(word):
                    taglist.append(self.begintag)
                    started = True
                else:
                    taglist.append("0")
                    started = False
                
            else: # a chunk is already open
                # the previous word was untagged, but the current word retags it
                if self.is_middle_tuple_tag(word, previous, next_tag):
                    taglist.append(self.intertag)

                # a word that tags itself
                elif self.is_wordtag(word):
                    taglist.append(self.intertag)
                # the word is tagged only when it is preceded by a tagged word
                elif self.is_middle_wordtag(word, next_tag):
                    # if it is at the end of the sentence, do not ...
                    #~ taglist.append(self.intertag+"3")
                    taglist.append(self.intertag)
                else:
                    taglist.append("0")
                    started = False
            previous = word
        wordlist, taglist = self.postprocess(wordlist, taglist)
        return taglist
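A hypothetical run over the docstring sentence, assuming begintag and intertag are u"B" and u"I" (the real values depend on the class configuration):

# chunker.detect_chunks(araby.tokenize(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله"))
# -> [u"0", u"B", u"I", u"I", u"0", u"B", u"I", u"I", u"0", u"0"]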