def detectNumberWords(text):
    """
    Report number phrases whose written vocalization disagrees with the
    vocalization computed by vocalizeNumber.

    Every entry yielded by extractNumberPhrasesWithinContext(text) that has
    at least three items is read as (previous word, number phrase, next
    word).  When the similarity between the phrase and its computed
    vocalization is negative, one tab-separated line is printed holding:
    similarity, phrase, computed vocalization, numeric value,
    "previous phrase next", next word, vocalized unit, unit similarity.

    Nothing is returned; the function only prints.

    @param text: input text
    @type text: unicode
    @return: None
    """
    for context in extractNumberPhrasesWithinContext(text):
        if len(context) < 3:
            continue
        previous = context[0]
        phrase = context[1]
        following = context[2]

        numeric = text2number(phrase)
        tags = getPreviousTag(previous)
        # vocalizeNumber returns a list of words; join it so it can be
        # compared with the phrase and placed in the report line.
        vocalized = u' '.join(
            vocalizeNumber(araby.stripTashkeel(phrase).split(' '), tags))
        # vocalization similarity between the written and computed forms
        sim = araby.vocalizedSimilarity(phrase, vocalized)
        voc_unit = vocalizeUnit(numeric, following)
        sim_unit = araby.vocalizedSimilarity(voc_unit, following)
        if sim < 0:
            line = u'\t'.join([
                str(sim), phrase, vocalized, str(numeric),
                u' '.join([previous, phrase, following]),
                following, voc_unit, str(sim_unit),
            ])
            # Python 2 (str is bytes) needs UTF-8 bytes for a safe print;
            # on Python 3 encoding would print a b'...' repr instead of text.
            if str is bytes:
                line = line.encode('utf8')
            print(line)
def gwords(text):
    '''
    (string) -> int

    Count how many distinct forms from GWORDS_FORMS occur in the text
    (variants, not occurrences).  The text is stripped of tashkeel and
    split on whitespace before the comparison.

    >>> gwords( '' )
    0
    >>> gwords( ' abc ' )
    0
    >>> gwords( TEST_FIXTURES['gwords'][0] + ' ' + TEST_FIXTURES['gwords'][1] )
    2
    >>> gwords( "%s %s %s" % (TEST_FIXTURES['gwords'][0],
    ...     TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    >>> gwords( "%s%s %s" % (TEST_FIXTURES['gwords'][0],
    ...     TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    1
    >>> gwords( "%s%s %s %s" % (TEST_FIXTURES['gwords'][0],
    ...     araby.DAMMA, TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    >>> gwords( "%s%s %s%s %s" % (TEST_FIXTURES['gwords'][0],
    ...     araby.DAMMA, 'abc', TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    '''
    unvocalized = araby.stripTashkeel(text)
    distinct_tokens = set(unvocalized.split())
    matched_forms = distinct_tokens & GWORDS_FORMS
    return len(matched_forms)
def text2number(text):
    """
    Convert an Arabic number phrase into its integer value,
    for example تسعة وعشرون => 29.

    >>> text2number(u"خمسمئة وثلاث وعشرون")
    523

    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    attached_prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    grand_total = 0
    # running value of the current group (units, tens, hundreds)
    group = 0
    for token in araby.stripTashkeel(text).split(u' '):
        # drop one attached prefix letter, then a conjunction if one remains
        if token and token != u'واحد' and token[0] in attached_prefixes:
            token = token[1:]
        if token != u'واحد' and token.startswith(u'و'):
            token = token[1:]
        if token not in NumberWords:
            continue
        value = NumberWords[token]
        if value % 1000 != 0:
            group += NumberWords[token]
            continue
        # a thousand / million word multiplies the group collected so far;
        # a bare multiplier counts once
        if group == 0:
            group = 1
        grand_total += group * value
        group = 0
    # whatever is left in the last group is added as-is
    grand_total += group
    return grand_total
def detectNumberWords(text):
    """
    Report number phrases whose written vocalization disagrees with the
    vocalization computed by vocalizeNumber.

    Every entry yielded by extractNumberPhrasesWithinContext(text) that has
    at least three items is read as (previous word, number phrase, next
    word).  When the similarity between the phrase and its computed
    vocalization is negative, one tab-separated line is printed holding:
    similarity, phrase, computed vocalization, numeric value,
    "previous phrase next", next word, vocalized unit, unit similarity.

    Nothing is returned; the function only prints.

    @param text: input text
    @type text: unicode
    @return: None
    """
    for context in extractNumberPhrasesWithinContext(text):
        if len(context) < 3:
            continue
        previous = context[0]
        phrase = context[1]
        following = context[2]

        numeric = text2number(phrase)
        tags = getPreviousTag(previous)
        # vocalizeNumber returns a list of words; join it so it can be
        # compared with the phrase and placed in the report line.
        vocalized = u' '.join(
            vocalizeNumber(araby.stripTashkeel(phrase).split(' '), tags))
        # vocalization similarity between the written and computed forms
        sim = araby.vocalizedSimilarity(phrase, vocalized)
        voc_unit = vocalizeUnit(numeric, following)
        sim_unit = araby.vocalizedSimilarity(voc_unit, following)
        if sim < 0:
            line = u'\t'.join([
                str(sim), phrase, vocalized, str(numeric),
                u' '.join([previous, phrase, following]),
                following, voc_unit, str(sim_unit),
            ])
            # Python 2 (str is bytes) needs UTF-8 bytes for a safe print;
            # on Python 3 encoding would print a b'...' repr instead of text.
            if str is bytes:
                line = line.encode('utf8')
            # single-argument call form works on both Python 2 and Python 3
            print(line)
def text2number(text):
    """
    Convert an Arabic number phrase into its integer value,
    for example تسعة وعشرون => 29.

    The text is stripped of tashkeel and split on single spaces.  From each
    word one attached prefix letter (و ف ل ب ك) is removed, then a remaining
    leading conjunction و; the word واحد itself is never shortened.  Words
    not found in NumberWords are ignored.

    >>> text2number(u"خمسمئة وثلاث وعشرون")
    523

    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    # overall result
    total = 0
    # partial total of the current group (units, tens, hundreds)
    partial = 0
    for word in araby.stripTashkeel(text).split(u' '):
        if word and word != u'واحد' and word[0] in prefixes:
            word = word[1:]
        if word != u'واحد' and word.startswith(u'و'):
            word = word[1:]
        # `in` replaces dict.has_key(), which no longer exists on Python 3
        if word in NumberWords:
            value = NumberWords[word]
            if value % 1000 == 0:
                # the case of 1000 or 1 million: multiply the group so far;
                # a bare multiplier counts once.
                # NOTE(review): a word mapped to 0 also passes this test and
                # would discard the current partial - confirm that is wanted.
                if partial == 0:
                    partial = 1
                total += partial * value
                # restart the partial total
                partial = 0
            else:
                partial += value
    # add the final partial to the total
    total += partial
    return total
def detectNumberPhrasesPosition(wordlist):
    """
    Locate runs of consecutive number words in a list of words.

    Each word is stripped of tashkeel and looked up in NumberWords.  A
    leading conjunction و is ignored for the lookup (except on the word
    واحد).  The words أحد, إحدى, اثنا, اثني, اثنتي and اثنتا only count as
    number words when the following word is عشر or عشرة.

    Example taken from the original documentation: for the words of
    "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
    the expected positions are (1, 3) and (6, 7).

    @param wordlist: the words of the text, in order
    @type wordlist: unicode list
    @return: inclusive (start, end) index pairs, one per number phrase
    @rtype: list of tuple
    """
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    teen_only = (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', u'اثنتا')
    phrases = []
    # index where the open phrase started, -1 when no phrase is open
    start = -1
    end = -1
    last_index = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        if i < last_index:
            following = araby.stripTashkeel(wordlist[i + 1])
        else:
            following = None
        # lookups use the unvocalized form of the word
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # NOTE(review): the original test was `not startNumber`, which is
        # true only when the open phrase began at index 0 (the idle value is
        # -1).  The old comment ("the first word can have prefixes") suggests
        # `start < 0` was meant, but that would widen detection to any word
        # that becomes a number word once ف/ل/ب/ك is removed, so the existing
        # behaviour is kept - confirm the intent before changing it.
        if (word_nm and start == 0 and word_nm != u'واحد'
                and word_nm[0] in prefixes):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]

        # `in` replaces dict.has_key(), which no longer exists on Python 3
        if key in NumberWords:
            if key not in teen_only or following in (u'عشر', u'عشرة'):
                if start < 0:
                    start = i
                end = i
        elif start >= 0:
            # a non-number word closes the phrase that was open
            phrases.append((start, end))
            start = -1
    # a phrase still open at the end of the list is closed here
    if start >= 0:
        phrases.append((start, end))
    return phrases
def detectNumberPhrasesPosition(wordlist):
    """
    Locate runs of consecutive number words in a list of words.

    Each word is stripped of tashkeel and looked up in NumberWords.  A
    leading conjunction و is ignored for the lookup (except on the word
    واحد).  The words أحد, إحدى, اثنا, اثني, اثنتي and اثنتا only count as
    number words when the following word is عشر or عشرة.

    Example taken from the original documentation: for the words of
    "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
    the expected positions are (1, 3) and (6, 7).

    @param wordlist: the words of the text, in order
    @type wordlist: unicode list
    @return: inclusive (start, end) index pairs, one per number phrase
    @rtype: list of tuple
    """
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    teen_only = (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', u'اثنتا')
    phrases = []
    # index where the open phrase started, -1 when no phrase is open
    start = -1
    end = -1
    last_index = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        if i < last_index:
            following = araby.stripTashkeel(wordlist[i + 1])
        else:
            following = None
        # lookups use the unvocalized form of the word
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # NOTE(review): the original test was `not startNumber`, which is
        # true only when the open phrase began at index 0 (the idle value is
        # -1).  The old comment ("the first word can have prefixes") suggests
        # `start < 0` was meant, but that would widen detection to any word
        # that becomes a number word once ف/ل/ب/ك is removed, so the existing
        # behaviour is kept - confirm the intent before changing it.
        if (word_nm and start == 0 and word_nm != u'واحد'
                and word_nm[0] in prefixes):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]

        # `in` replaces dict.has_key(), which no longer exists on Python 3
        if key in NumberWords:
            if key not in teen_only or following in (u'عشر', u'عشرة'):
                if start < 0:
                    start = i
                end = i
        elif start >= 0:
            # a non-number word closes the phrase that was open
            phrases.append((start, end))
            start = -1
    # a phrase still open at the end of the list is closed here
    if start >= 0:
        phrases.append((start, end))
    return phrases
def getPreviousTag(word):
    """
    Return the grammatical-case tag implied by the given word.

    The word is stripped of tashkeel and checked, in this order, against
    NOUN_NASEB_LIST, JAR_LIST and RAFE3_LIST.

    @param word: given word
    @type word: unicode
    @return: u'منصوب', u'مجرور' or u'مرفوع' for the first list containing
             the word, or an empty string when none does
    @rtype: unicode
    """
    bare = araby.stripTashkeel(word)
    if bare in NOUN_NASEB_LIST:
        return u'منصوب'
    if bare in JAR_LIST:
        return u'مجرور'
    if bare in RAFE3_LIST:
        return u'مرفوع'
    return u''
def vocalizeUnit(numeric, unit):
    """
    Vocalize the unit word that follows a number.

    The unit is returned untouched when its unvocalized form is not accepted
    by isUnit(), or when the number lies between 0 and 2 (that case needs an
    adjective construction and is not implemented).  Otherwise the form is
    picked from UnitWords[unit][...] according to the number modulo 100:
      - 0       -> 'a' (singular genitive, as after hundred / thousand /
                   million / billion, e.g. ألف رجل)
      - 1..10   -> 'p' (plural)
      - 11..99  -> 'n' (singular accusative)

    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit; the unit itself if it is not a unit word;
             u'Error' followed by the rule name when the selected form is
             empty in UnitWords.
    @rtype: unicode
    """
    unit_nm = araby.stripTashkeel(unit)
    # the given word is not a unit
    if not isUnit(unit_nm):
        return unit
    # numbers 0..2: not handled, keep the unit as written
    if 0 <= numeric <= 2:
        return unit

    # numeric % 1000 == 0 implies numeric % 100 == 0, and for integers the
    # remainder is always below 100, so three cases cover every input.
    remainder = numeric % 100
    if remainder == 0:
        tags = u'SingleMajrour'
        form = 'a'
    elif remainder <= 10:
        tags = u'Plural'
        form = 'p'
    else:
        tags = u'SingleMansoub'
        form = 'n'

    vocalizedUnit = UnitWords[unit_nm][form]
    if not vocalizedUnit:
        # the rule name is appended for every rule; the original dropped it
        # for the SingleMajrour case because of a `tag` / `tags` typo
        return u'Error' + tags
    return vocalizedUnit
def normalize_text(text):
    '''
    Return the normalized form of the text.

    The steps are applied in this order:
        * strip diacritics (tashkeel)
        * strip tatweel
        * normalize lam-alef
        * normalize hamza
        * normalize spelling errors

    >>> normalize_text('')
    u''
    '''
    pipeline = (
        araby.stripTashkeel,
        araby.stripTatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result
def vocalizeNumber(wordlist, synTags=""):
    """
    Vocalize the words of a number phrase.

    Case tags (مرفوع nominative, منصوب accusative, مجرور genitive,
    مجهول unknown) are taken from synTags and extended with what the words
    themselves reveal; they select the form read from VocalizedNumberWords.
    Words that are not in VocalizedNumberWords are returned with only their
    attached prefix vocalized.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized words, one per input word
    @rtype: unicode list
    """
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    # tags can be passed in for this number phrase and are extended below
    tags = synTags

    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        prefix = u""
        # NOTE(review): the original also had a branch for the prefixes
        # ف/ل/ب/ك here, guarded by `not wordlist`, which can never be true
        # for a one-word list; it was dead code and is left disabled.
        # Confirm whether single prefixed words should be vocalized.
        if word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
            # keep the conjunction in the output, vocalized the same way as
            # in the multi-word path (the original silently dropped it)
            prefix = u'و' + u'َ'
        # some words are left alone because they are ambiguous with
        # fraction names, e.g. "a fifth" and "five" are written alike
        ambiguous = (u'عشر', u'خمس', u'سبع', u'تسع', u'خمسا', u'سبعا',
                     u'تسعا', u'عشرا', u'ألفين', u'عشرة', u'صفر', u'ألف')
        if key in NumberWords and key not in ambiguous:
            return [prefix + VocalizedNumberWords[key]['i']]
        return [word]

    # first pass: collect case tags from prefixes and word endings
    for i, word in enumerate(wordlist):
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # only the first word can carry a preposition prefix
        if (i == 0 and word_nm and word_nm != u'واحد'
                and word[0] in prefixes):
            if word_nm[0] in (u'ل', u'ب', u'ك'):
                tags += u"مجرور"
            key = word[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in NumberWords:
            if word_nm.endswith(u'ين'):
                # either genitive or accusative
                tags += u"مجهول"
            elif word_nm.endswith(u'ان') or word_nm.endswith(u'ون'):
                tags += u"مرفوع"

    # second pass: add tashkeel
    newlist = []
    previousKey = u''
    last_index = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        following = wordlist[i + 1] if i < last_index else u""
        key = word
        if word and word != u'واحد' and word[0] in prefixes:
            key = word[1:]
            prefix = word[0]
            if prefix in (u'و', u'ف', u'ك'):
                prefix += u'َ'
            elif prefix in (u'ل', u'ب'):
                prefix += u'ِ'
        else:
            prefix = ''

        if key in VocalizedNumberWords:
            entry = VocalizedNumberWords[key]
            # the "...2" forms are used when the next word starts with و
            linked = following.startswith(u'و')
            if entry['s'] == "*":
                voc = prefix + entry['i']
            # fixed accusative ending inside a compound number (11..19)
            elif following == u'عشر' or following == u'عشرة':
                voc = prefix + entry['n']
            elif key == u'عشر' and previousKey in NumberTenMasculinUnits:
                voc = u'عَشَرَ'
            elif key == u'عشرة' and previousKey in NumberTenFemininUnits:
                voc = u'عَشْرَةَ'
            elif u'مرفوع' in tags:
                voc = prefix + entry['r2' if linked else 'r']
            elif u'مجهول' in tags:
                voc = prefix + entry['i']
            elif u'مجرور' in tags:
                voc = prefix + entry['j2' if linked else 'j']
            elif u'منصوب' in tags:
                voc = prefix + entry['n2' if linked else 'n']
            else:
                voc = prefix + entry['i']
            newlist.append(voc)
        else:
            newlist.append(prefix + key)
        previousKey = key
    return newlist
def vocalizeNumber(wordlist, synTags=""):
    """
    Vocalize the words of a number phrase.

    Case tags (مرفوع nominative, منصوب accusative, مجرور genitive,
    مجهول unknown) are taken from synTags and extended with what the words
    themselves reveal; they select the form read from VocalizedNumberWords.
    Words that are not in VocalizedNumberWords are returned with only their
    attached prefix vocalized.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized words, one per input word
    @rtype: unicode list
    """
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    # tags can be passed in for this number phrase and are extended below
    tags = synTags

    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        prefix = u""
        # NOTE(review): the original also had a branch for the prefixes
        # ف/ل/ب/ك here, guarded by `not wordlist`, which can never be true
        # for a one-word list; it was dead code and is left disabled.
        # Confirm whether single prefixed words should be vocalized.
        if word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
            # keep the conjunction in the output, vocalized the same way as
            # in the multi-word path (the original silently dropped it)
            prefix = u'و' + u'َ'
        # some words are left alone because they are ambiguous with
        # fraction names, e.g. "a fifth" and "five" are written alike
        ambiguous = (u'عشر', u'خمس', u'سبع', u'تسع', u'خمسا', u'سبعا',
                     u'تسعا', u'عشرا', u'ألفين', u'عشرة', u'صفر', u'ألف')
        # `in` replaces dict.has_key(), which no longer exists on Python 3
        if key in NumberWords and key not in ambiguous:
            return [prefix + VocalizedNumberWords[key]['i']]
        return [word]

    # first pass: collect case tags from prefixes and word endings
    for i, word in enumerate(wordlist):
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # only the first word can carry a preposition prefix
        if (i == 0 and word_nm and word_nm != u'واحد'
                and word[0] in prefixes):
            if word_nm[0] in (u'ل', u'ب', u'ك'):
                tags += u"مجرور"
            key = word[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in NumberWords:
            if word_nm.endswith(u'ين'):
                # either genitive or accusative
                tags += u"مجهول"
            elif word_nm.endswith(u'ان') or word_nm.endswith(u'ون'):
                tags += u"مرفوع"

    # second pass: add tashkeel
    newlist = []
    previousKey = u''
    last_index = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        following = wordlist[i + 1] if i < last_index else u""
        key = word
        if word and word != u'واحد' and word[0] in prefixes:
            key = word[1:]
            prefix = word[0]
            if prefix in (u'و', u'ف', u'ك'):
                prefix += u'َ'
            elif prefix in (u'ل', u'ب'):
                prefix += u'ِ'
        else:
            prefix = ''

        if key in VocalizedNumberWords:
            entry = VocalizedNumberWords[key]
            # the "...2" forms are used when the next word starts with و
            linked = following.startswith(u'و')
            if entry['s'] == "*":
                voc = prefix + entry['i']
            # fixed accusative ending inside a compound number (11..19)
            elif following == u'عشر' or following == u'عشرة':
                voc = prefix + entry['n']
            elif key == u'عشر' and previousKey in NumberTenMasculinUnits:
                voc = u'عَشَرَ'
            elif key == u'عشرة' and previousKey in NumberTenFemininUnits:
                voc = u'عَشْرَةَ'
            elif u'مرفوع' in tags:
                voc = prefix + entry['r2' if linked else 'r']
            elif u'مجهول' in tags:
                voc = prefix + entry['i']
            elif u'مجرور' in tags:
                voc = prefix + entry['j2' if linked else 'j']
            elif u'منصوب' in tags:
                voc = prefix + entry['n2' if linked else 'n']
            else:
                voc = prefix + entry['i']
            newlist.append(voc)
        else:
            newlist.append(prefix + key)
        previousKey = key
    return newlist