def __init__(self, result_dict=None, order=-1):
    """
    Build a word-case object, optionally copying the attributes of an
    analysis result object.

    @param result_dict: analysis result whose attributes are copied.
    @param order: position of the word in the clause (-1 = unset).
    """
    # ToDo
    # copy the super class attributes to current class
    #stemmedword.stemmedWord.__init__(self, result_dict.get_dict())
    if result_dict:
        self.__dict__ = result_dict.__dict__.copy()
        self.unvocalized = araby.strip_tashkeel(self.vocalized)
        self.unvoriginal = araby.strip_tashkeel(self.original)
    self.tag_verbal_factor = 0
    self.tag_nominal_factor = 0
    self.tag_kana_rafe3 = False
    # NOTE(review): grouping of the following branches is reconstructed
    # from a whitespace-mangled source -- confirm against upstream.
    if self.is_verb():
        self.tag_kana_rafe3 = self._is_kana_rafe3()
    if self.is_stopword():
        self.tag_kana_rafe3 = self._is_kana_rafe3()
        self.tag_nominal_factor = self.__get_nominal_factor()
        # verbal factor
        self.tag_verbal_factor = self.__get_verbal_factor()
    self.tag_addition = self._is_addition()
    self.tag_break = self._is_break()
    self.forced_word_case = False
    self.syntax = u""    # used for syntactic analysis purposes
    self.semantic = u""  # used for semantic analysis purposes
    self.forced_wordtype = False
    self.order = order
    self.next = {}
    self.previous = {}
    self.sem_next = {}
    self.sem_previous = {}
    self.score = 0
    self.rule = 0  # rule used to select the current case in vocalization
def check_normalized(word_vocalised, resulted_data):
    """
    Filter dictionary results to those whose unvocalized form matches
    the input word, to treat normalized cases: e.g. for ذئب the
    normalized form is ذءب, which may yield both ذئب and ذؤب from the
    dictionary; only ذئب is kept.

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: filtered list of analyzed words with tags.
    @rtype: list.
    """
    inputword = araby.strip_tashkeel(word_vocalised)
    return [item for item in resulted_data
            if 'vocalized' in item.__dict__
            and araby.strip_tashkeel(item.__dict__['vocalized']) == inputword]
def get_word_variant(word, suffix, encletic):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي => the word becomes مدرست.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: encletic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    encletic_nm = araby.strip_tashkeel(encletic)
    long_suffix_nm = suffix_nm + encletic_nm
    # drop a trailing haraka if present
    word_stem = araby.strip_lastharaka(word_stem)
    if word_stem.endswith(araby.TEH_MARBUTA):
        # feminine teh marbuta is dropped before suffixes like ات and ية
        if suffix_nm in (araby.ALEF + araby.TEH,
                         araby.YEH + araby.TEH_MARBUTA,
                         araby.YEH,
                         araby.YEH + araby.ALEF + araby.TEH):
            word_stem = word_stem[:-1]
        # otherwise it opens into teh before any attachment:
        # مدرسة + ين = مدرستين
        elif long_suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.TEH
    elif word_stem.endswith(araby.ALEF_MAKSURA):
        # defective noun in alef maksura:
        # a lettered suffix turns it into yeh (مستوى + ان = مستويان)
        if suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.YEH
        # a harakat-only suffix plus an enclitic pronoun turns it into alef
        elif encletic_nm != u"":
            word_stem = word_stem[:-1] + araby.ALEF
    elif word_stem.endswith(araby.KASRA + araby.YEH):
        # defective noun ending in yeh preceded by kasra:
        # drop the ending when there is neither enclitic nor suffix letters
        if not encletic_nm and not suffix_nm:
            word_stem = word_stem[:-2]
    # adjust a final hamza seat according to the suffix's first haraka
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    elif (word_stem.endswith(araby.YEH + araby.HAMZA)
          or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
            and suffix.startswith(araby.FATHATAN):
        word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a wordlist and return the (start, end)
    position of every number phrase.

    Example:
        >>> txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
        >>> wordlist = araby.tokenize(txt)
        >>> detect_number_phrases_position(wordlist)
        [(1, 3), (6, 7)]

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    """
    phrases = []
    startnumber = -1
    endnumber = False
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i + 1])
        else:
            nextword = None
        # keep the original word with its possible harakat
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # The first word of a phrase may carry a one-letter prefix.
        # BUGFIX: the old test was ``not startnumber`` which is only true
        # when startnumber == 0 (a phrase that started at index 0);
        # "no phrase started yet" is startnumber < 0.
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
           and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي',
                           u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
        else:
            if startnumber >= 0:
                # close the previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # close a trailing number phrase
    if startnumber >= 0:
        phrases.append((startnumber, endnumber))
    return phrases
def is_possible_collocation(self, list2, context="", lenght=2):
    """
    Guess whether the given word list is a possible collocation.
    Used to collect unknown collocations from user input.

    @param list2: list of 2 or more words.
    @type list2: list of unicode.
    @param context: textual context of the words.
    @type context: unicode.
    @param lenght: minimum number of words in the collocation.
    @type lenght: integer.
    @return: rule number of the found collocation (100 by default,
        0 when too short, -1 when tokens are invalid).
    @rtype: integer.
    """
    if len(list2) < lenght:
        return 0
    item1 = araby.strip_tashkeel(list2[0])
    item2 = araby.strip_tashkeel(list2[1])
    # both tokens must look like words
    if not cconst.token_pat.search(item1) or \
            not cconst.token_pat.search(item2):
        return -1
    if item1 in cconst.ADDITIONAL_WORDS:
        return 10
    if item1 in cconst.NAMED_PRIOR:
        return 15
    if item2 not in cconst.SPECIAL_DEFINED:
        if item2.startswith(u'ال') and item1.startswith(u'ال'):
            return 20
        if item1.endswith(u'ة') and item2.startswith(u'ال'):
            return 30
        # words starting with lam of jar+definition are always majrour,
        # so that case needs no rule here
        if item1.endswith(u'ة') and item2.endswith(u'ة'):
            return 40
        if context != u"" and context in cconst.tab_noun_context \
                and item2.startswith(u'ال'):
            return 50
        if item1.endswith(u'ات') and item2.startswith(u'ال'):
            return 60
    return 100
def detect_numbers(wordlist):
    """
    Tag every word of *wordlist* with a BIO-style number tag:
    'DB' opens a number phrase, 'DI' continues one.

    NOTE(review): the docstring example of the original shows 'DO' for
    out-of-phrase words but the code appends "O" -- behavior kept as-is;
    confirm the intended tag against the sibling implementation.

    Example:
        >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        >>> detect_numbers(wordlist)
        ['DO', 'DB', 'DI', 'DI', 'DO', 'DO', 'DB', 'DI', 'DO']

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of BIO tags
    @rtype: list of unicode
    """
    taglist = []
    inside = False
    total = len(wordlist)
    for i, word in enumerate(wordlist):
        nextword = araby.strip_tashkeel(wordlist[i + 1]) if i + 1 < total else None
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # strip a possible one-letter prefix from the first phrase word
        if word_nm and not inside and word_nm != u'واحد' \
           and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        is_number = key in nbconst.NUMBER_WORDS or key.isnumeric()
        if is_number and (key not in (u'أحد', u'إحدى', u'اثنا', u'اثني',
                                      u'اثنتي', u'اثنتا')
                          or nextword in (u'عشر', u'عشرة')):
            taglist.append("DI" if inside else "DB")
            inside = True
        else:
            inside = False
            taglist.append("O")
    return taglist
def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي => the word becomes مدرست.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    # strip a trailing haraka only when a suffix is attached
    if suffix:
        word_stem = araby.strip_lastharaka(word_stem)
    if word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        # alef maksura turns into yeh before a lettered suffix
        word_stem = word_stem[:-1] + araby.YEH
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        # adapt the hamza seat to the suffix's first haraka
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
def search_arabic(self, q, fetch_subgraph=True, limit=DEFAULT_LIMIT,
                  fetchplan=DEFAULT_FETCHPLAN):
    """
    Search for the given label, handling vocalization intelligently.
    (Without a fetchplan this does not make much sense, as only index
    nodes are returned.)
    """
    # Unvocalized query: search the unvocalized index directly and
    # possibly return the subtree.
    if not araby.is_vocalized(q):
        return self.search_index(q, fetch_subgraph,
                                 "ArabicNode.unvocalized_label",
                                 limit, fetchplan)
    # Vocalized query: search the unvocalized index, then keep only
    # matches whose vocalization is compatible with the query.
    matches = self.search_index(araby.strip_tashkeel(q), False,
                                "ArabicNode.unvocalized_label", limit)
    rids = [n.rid for n in matches.primary_results
            if Tools.is_vocalized_like(q, n.data["label"])]
    # No compatible vocalization: ignore vocalization altogether.
    if not rids:
        rids = [n.rid for n in matches.primary_results]
    return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)
def wordtag(text):
    """
    Tag every word of *text* as noun ('n'), verb ('v'), stopword ('t')
    or a combination, falling back to context analysis for ambiguous or
    unknown words.

    @param text: input text.
    @return: list of {'word': ..., 'tag': ...} dicts.
    @rtype: list of dict.
    """
    import naftawayh.wordtag
    tagger = naftawayh.wordtag.WordTagger()
    word_list = token_text(text)
    if not word_list:
        return []
    results = []
    second_previous = ""
    previous = u""
    for word in word_list:
        word_nm = araby.strip_tashkeel(word)
        if tagger.is_stopword(word):
            tag = 't'
        else:
            tag = ''
            if tagger.is_noun(word):
                tag += 'n'
            if tagger.is_verb(word):
                tag += 'v'
            # ambiguous or unknown: use the previous word as context ...
            if tag in ("", "nv"):
                tag = tagger.context_analyse(previous, word) + "1"
                # ... and if still ambiguous, the two previous words
                if tag in ("", "nv1", "vn1"):
                    tag = tagger.context_analyse(
                        u" ".join([second_previous, previous]), word) + "2"
        results.append({'word': word, 'tag': tag})
        second_previous = previous
        previous = word_nm
    return results
def detect_number_words(text):
    """
    Detect number phrases in a text and print diagnostics for phrases
    whose current vocalization disagrees with the computed one.

    @param text: input text
    @type text: unicode

    >>> text2number(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
    خمسمئة وثلاثة وعشرين
    """
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) < 3:
            continue
        previous = ph_con[0]
        phrase = ph_con[1]
        nextword = ph_con[2]
        numberedwords = phrase
        numeric = text2number(numberedwords)
        tags = get_previous_tag(previous)
        vocalized = vocalize_number(
            araby.strip_tashkeel(numberedwords).split(' '), tags)
        # vocalization similarity between the original and computed forms
        sim = araby.vocalized_similarity(numberedwords, vocalized)
        voc_unit = vocalize_unit(numeric, nextword)
        sim_unit = araby.vocalized_similarity(voc_unit, nextword)
        if sim < 0:
            print(u'\t'.join([str(sim), numberedwords, vocalized,
                              str(numeric),
                              u' '.join([previous, phrase, nextword]),
                              nextword, voc_unit,
                              str(sim_unit)]).encode('utf8'))
def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي => the word becomes مدرست.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    # drop a trailing haraka if present
    word_stem = araby.strip_lastharaka(word_stem)
    ends_teh_marbuta = word_stem.endswith(araby.TEH_MARBUTA)
    if ends_teh_marbuta and suffix_nm in (
            araby.ALEF + araby.TEH,
            araby.YEH + araby.TEH_MARBUTA,
            araby.YEH,
            araby.YEH + araby.ALEF + araby.TEH):
        # teh marbuta is dropped before suffixes like ات and ية
        word_stem = word_stem[:-1]
    elif ends_teh_marbuta and suffix_nm != u"":
        # otherwise it opens into teh before any lettered suffix
        word_stem = word_stem[:-1] + araby.TEH
    elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1] + araby.YEH
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        # adapt the hamza seat to the suffix's first haraka
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    elif (word_stem.endswith(araby.YEH + araby.HAMZA)
          or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
            and suffix.startswith(araby.FATHATAN):
        word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
def get_suffix_variants(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي =>
    the suffix is converted to teh.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized suffix, vocalized suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix  # default value
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
        # teh marbuta opens into teh before an enclitic
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not enclitic_nm and word[-1:] in (araby.YEH, araby.ALEF) \
            and araby.is_haraka(suffix):
        newsuffix = u""
    # Generate the suffix without the I'rab short mark.
    # Look up with the *given* suffix: the rewritten one may be absent
    # from the table.
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark
def vocalize_named(wordlist, syn_tags=""):
    """
    Vocalize a named-entity clause.

    NOTE(review): this copy of the function only collects the i3rab
    tag of the first word; it builds no output and implicitly returns
    None.  It appears truncated -- compare with the complete version
    elsewhere in the project before relying on it.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    """
    newlist = []
    # tags may be passed in for the whole clause
    tags = syn_tags
    bin_count = 0
    for i in range(len(wordlist)):
        # keep the original word with its possible harakat
        word = wordlist[i]
        word_nm = araby.strip_tashkeel(word)
        # the first word can impose a case (i3rab) on the clause
        if i == 0 and word_nm:
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"
def set_vocalized(self, newvocalized):
    """
    Set the vocalized word and refresh its cached unvocalized form.

    @param newvocalized: the new given vocalized.
    @type newvocalized: unicode string
    """
    self.vocalized = newvocalized
    self.unvocalized = araby.strip_tashkeel(newvocalized)
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a wordlist and return the (start, end)
    position of each number phrase.

    >>> detect_number_phrases_position(araby.tokenize(
    ...     u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"))
    [(1, 3), (6, 7)]

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    """
    phrases = []
    startnumber = -1
    endnumber = False
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i + 1])
        else:
            nextword = None
        # keep the original word with its possible harakat
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # The first word of a phrase may carry a one-letter prefix.
        # BUGFIX: was ``not startnumber`` which is only true when
        # startnumber == 0; "no phrase started yet" is startnumber < 0.
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
           and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        # ``dict.has_key`` was removed in Python 3; use ``in`` instead.
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي',
                           u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
        else:
            if startnumber >= 0:
                # close the previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # close a trailing number phrase
    if startnumber >= 0:
        phrases.append((startnumber, endnumber))
    return phrases
def detect_numbers(wordlist):
    """
    Tag every word with a BIO-style number tag: 'DB' opens a number
    phrase, 'DI' continues it, 'DO' marks a word outside any phrase.

    >>> detect_numbers(araby.tokenize(
    ...     u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"))
    ['DO', 'DB', 'DI', 'DI', 'DO', 'DO', 'DB', 'DI', 'DO']

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of BIO tags
    @rtype: list of unicode
    """
    starts = False
    taglist = []
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i + 1])
        else:
            nextword = None
        # keep the original word with its possible harakat
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word of a phrase may carry a one-letter prefix
        if word_nm and not starts and word_nm != u'واحد' \
           and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        # ``dict.has_key`` was removed in Python 3; use ``in`` instead.
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي',
                           u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if not starts:
                    taglist.append("DB")
                    starts = True
                else:
                    taglist.append("DI")
            else:
                starts = False
                taglist.append("DO")
        else:
            starts = False
            taglist.append("DO")
    return taglist
def is_stopword(self, word):
    """
    Return True if the word is a stopword, according to a predefined list.

    @param word: the word to test.
    @type word: unicode.
    @return: is the word a stop word
    @rtype: Boolean
    """
    # ``dict.has_key`` was removed in Python 3; use ``in`` instead.
    return word in stopwords.STOPWORDS \
        or araby.strip_tashkeel(word) in stopwords.STOPWORDS
def lookup(self, word):
    """
    Look up a user-provided vocalization for an unrecognized word.

    @return: vocalized word suggestions (the bare stripped word when
        no suggestion is known).
    @rtype: list of unicode
    """
    stripped = araby.strip_tashkeel(word)
    if stripped in self.dictio:
        return self.dictio[stripped]
    return [stripped, ]
def add_entry(self, entry):
    """
    Add a lexicon entry to the lexicon and update the
    entries_by_surface_form lookup table.
    """
    self.entries.append(entry)
    self.roots.add(entry.root)
    for form in entry.get_surface_forms():
        # index both the vocalized and the non-vocalized forms
        self.entries_by_surface_form[form].append(entry)
        self.entries_by_surface_form[araby.strip_tashkeel(form)].append(entry)
def add(self, word, suggestList):
    """
    Register *suggestList* as the suggestions for *word*, indexed by
    its unvocalized form.

    @param word: the word to index.
    @type word: unicode.
    @param suggestList: non-empty list of suggestions.
    @type suggestList: list.
    """
    # isinstance() replaces the old ``type(x).__name__ == 'list'`` test
    # and also accepts list subclasses; a non-empty list is simply truthy.
    if word != u"" and isinstance(suggestList, list) and suggestList:
        # ToDo: merge different suggestions into one list.
        # NB: skipped because it is costly for frequent words:
        # if word in self.dict:
        #     suggestList += self.dict[word]
        #     suggestList = set(suggestList)
        self.dict[araby.strip_tashkeel(word)] = suggestList
def validate_tags(stop_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
    """
    Test whether the given dictionary stop word is compatible with the
    affix tags.

    @param stop_tuple: the input word attributes given from dictionary.
    @type stop_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags: list.
    @param procletic: first level prefix vocalized (unused, kept for
        interface compatibility).
    @type procletic: unicode.
    @param encletic_nm: second level suffix non-vocalized (unused, kept
        for interface compatibility).
    @type encletic_nm: unicode.
    @param suffix_nm: first level suffix non-vocalized (unused, kept for
        interface compatibility).
    @type suffix_nm: unicode.
    @return: True if the tags are compatible.
    @rtype: Boolean.
    """
    # definite article requires a definable, not already defined word
    if u"تعريف" in affix_tags and not stop_tuple['definition']:
        return False
    if u"تعريف" in affix_tags and stop_tuple['defined']:
        return False
    # preposition (the exact-duplicate check of the original was removed)
    if u':جر:' in affix_tags and not stop_tuple['preposition']:
        return False
    if u"متحرك" in affix_tags and not stop_tuple['is_inflected']:
        return False
    # attached pronoun requires a word that accepts pronouns and is
    # not defined
    if u"مضاف" in affix_tags and not stop_tuple['pronoun']:
        return False
    if u"مضاف" in affix_tags and stop_tuple['defined']:
        return False
    if u"استفهام" in affix_tags and not stop_tuple['interrog']:
        return False
    if u"قسم" in affix_tags and not stop_tuple['qasam']:
        return False
    if u"تنوين" in affix_tags and not stop_tuple['tanwin']:
        return False
    return True
def search(self, citation_form):
    """
    Search for the given citation form; if not found, retry without
    diacritics.

    @return: list of matching entries (empty when nothing matches).
    """
    if citation_form not in self.entries_by_surface_form:
        citation_form = araby.strip_tashkeel(citation_form)
    try:
        return self.entries_by_surface_form[citation_form]
    # BUGFIX: a plain dict lookup raises KeyError, which the original
    # ``except IndexError`` never caught; catch both to stay safe with
    # list-like or dict-like containers.
    except (KeyError, IndexError):
        return []
def main():
    """
    Build factor_table from the tag tables and dump it as UTF-8
    Python-source lines.  (Python 2 script: uses the print statement.)
    """
    for table in tags.keys():
        # NOTE(review): eval() on the table name is kept as-is; the
        # names come from the local ``tags`` mapping, not user input.
        for word in eval(table):
            word_nm = araby.strip_tashkeel(word)
            entry = factor_table.get(word_nm)
            if entry is None:
                factor_table[word_nm] = {word: [tags.get(table, ""), ], }
            elif word not in entry:
                entry[word] = [tags.get(table, ""), ]
            else:
                entry[word].append(tags.get(table, ""))
    for item in factor_table:
        print (u"u'%s':" % item).encode("utf8"), utf8repr.repr(factor_table[item]).encode('utf8'), ','
def vocalize_named(wordlist, syn_tags=""):
    """
    Vocalize a named-entity clause (e.g. فلان بن فلان).

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist = []
    # tags may be passed in for the whole clause
    tags = syn_tags
    bin_count = 0
    for i, word in enumerate(wordlist):
        # keep the original word with its possible harakat
        word_nm = araby.strip_tashkeel(word)
        # the first word can impose a case (i3rab) on the clause
        if i == 0 and word_nm:
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"
        # select the vocalization of بن
        if word_nm == u'بن':
            bin_count += 1
            if bin_count == 1:
                # the first بن follows the clause case
                if u'مجرور' in tags:
                    voc = u'بْنِ'
                elif u'مرفوع' in tags:
                    voc = u'بْنُ'
                elif u'منصوب' in tags:
                    voc = u'بْنَ'
                else:
                    voc = u'بْن'
            else:
                # subsequent بن are always majrour
                voc = u'بْنِ'
        else:
            # ToDo: vocalize proper names
            voc = word
        newlist.append(voc)
    return newlist
def get_unvocalized(self, ):
    """
    Return the unvocalized form of the word, computing and caching it
    from the vocalized form on demand.

    @return: the unvocalized form (empty string when no vocalized
        form is available).
    @rtype: unicode string
    """
    if self.unvocalized:
        return self.unvocalized
    if not self.vocalized:
        return u""
    self.unvocalized = araby.strip_tashkeel(self.vocalized)
    return self.unvocalized
def get_unvoriginal(self, ):
    """
    Return the unvocalized original form of the word, computing and
    caching it from the original form on demand.

    @return: the unvocalized original form (empty string when no
        original form is available).
    @rtype: unicode string
    """
    if self.unvoriginal:
        return self.unvoriginal
    if not self.original:
        return u""
    self.unvoriginal = araby.strip_tashkeel(self.original)
    return self.unvoriginal
def get_stem_variants(stem, prefix, suffix):
    """
    Generate the noun stem variants according to the affixes.
    For example مدرستي => مدرست+ي => مدرسة+ي.
    Returns the set of possible cases.

    @param stem: the input stem.
    @type stem: unicode.
    @param prefix: prefix.
    @type prefix: unicode.
    @param suffix: suffix.
    @type suffix: unicode.
    @return: stem variants.
    @rtype: set of unicode.
    """
    prefix = araby.strip_tashkeel(prefix)
    suffix = araby.strip_tashkeel(suffix)
    # the first variant is always the unchanged stem
    variants = set([stem])
    # teh suffixes may hide an original teh marbuta
    if suffix in (araby.ALEF + araby.TEH,
                  araby.YEH + araby.TEH_MARBUTA,
                  araby.YEH,
                  araby.YEH + araby.ALEF + araby.TEH):
        variants.add(stem + araby.TEH_MARBUTA)
    # a dual/plural suffix (or none) may hide a final yeh
    if suffix == "" or suffix == araby.YEH + araby.NOON or \
            suffix == araby.WAW + araby.NOON:
        variants.add(stem + araby.YEH)
    # a final yeh may stand for an original alef maksura
    if stem.endswith(araby.YEH):
        variants.add(stem[:-1] + araby.ALEF_MAKSURA)
    # to be validated
    return variants
def get_future_type(word):
    """
    Guess the future (imperfect) middle haraka of a triliteral verb by
    looking it up in the tri-verb table.

    @param word: vocalized verb.
    @type word: unicode.
    @return: the future-form haraka (FATHA by default).
    @rtype: unicode
    """
    word_nm = araby.strip_tashkeel(word)
    if len(word_nm) != 3:
        return araby.FATHA
    candidate = word
    # hollow verb: middle alef gets surrounding fathas
    if word_nm[1] == araby.ALEF:
        candidate = word_nm[0] + araby.FATHA + araby.ALEF \
            + word_nm[2] + araby.FATHA
    elif word_nm.startswith(araby.ALEF_MAKSURA):
        candidate = word_nm[0] + araby.FATHA + word_nm[1] \
            + araby.FATHA + word_nm[2]
    # table keys are the vocalized form plus a variant digit 1..6
    for variant in (1, 2, 3, 4, 5, 6):
        keyed = candidate + str(variant)
        if keyed in triverbtable.TriVerbTable:
            return triverbtable.TriVerbTable[keyed]['haraka']
    return araby.FATHA
def lookup(s):
    """
    Look up a unicode string *s* in the Langenscheidt online dictionary.

    @return: list of unicode tuples (lemma, transcription, translations).
    """
    # the site cannot handle tashkeel, so query the bare skeleton
    query = araby.strip_tashkeel(s).encode("utf-8")
    url = "http://de.langenscheidt.com/arabisch-deutsch/%s" % query
    doc = urllib.urlopen(url).read()
    root = soupparser.fromstring(doc)
    blocks = get_children_by_class(root, RESULT_BLOCK_CLASS)
    parsed = [handle_result_block(block) for block in blocks]
    # drop empty results
    return [entry for entry in parsed if entry]
def create_arabic_node(self, cluster_name, label, **kwargs):
    """
    Create a node after checking that *label* is an Arabic string.
    Tatweel is removed, ligatures are normalized and the small alef is
    dropped; an ``unvocalized_label`` attribute is added when absent.

    @raise RuntimeError: when the cleaned label is not Arabic.
    """
    label = araby.normalize_ligature(araby.strip_tatweel(label))
    label = label.replace(araby.SMALL_ALEF, "")
    if not araby.is_arabicstring(label):
        raise RuntimeError("'%s' is not an Arabic string" % label)
    if "unvocalized_label" not in kwargs:
        kwargs["unvocalized_label"] = araby.strip_tashkeel(label)
    return self.create_node(cluster_name, label, **kwargs)
def get_previous_tag(word):
    """
    Return the i3rab tag implied by *word* for the word that follows it.

    @param word: given word
    @type word: unicode
    @return: word tag (منصوب / مجرور / مرفوع or empty string)
    @rtype: unicode
    """
    word = araby.strip_tashkeel(word)
    if word in named_const.NOUN_NASEB_LIST:
        return u'منصوب'
    if word in named_const.JAR_LIST:
        return u'مجرور'
    if word in named_const.RAFE3_LIST:
        return u'مرفوع'
    return u''
def vocalize_unit(numeric, unit):
    """
    Vocalize a unit word according to the number that precedes it.

    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit, or the unit itself when it is not a
        known unit word.
    @rtype: unicode
    """
    unit_nm = araby.strip_tashkeel(unit)
    # the given word is not a unit
    if not is_unit(unit_nm):
        return unit
    # numbers 1..2 need an adjective construction after the unit;
    # that case is not handled here
    if numeric >= 0 and numeric <= 2:
        return unit
    if numeric % 100 == 0 or numeric % 1000 == 0:
        # thousand/hundred/million/billion take a singular majrour
        # complement, e.g. ألف رجل
        tags = 'SingleMajrour'
        vocalizedunit = nbconst.UnitWords[unit_nm]['a']
    elif numeric % 100 <= 10:
        # a single-digit remainder takes a plural complement
        tags = "Plural"
        vocalizedunit = nbconst.UnitWords[unit_nm]['p']
    elif numeric % 100 < 100:
        tags = 'SingleMansoub'
        vocalizedunit = nbconst.UnitWords[unit_nm]['n']
    else:
        tags = ''
        vocalizedunit = nbconst.UnitWords[unit_nm]['i']
    if not vocalizedunit:
        return 'Error' + tags
    return vocalizedunit
def clean_str(text):
    """
    Normalize an Arabic string: unify alef/teh-marbuta/yeh forms,
    strip tashkeel, collapse letter repetitions (longation) and
    normalize punctuation and whitespace.
    """
    search = [u"أ", u"إ", u"آ", u"ة", u"_", u"-", u"/", u".", u"،", u" و ",
              u" يا ", u'"', u"ـ", u"'", u"ى", u"\\", u'\n', u'\t', u'"',
              u'?', u'؟', u'!']
    replace = [u"ا", u"ا", u"ا", u"ه", u" ", u" ", u"", u"", u"", u" و",
               u" يا", u' " ', u"", u"", u"ي", u"", u' ', u' ', u' ', u' ',
               u' ', u' ! ']
    text = araby.normalize_ligature(text)
    text = unicodedata.normalize('NFKD', text)
    text = araby.strip_tashkeel(text)
    # collapse repeated characters (longation) down to two
    p_longation = re.compile(r'(.)\1+')
    text = re.sub(p_longation, r"\1\1", text)
    text = text.replace(u'وو', u'و')
    text = text.replace(u'يي', u'ي')
    text = text.replace(u'اا', u'ا')
    for pattern, repl in zip(search, replace):
        text = text.replace(unicodedata.normalize('NFKD', pattern),
                            unicodedata.normalize('NFKD', repl))
    # unify hamza-on-yeh forms, then trim
    text = text.replace(u'ئ', u'ئ')
    text = text.strip()
    return text
def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
    """
    AraBERTv1 preprocessing function.

    Depending on the instance flags, applies tashkeel stripping,
    URL/email/mention replacement, redundant-punctuation removal,
    repetition removal, whitespace insertion and optional Farasa
    tokenization.
    """
    text = str(text)
    if self.strip_tashkeel:
        text = araby.strip_tashkeel(text)
    text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
    text = re.sub("ـ", "", text)
    text = re.sub("[«»]", ' " ', text)
    if self.replace_urls_emails_mentions:
        # replace the [رابط] token with space if you want to clean links
        text = re.sub(REGEX_URL_STEP1, "[رابط]", text)
        text = re.sub(REGEX_URL_STEP2, "[رابط]", text)
        text = re.sub(REGEX_URL, "[رابط]", text)
        text = re.sub(REGEX_EMAIL, "[بريد]", text)
        text = re.sub(REGEX_MENTION, "[مستخدم]", text)
    text = re.sub("…", r"\.", text).strip()
    text = self._remove_redundant_punct(text)
    if self.replace_urls_emails_mentions:
        # normalize spacing inside the placeholder tokens
        text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
        text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
        text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)
    if self.remove_non_digit_repetition:
        text = self._remove_non_digit_repetition(text)
    if self.insert_white_spaces:
        text = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
            r" \1 ",
            text,
        )
    if do_farasa_tokenization:
        text = self._tokenize_arabic_words_farasa(text)
    # collapse any remaining whitespace runs
    return " ".join(text.split())
def generate_affix_list(self, vocalized=True):
    """
    Generate all affixes by conjugating a sample verb and replacing
    its stem with '-'.

    @param vocalized: keep diacritics on the affixes when True.
    @return: list of unique affix patterns.
    """
    seed = u"قصد"
    # generate all possible word forms of the seed verb
    forms = self.generate_forms(seed)
    if vocalized:
        affixes = [form[0] for form in forms]
    else:
        affixes = [araby.strip_tashkeel(form[0]) for form in forms]
    # de-duplicate
    affixes = list(set(affixes))
    # replace the stem by '-'; these variants cover the verb
    # vocalizations produced by conjugation
    for stem in (u'قَصَد', u'قْصَد', u"قصد"):
        affixes = [affix.replace(stem, '-') for affix in affixes]
    return affixes
def is_stopword(self, word):
    """
    Return True if the word (vocalized or not) belongs to the
    predefined stopword list.

    Example:
        >>> import naftawayh.wordtag
        >>> tagger = naftawayh.wordtag.WordTagger()
        >>> tagger.is_stopword(u'أو')
        True

    @param word: the word to test.
    @type word: unicode.
    @return: is the word a stop word
    @rtype: Boolean
    """
    word_nm = araby.strip_tashkeel(word)
    return word in stopwords.STOPWORDS or word_nm in stopwords.STOPWORDS
def preprocess(sentences, stopwords, isStopword=False):
    """
    Normalize and tokenize an iterable of Arabic sentences.

    Each sentence is stripped of tashkeel/harakat/last haraka/tatweel/
    shadda, its ligatures and hamzas are normalized, then it is cleaned
    and tokenized.  Stopwords are removed unless *isStopword* is True.

    @param sentences: iterable of Arabic sentences.
    @param stopwords: stopword collection passed to remove_stopwords.
    @param isStopword: when True, keep stopwords in the output.
    @return: 2D list -- normalized tokens of each sentence.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            text = re.match(r'[^\\n\\s\\p{Latin}]+', text).group()
            tokens = araby.tokenize(text)
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            tokens = [t for t in tokens if t != '\n']
            output.append(tokens)
        except Exception:
            # BUGFIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt.  Best-effort behavior kept:
            # sentences that fail to match/tokenize are skipped.
            pass
    return output
def segment(self, word):
    """
    Build the set of all candidate (left, right) segmentation
    positions of the treated word.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> print ArListem.segment(u'فتضربين')
        set(([(1, 5), (2, 5), (0, 7)])

    @return: set of (left, right) integer pairs.
    @rtype: set of tuple of integer.
    """
    self.word = word
    self.unvocalized = araby.strip_tashkeel(word)
    # rewrite ALEF MADDA as HAMZA + ALEF before affix lookup
    normalized = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                        araby.HAMZA + araby.ALEF, word)
    # candidate prefix end positions / suffix start positions
    prefix_ends = self.lookup_prefixes(normalized)
    suffix_starts = self.lookup_suffixes(normalized)
    self.left = max(prefix_ends) if prefix_ends else -1
    self.right = min(suffix_starts) if suffix_starts else -1
    # the whole word (no affixes) is always a candidate
    self.segment_list = {(0, len(normalized))}
    # keep only stems of at least two letters
    self.segment_list.update(
        (i, j)
        for i in prefix_ends
        for j in suffix_starts
        if j >= i + 2)
    # filter segments against the valid affixes list
    self.left, self.right = self.get_left_right(self.segment_list)
    return self.segment_list
def get_word_variant(word, suffix):
    """
    Return the stem form of *word* adapted to the given suffix.
    For example word=مدرسة with suffix=ي yields مدرست.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    # alef maksura becomes yeh before a suffix (إلى -> إليك);
    # the word سِوَى keeps an alef instead
    if suffix_nm and stem.endswith(araby.ALEF_MAKSURA):
        if stem == u"سِوَى":
            stem = stem[:-1] + araby.ALEF
        else:
            stem = stem[:-1] + araby.YEH + araby.SUKUN
    # the hamza seat follows the first haraka of the suffix
    elif suffix_nm and stem.endswith(araby.HAMZA):
        if suffix.startswith(araby.DAMMA):
            stem = stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            stem = stem[:-1] + araby.YEH_HAMZA
    # a suffix opening with a haraka absorbs the stem's final haraka
    # (not applied to uninflected stop words مبني)
    if suffix and suffix[0] in araby.HARAKAT:
        stem = araby.strip_lastharaka(stem)
    # gemination cases فيّ، إليّ، عنّا، منّا: noon before noon,
    # kasra+yeh before yeh
    if suffix.startswith(araby.NOON) and \
            word.endswith(araby.NOON + araby.SUKUN):
        stem = araby.strip_lastharaka(stem)
    elif suffix.startswith(araby.KASRA + araby.YEH) and \
            word.endswith(araby.YEH + araby.SUKUN):
        stem = araby.strip_lastharaka(stem)
    return stem
def get_noun_attributes(self, word):
    """
    Look up the noun attributes of *word* in the noun dictionary.

    The lookup key is the unvocalized form of the word.  When the
    dictionary returns entries, the first one is converted to a dict
    and returned; otherwise a minimal {"vocalized": word} dict is
    built.

    @param word: input (possibly vocalized) word.
    @return: dict of noun attributes.
    """
    vocalized = word  # NOTE(review): never used afterwards; kept as-is
    word_nm = araby.strip_tashkeel(word)
    foundlist = self.noun_dict.lookup(word_nm)
    word_tuple_res = None
    for word_tuple in foundlist:
        word_tuple = dict(word_tuple)
        # NOTE(review): the original comment said "if found the same
        # vocalization", but no comparison is made — this always takes
        # the FIRST entry and breaks.  A `vocalized == ...` check was
        # probably intended; confirm before changing behaviour.
        word_tuple_res = word_tuple
        break
    else:
        # loop never ran (empty foundlist): fall back
        # (`if foundlist` can only be False here)
        if foundlist:
            word_tuple_res = dict(foundlist[0])
        else:
            word_tuple_res = {"vocalized": word}
    return word_tuple_res
def process(text):
    """
    Normalize an Arabic text: remove tashkeel, collapse repeated
    letters, unify hamza/alef/teh-marbuta/alef-maksura forms and
    squeeze whitespace.

    @param text: input text.
    @return: normalized text.
    """
    text = araby.strip_tashkeel(text)  # delete tashkil
    # ordered substitution rules — order matters: the hamza/teh rules
    # run after the duplicate-collapsing rules to avoid mixing forms
    rules = (
        ('\ـ+', ' '),       # delete tatweel (letter madda) runs
        ('\ر+', 'ر'),       # collapse duplicated ra2
        ('\اا+', 'ا'),      # collapse duplicated alif
        ('\ووو+', 'و'),     # 3+ waw -> single waw
        ('\ههه+', 'ههه'),   # 3+ ha2 -> three
        ('\ةة+', 'ة'),      # collapse duplicated teh marbuta
        ('\ييي+', 'ي'),     # 3+ yeh -> single yeh
        ('أ', 'ا'),         # hamza forms -> bare alef (after dedup)
        ('آ', 'ا'),
        ('إ', 'ا'),
        ('ة', 'ه'),         # after the ةة rule, to avoid mixing ههه
        ('ى', 'ي'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    # delete multispace
    return " ".join(text.split())
def detect_number_words(text):
    """
    Detect number words in a text and print diagnostics for phrases
    whose re-vocalization disagrees with the original.

    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين

    @param text: input text
    @type text: unicode
    @return: None — results are printed, nothing is returned
    """
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        # each context triple is (previous word, number phrase, next word)
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            wordlist = araby.strip_tashkeel(numberedwords).split(' ')
            vocalized = vocalize_number(wordlist, tags)
            #calcul vocalization similarity :
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            # vocalize the unit (e.g. دينارا) according to the number
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, nextword)
            # a negative similarity marks a vocalization mismatch
            if sim < 0:
                #~ print u'\t'.join([str(sim), u' '.join(numberedwords), vocalized,
                #~ str(numeric), u' '.join([previous, phrase, nextword]),
                #~ nextword, voc_unit, str(sim_unit)]).encode('utf8')
                # NOTE(review): print(bytes) shows the bytes repr on
                # Python 3; these look like leftover Python 2 debug prints
                print('\t'.join(
                    [str(sim), ' '.join(numberedwords),
                     ' '.join(vocalized)]).encode('utf8'))
                print(str(numeric),
                      ' '.join([previous, phrase, nextword]).encode('utf8'))
                print('\t'.join([nextword, voc_unit,
                                 str(sim_unit)]).encode('utf8'))
def text2number(text):
    """
    Convert an Arabic number phrase into an integer,
    e.g. تسعة وعشرون => 29.

    Example:
        >>> text2number(u"خمسمئة وثلاث وعشرون")
        523

    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    total = 0    # grand total
    partial = 0  # running total for the current three-digit group
    tokens = araby.strip_tashkeel(text).split(' ')
    for token in tokens:
        # strip one leading conjunction/preposition letter,
        # except on the word واحد itself
        if token and token != 'واحد' and \
                token[0] in ('و', 'ف', 'ل', 'ب', 'ك'):
            token = token[1:]
        # a second leading waw may remain after the first strip
        if token != 'واحد' and token.startswith('و'):
            token = token[1:]
        if token in nbconst.NumberWords:
            value = nbconst.NumberWords[token]
            if value % 1000 == 0:
                # multiplier word (thousand, million, ...): an empty
                # group counts as one, then the group is flushed
                if partial == 0:
                    partial = 1
                total += partial * value
                partial = 0
            else:
                partial += value
    # flush the final group
    return total + partial
def get_verb_info(self, verb_tuple):
    """
    Build a compact tag string describing a verb's morphological class
    and its transitivity, e.g. "V.W2W.TD".

    @param verb_tuple: dictionary row with at least 'vocalized',
        'future_type', 'transitive', 'double_trans', 'think_trans'
        and 'reflexive_trans' keys.
    @return: tag string "V.<class>.<transitivity>".
    """
    stripped = araby.strip_tashkeel(verb_tuple['vocalized'])
    cls = ""
    if stripped.startswith(araby.WAW):
        cls = "W1W"  # Mithal_W (assimilated, initial waw)
    elif stripped[-2:-1] == araby.ALEF:
        # hollow verb: alef before the last char; the future-type
        # haraka decides whether the weak radical is waw or yeh
        future = verb_tuple['future_type']
        if future in (araby.DAMMA, u"ضمة"):
            cls = "W2W"  # Adjwaf_W
        elif future in (araby.KASRA, u"كسرة"):
            cls = "W2Y"  # Adjwaf_Y
        # otherwise cls deliberately stays "" (as before)
    elif stripped[-1:] in (araby.YEH, araby.ALEF_MAKSURA):
        cls = "W3Y"  # Naqis_Y (defective, yeh)
    elif stripped[-1:] == araby.ALEF:
        cls = "W3W"  # Naqis_W (defective, waw)
    elif araby.SHADDA in verb_tuple['vocalized']:
        cls = "Dbl"  # doubled (geminated) root
    else:
        cls = "-"    # sound verb
    # the passive tenses don't take an object suffix,
    # only with double-transitive verbs
    parts = ["V.", cls, "."]
    parts.append("T" if verb_tuple['transitive'] else "I")
    if verb_tuple['double_trans']:
        parts.append("D")
    elif verb_tuple['think_trans']:
        parts.append("T")
    elif verb_tuple['reflexive_trans']:
        parts.append("R")
    else:
        parts.append('-')
    return "".join(parts)
def _old_preprocess(self, text, do_farasa_tokenization):
    """
    AraBERTv1 preprocessing function.

    Pipeline (each step gated by the matching instance flag):
    strip tashkeel, drop "digits/arabic/digits]" fragments and
    tatweel, normalize guillemets to '"', mask URLs/e-mails/mentions
    with placeholder tokens, normalize ellipsis, remove redundant
    punctuation, fix placeholder spacing, remove elongation,
    optionally pad non-alphanumerics with spaces, and optionally run
    Farasa tokenization.

    @param text: input text (coerced to str).
    @param do_farasa_tokenization: run Farasa segmentation at the end.
    @return: preprocessed, stripped text.
    """
    text = str(text)
    if self.strip_tashkeel:
        text = araby.strip_tashkeel(text)
    # drop "digits/arabic-letters/digits]" fragments and tatweel
    text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
    text = re.sub("ـ", "", text)
    # normalize French quotes to a plain double quote
    text = re.sub("[«»]", ' " ', text)
    if self.replace_urls_emails_mentions:
        # replace the [رابط] token with space if you want to clean links
        text = re.sub(regex_url_step1, "[رابط]", text)
        text = re.sub(regex_url_step2, "[رابط]", text)
        text = re.sub(regex_url, "[رابط]", text)
        text = re.sub(regex_email, "[بريد]", text)
        text = re.sub(regex_mention, "[مستخدم]", text)
    # normalize ellipsis to a period
    text = re.sub("…", r"\.", text).strip()
    text = self._remove_redundant_punct(text)
    if self.replace_urls_emails_mentions:
        # re-space the placeholder tokens that lost a bracket-side space
        text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
        text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
        text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ",
                      text)
    if self.remove_elongation:
        text = self._remove_elongation(text)
    if self.insert_white_spaces:
        # pad everything that is not a digit, an Arabic or Latin letter,
        # or a square bracket with spaces
        text = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
            r" \1 ",
            text,
        )
    if do_farasa_tokenization:
        text = self._tokenize_arabic_words_farasa(text)
    return text.strip()
def display_word_seg(xmldoc, keep_tashkeel=False):
    """
    Extract all possible segmentations from an XML document.

    Walks every <w> element, then its <choice>/<seg>/<m> children, and
    collects one dict per segment mapping the morpheme type (the <m>
    element's "type" attribute) to its text value.

    @param xmldoc: parsed XML DOM document containing <w> elements.
    @param keep_tashkeel: when False (default), diacritics are stripped
        from morpheme values and duplicate segments are removed.
    @return: dict mapping each word (its "rend" attribute) to a list
        of segment dicts.
    """
    word_dict = {}
    # get the annuaire list
    words = xmldoc.getElementsByTagName('w')
    #~ print words
    cpt = 0  # NOTE(review): unused counter, kept as-is
    # display a word
    for word in words:
        # every word contains choices
        word_value = word.getAttribute("rend")
        word_dict[word_value] = []
        choices = word.getElementsByTagName('choice')
        for choice in choices:
            #~ print choice.toxml()
            segs = choice.getElementsByTagName('seg')
            for seg in segs:
                #~ print seg.toxml()
                # NOTE(review): this queries the <m> children of the
                # whole <choice>, not of the current <seg> — it looks
                # like seg.getElementsByTagName('m') was intended;
                # confirm against the XML schema before changing.
                members = choice.getElementsByTagName('m')
                #~ segment={"word":word_value, }
                segment={}
                for mmbr in members:
                    mmbr_type = mmbr.getAttribute('type')
                    try:
                        mmbr_value = mmbr.firstChild.data
                    except:
                        # element without a text child: store empty
                        mmbr_value = ""
                    if keep_tashkeel:
                        segment[mmbr_type] = mmbr_value
                    else:
                        segment[mmbr_type] = ar.strip_tashkeel(mmbr_value)
                word_dict[word_value].append(segment)
        # strip tashkeel generate duplicates segments
        if not keep_tashkeel:
            word_dict[word_value] = remove_duplicate(word_dict[word_value])
    #~ print (repr(segment)).decode('unicode-escape');
    return word_dict
def preprocess(text, do_farasa_tokenization=False):
    """
    Apply AraBERT-style preprocessing to one text line: strip
    tashkeel and tatweel, mask URLs/e-mails/mentions with placeholder
    tokens, normalize ellipsis and punctuation, remove elongation and
    optionally run Farasa tokenization.

    @param text: input text (coerced to str).
    @param do_farasa_tokenization: run Farasa segmentation at the end.
    @return: preprocessed, stripped text.
    """
    cleaned = araby.strip_tashkeel(str(text))
    # drop "digits/arabic-letters/digits]" fragments and tatweel
    cleaned = re.sub(r'\d+\/[ء-ي]+\/\d+\]', '', cleaned)
    cleaned = re.sub('ـ', '', cleaned)
    # mask links, e-mails and mentions with placeholder tokens
    for pattern, token in ((regex_url, '[رابط]'),
                           (regex_email, '[بريد]'),
                           (regex_mention, '[مستخدم]')):
        cleaned = re.sub(pattern, token, cleaned)
    # normalize ellipsis to a period
    cleaned = re.sub('…', r'\.', cleaned).strip()
    cleaned = remove_redundant_punct(cleaned)
    # re-space the placeholder tokens that lost a bracket-side space
    for pattern, token in (
            (r'\[ رابط \]|\[ رابط\]|\[رابط \]', ' [رابط] '),
            (r'\[ بريد \]|\[ بريد\]|\[بريد \]', ' [بريد] '),
            (r'\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]', ' [مستخدم] ')):
        cleaned = re.sub(pattern, token, cleaned)
    cleaned = remove_elongation(cleaned)
    if do_farasa_tokenization:
        cleaned = tokenize_arabic_words_farasa(cleaned)
    return cleaned.strip()
def add(self, word):
    """
    Register a user-provided vocalization for an unrecognized word.

    The word is indexed by its unvocalized form in the in-memory
    dictionary, and the updated entry is appended to the custom
    dictionary file.

    @param word: vocalized word given by the user.
    @type word: unicode.
    @return: None
    @rtype: none
    """
    word_nm = araby.strip_tashkeel(word)
    if word_nm not in self.dictio:
        self.dictio[word_nm] = [word, ]
    elif word not in self.dictio[word_nm]:
        self.dictio[word_nm].append(word)
    try:
        # BUGFIX: the file used to be opened in text mode while bytes
        # (text.encode('utf8')) were written — a TypeError on Python 3
        # that the bare except silently swallowed, so the file was
        # never updated.  Open with an explicit encoding, write str,
        # and let a context manager close the file.
        with open(self.filename, "a+", encoding="utf8") as cdfile:
            self.cdfile = cdfile  # attribute kept for compatibility
            cdfile.write(u"%s\t%s\n" %
                         (word_nm, u':'.join(self.dictio[word_nm])))
    except OSError:
        # best-effort behaviour kept: report, don't crash
        # (message typo fixed: "cutom" -> "custom")
        print("updating:can't update custom dictionary")
def one_word_tagging(self, word, previous=u"", second_previous=u""):
    """
    Guess the classification of a single word: verb, noun, stop word.

    Context-free results are cached in self.cache.  When the lookup is
    ambiguous ('', 'vn' or 'nv'), a contextual analysis with the
    previous word(s) is attempted; that result is NOT cached because
    it depends on the context (e.g. "في ضرب" is a noun, "قد ضرب" is a
    verb).

    @param word: the given word.
    @type word: unicode.
    @param previous: preceding word, used for contextual analysis.
    @param second_previous: word before the previous one.
    @return: a tag: 't': tool, 'v': verb, 'n': noun, 'nv'/'vn'
        unidentified — with a '2'/'3' suffix when context was used.
    @rtype: unicode
    """
    if not word:
        return ""
    # (an unused `word_nm = araby.strip_tashkeel(word)` local was removed)
    # BUGFIX: dict.has_key() was removed in Python 3; use `in`.
    if word in self.cache:
        tag = self.cache.get(word, '')
    else:
        tag = ''
        if self.is_stopword(word):
            tag = 't'
        else:
            if self.is_noun(word):
                tag += 'n'
            if self.is_verb(word):
                tag += 'v'
        # add the found tag to the cache
        self.cache[word] = tag
    # ambiguous tag: fall back to contextual analysis (not cached)
    if tag in ("", "vn", "nv"):
        tag = self.context_analyse(previous, word) + "2"
        if tag in ("", "1", "vn1", "nv1"):
            tag = self.context_analyse(
                u" ".join([second_previous, previous]), word) + "3"
    return tag
def prepare_dataset(self):
    """
    Build a diacritization dataset from self.file.

    Each (utf-8) line yields one instance: a triple of 1-D long
    tensors (letter ids, diacritic ids, word ids) obtained through the
    self.letter_to_id / self.diacritic_to_id / self.word_to_id maps.
    Instances are keyed by line index.

    @return: dict {line_index: (letter_tensor, diacritic_tensor,
        word_tensor)}.
    """
    data = {}
    counter = 0
    with open(self.file, encoding='utf8') as file:
        for line in file:
            letter_ids = []
            diacritic_ids = []
            word_ids = []
            # split the line into bare letters and their diacritics
            letters, diacritics = araby.separate(line)
            # drop the trailing element (line terminator) of each view
            letters = letters[0:-1]
            words = araby.tokenize(line)[0:-1]
            diacritics = diacritics[0:-1]
            for letter in letters:
                # skip the newline and the right-to-left mark
                if (letter == '\n') or (letter == '\u200f'):
                    continue
                letter_ids.append(self.letter_to_id[letter])
            for index, diacritic in enumerate(diacritics):
                if letters[index] == " ":
                    # spaces get a dedicated pseudo-diacritic id
                    diacritic_ids.append(self.diacritic_to_id['space'])
                else:
                    diacritic_ids.append(self.diacritic_to_id[diacritic])
            for word in words:
                # words are keyed by their unvocalized form
                word_ids.append(
                    self.word_to_id[araby.strip_tashkeel(word)])
            instance = (torch.tensor(letter_ids,
                                     dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(diacritic_ids,
                                     dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(word_ids,
                                     dtype=torch.long,
                                     requires_grad=False))
            data[counter] = instance
            counter += 1
    return data
def preprocess(text, do_farasa_tokenization=True, farasa=None):
    """
    Apply the same preprocessing used in araBERT pretraining to one
    input text line: strip tashkeel and tatweel, normalize quotes and
    ellipsis, mask URLs/e-mails/mentions with placeholder tokens,
    remove redundant punctuation and elongation, and optionally run
    Farasa tokenization.

    @param text: input text string (coerced to str).
    @param do_farasa_tokenization: run Farasa segmentation at the end
        (requires *farasa*).
    @param farasa: a py4j gateway to the FarasaSegmenter.jar file.

    Example:
        from py4j.java_gateway import JavaGateway
        gateway = JavaGateway.launch_gateway(
            classpath='./FarasaSegmenterJar.jar')
        farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa()
        processed_text = preprocess("Some_Text",
                                    do_farasa_tokenization=True,
                                    farasa=farasa)
    """
    text = str(text)
    processing_tweet = araby.strip_tashkeel(text)
    # drop "digits/arabic-letters/digits]" fragments and tatweel
    processing_tweet = re.sub(r'\d+\/[ء-ي]+\/\d+\]', '', processing_tweet)
    processing_tweet = re.sub('ـ', '', processing_tweet)
    # normalize French quotes to a plain double quote
    processing_tweet = re.sub('[«»]', ' " ', processing_tweet)
    #replace the [رابط] token with space if you want to clean links
    processing_tweet = re.sub(regex_url_step1, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url_step2, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_email, '[بريد]', processing_tweet)
    processing_tweet = re.sub(regex_mention, '[مستخدم]', processing_tweet)
    # normalize ellipsis to a period
    processing_tweet = re.sub('…', r'\.', processing_tweet).strip()
    processing_tweet = remove_redundant_punct(processing_tweet)
    # re-space placeholder tokens that lost a bracket-side space
    processing_tweet = re.sub(r'\[ رابط \]|\[ رابط\]|\[رابط \]',
                              ' [رابط] ', processing_tweet)
    processing_tweet = re.sub(r'\[ بريد \]|\[ بريد\]|\[بريد \]',
                              ' [بريد] ', processing_tweet)
    processing_tweet = re.sub(r'\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]',
                              ' [مستخدم] ', processing_tweet)
    processing_tweet = remove_elongation(processing_tweet)
    if do_farasa_tokenization and farasa is not None:
        processing_tweet = tokenize_arabic_words_farasa(
            processing_tweet, farasa)
    return processing_tweet.strip()
def verb_stamp(self, word):
    """
    Compute a normalized "stamp" for a verb.

    Unlike the generic word stamp, the verb stamp also normalizes the
    hamza, drops a leading hamza and a doubled final letter, and then
    removes every letter that can change form in the word (ALEF, YEH,
    WAW, ALEF_MAKSURA, SHADDA) via self.verb_stamp_pat.

    @return: stamped word
    """
    # vowels are already removed by strip_tashkeel
    stamp = ar.normalize_hamza(ar.strip_tashkeel(word))
    if stamp.startswith(ar.HAMZA):
        # strip the (unstable) first hamza
        stamp = stamp[1:]
    if stamp[-1:] == stamp[-2:-1]:
        # strip the last letter when it is doubled
        stamp = stamp[:-1]
    return self.verb_stamp_pat.sub('', stamp)
def generateSuggest(self, word):
    """
    Generate spelling suggestions for *word*: the word itself, its
    unvocalized form, every edit-distance-1 candidate, and accepted
    variants built from common letter-confusion replacements.

    @param word: input text.
    @type word: unicode.
    @return: deduplicated list of suggestions.
    @rtype: list of words.
    """
    suggestions = [word, araby.strip_tashkeel(word)]
    # all edit-distance-1 candidates are kept unfiltered
    # (an accepted() filter used to exist here but was disabled)
    suggestions.extend(self.edits1(word))
    # common letter-confusion replacements
    for pair in spellcheck_const.TabReplacment:
        candidate = word.replace(pair[0], pair[1])
        # evaluate only candidates that actually changed the word
        if candidate != word and self.accepted(candidate):
            suggestions.append(candidate)
    return list(set(suggestions))
def ARPosTag(self, List):
    """
    Rule-based part-of-speech tagging for a list of Arabic words.

    Each word is stripped of tashkeel and of a known two-letter suffix
    (from self.s2), then tagged by an NLTK RegexpTagger built from the
    hand-written rules below — the first matching pattern wins, and
    the final empty pattern tags anything left as a noun ('اسم').

    @param List: list of Arabic words.
    @return: list of (word, tag) pairs from RegexpTagger.tag().
    """
    # ordered (regex, tag) rules: closed-class word lists first, then
    # morphological-shape heuristics, then the catch-all noun rule
    patterns = [
        ('^(الله|لله|ربنا|رب|إله)$','لفظ جلالة'),
        ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$','حرف'),
        ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
        ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
        ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
        ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
        ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
        ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$','كان وأخواتها'),
        ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$','إن وأخواتها'),
        ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
        ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
        ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
        ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
        ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق','صفة'),
        #('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$','صفة مشبهه باسم فاعل'),
        ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$','اسم مفعول'),
        ('^(م[ا-ي]{3})$','اسمي الزمان والمكان'),
        ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$','فعل'),
        ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
        ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
        ('','اسم'),
    ]
    reg = RegexpTagger(patterns)
    tmpList = []
    for k in List:
        tmp = araby.strip_tashkeel(k)
        tmp2=''
        # NOTE(review): tmp2 is overwritten on EVERY iteration, so only
        # the last suffix in self.s2 decides the final value — an
        # earlier suffix match is discarded unless it happens to be the
        # last one.  A `break` after a match was probably intended;
        # confirm before changing.
        for i in self.s2:
            if tmp.endswith(i):
                a=2
                tmp2=tmp[0:-a]
            else:
                tmp2=tmp
        tmpList.append(tmp2)
    return reg.tag(tmpList)
def get_suffix_variants(word, suffix, enclitic, mankous=False):
    """
    Compute the suffix form to attach to *word*, plus the same form
    without its final i'rab short mark.  For example word=مدرس,
    suffix=ة, enclitic=ي converts the suffix to Teh.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @param mankous: True when the noun is mankous (ends with Yeh) منقوص.
    @type mankous: boolean.
    @return: (vocalized suffix, vocalized suffix without i'rab mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    variant = suffix  # default value
    if enclitic_nm and suffix.find(araby.TEH_MARBUTA) >= 0:
        # Teh Marbuta opens into Teh before an enclitic
        variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not enclitic_nm and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            # no haraka can sit on a final yeh/alef
            variant = u""
        elif mankous:
            # the mankous noun lost its yeh earlier; the i'rab haraka
            # becomes kasratan
            variant = araby.KASRATAN
    # generate the suffix without the i'rab short mark; the lookup uses
    # the GIVEN suffix because the variant may be absent from the table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        bare = araby.strip_lastharaka(variant)
    else:
        bare = variant
    return variant, bare
def get_suffix_variant(word, suffix, enclitic):
    """
    Compute the suffix form to attach to *word*.  For example
    word=مدرس, suffix=ة, enclitic=ي converts the suffix to Teh.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffix.
    @rtype: unicode.
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    # Teh Marbuta opens into Teh before a non-empty enclitic
    if len(enclitic_nm) > 0 and suffix.find(araby.TEH_MARBUTA) >= 0:
        suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    # a lone haraka suffix disappears after a weak final letter
    # when there is no enclitic
    ends_weak = word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
    if enclitic_nm == u"" and ends_weak and suffix in araby.HARAKAT:
        return u""
    return suffix
def validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
    """
    Test whether a dictionary word is compatible with the affix tags.

    The only implemented rule: a proper noun
    (word_type == "noun_prop") cannot take tanween (u'تنوين');
    every other combination is accepted.

    @param noun_tuple: word attributes given from the dictionary.
    @type noun_tuple: dict.
    @param affix_tags: tags given by the affixes.
    @type affix_tags: list.
    @param procletic: first level prefix vocalized (unused, kept for
        interface compatibility).
    @type procletic: unicode.
    @param encletic_nm: first level enclitic (unused, kept for
        interface compatibility).
    @type encletic_nm: unicode.
    @param suffix_nm: first level suffix (unused, kept for interface
        compatibility).
    @type suffix_nm: unicode.
    @return: True when the tags are compatible.
    @rtype: Boolean.
    """
    # CLEANUP: the previous local copies (and the pointless
    # strip_tashkeel call on procletic) were dead code and are removed.
    if u'تنوين' in affix_tags and noun_tuple['word_type'] == "noun_prop":
        return False
    return True
# Demo: exercise the main pyarabic.araby helpers on a few sample words
# (vocalized, unvocalized, a phrase, an unvocalized phrase, Latin text).
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""  # previous word, compared via vocalizedlike below
for word in word_list:
    print(word, '\t', end=" ")
    # classification helpers
    if araby.is_vocalized(word):
        print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word):
        print(' is vocalized text', end=" ")
    if araby.is_arabicword(word):
        print(' is valid word', end=" ")
    else:
        print("invalid arabic word", end=" ")
    # stripping / normalization helpers
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1):
        print("vocalized_like", end=" ")
    print()
    word1 = word
# vocalizedlike ignores the differing haraka between the two spellings
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
# the sample list is re-declared for the demos that follow this chunk
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
def test():
    """
    Command-line driver: vocalize (or strip) an input file or text.

    Reads options from grabargs(), processes the input line by line
    with ArabicVocalizer.TashkeelClass (or araby.strip_tashkeel when
    --strip is requested), optionally compares the produced
    vocalization against the already-vocalized input and prints
    per-line and cumulative WER/LER statistics.

    NOTE(review): this function still contains Python 2 `print`
    STATEMENTS (e.g. `print result_line`) and byte/str mixing
    (readline().decode, outfile.write(bytes)); it cannot run under
    Python 3 as-is.
    """
    options = grabargs()
    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']
    # filename = "samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)
    if not text:
        # file input: open the source and the "(Tashkeel)" output file
        try:
            myfile = open(filename)
            print("input file:", filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file:", outfilename)
            outfile = open(outfilename, "w")
        except:
            print(" Can't Open the given File ", filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        limit = 100000000
    if not strip_tashkeel:
        # build and configure the vocalizer according to the options
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            # print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()
        # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled
    # vocalizer.disableShowCollocationMark()
    # print "show delimiter", vocalizer.collo.showDelimiter
    # nolimit = True
    nolimit = False
    # read the first line
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # cumulative evaluation counters
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0      # NOTE(review): never updated below
    LettersError = 0
    WLMIncorrect = 0
    percent = 0         # NOTE(review): never updated below
    if compare:
        # dispaly stats for the current line
        print("id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine")
    while line and (nolimit or counter <= limit):
        # lines starting with "# " are treated as comments and skipped
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:
                # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # compare mode: the input is assumed vocalized;
                    # re-vocalize its stripped form and diff the two
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)
                    # stemmer = tashaphyne.stemming.ArabicLightStemmer()
                    # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    # ~inputlist = []
                    # ~for txt in texts:
                    # ~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        print("lists haven't the same length")
                        print(len(inputlist), len(outputlist))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            # negative similarity => mismatch
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            # compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1
            # display stat for every line
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                if lineTotal:
                    print("%0.2f%%\t" %
                          round(lineCorrect * 100.00 / lineTotal, 2)
                          )  # line Fully correct
                    print("%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2))  # line Strip correct
            # ~ print result.strip('\n').encode('utf8'),
            if text:
                print result.strip('\n').encode('utf8'),
            else:
                result_line = result.encode('utf8')
                print result_line
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")
            if progress and not nolimit:
                # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
                sys.stderr.write(
                    "\r[%d%%]%d/%d lines Full %0.2f Strip %0.2f " % (
                        counter * 100 / limit,
                        counter,
                        limit,
                        round(correct * 100.00 / total,
                              2),  # fully Correct
                        round((total - WLMIncorrect) * 100.00 / total,
                              2)  # Strip Correct
                    ))
                # ~sys.stderr.write("treatment of "+line.encode('utf8'))
                sys.stderr.flush()
        # get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None
    else:
        # while-else: reached the end of input (or the limit)
        print("Done")
def detect_chunks(self, wordlist):
    """
    Detect named-entity chunks in a word list and tag every position.

    Example:
        >>> detect_chunk(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله")
        ((1,3), (6,8))

    Produces one tag per word: self.begintag for a chunk start,
    self.intertag for a continuation, "0" otherwise; the final list is
    run through self.postprocess.

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of tags (same length as the preprocessed word list)
    @rtype: list
    """
    started = False   # True while we are inside a chunk
    taglist = []
    previous = ""     # previous unvocalized word
    wordlist, wordtag_list = self.preprocess(wordlist)
    for i, word_voc in enumerate(wordlist):
        # get previous tag and next
        prev_tag = wordtag_list[i-1] if i>0 else ""  # NOTE(review): unused
        next_tag = wordtag_list[i+1] if i<len(wordtag_list)-1 else ""
        #save the original word with possible harakat if exist
        word = araby.strip_tashkeel(word_voc)
        if not started:
            # the previous word was untagged, but the current word
            # makes it part of a chunk retroactively: re-tag it as a
            # begin tag and mark the current word as a continuation
            if self.is_middle_tuple_tag(word, previous):
                taglist.pop()
                taglist.append(self.begintag)
                taglist.append(self.intertag)
                started = True
            # a word that is a tag by itself
            elif self.is_wordtag(word):
                taglist.append(self.begintag)
                started = True
            elif self.is_starttag(word):
                taglist.append(self.begintag)
                started = True
            else:
                taglist.append("0")
                started = False
        else:
            # a chunk is already running
            # the pair (previous, current) keeps the chunk open
            if self.is_middle_tuple_tag(word, previous, next_tag):
                taglist.append(self.intertag)
            # a word that is a tag by itself
            elif self.is_wordtag(word):
                taglist.append(self.intertag)
            # a word tagged only when preceded (not at sentence end)
            elif self.is_middle_wordtag(word, next_tag):
                #~ taglist.append(self.intertag+"3")
                taglist.append(self.intertag)
            else:
                taglist.append("0")
                started = False;
        previous = word
    wordlist, taglist = self.postprocess(wordlist, taglist)
    return taglist