def main(args):
    generator = alyahmor.genelex.genelex()
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generator.generate_noun_affix_list()
    print(arepr(noun_affixes).replace(',', ',\n'))
    print('VERB_AFFIX_LIST=')
    verb_affixes = generator.generate_verb_affix_list()
    print(arepr(verb_affixes).replace(',', ',\n'))
    return 0
def test_affix():
    generator = alyahmor_genelex.genelex()
    word = u"قصد"
    wtype = "verb"
    list_forms = generator.generate_affix_list(word_type=wtype, indexed=True)
    print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
    wtype = "noun"
    print('********* Noun ************')
    list_forms = generator.generate_affix_list(word_type=wtype, indexed=True)
    print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
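# Usage sketch (not from the original sources): drives the alyahmor generator
# directly, using only calls already exercised in this section: genelex(),
# generate_forms() and generate_affix_list(). The sample words are
# illustrative assumptions.
if __name__ == '__main__':
    import alyahmor.genelex
    from pyarabic.arabrepr import arepr
    gen = alyahmor.genelex.genelex()
    # all affixed forms of a sample verb
    forms = gen.generate_forms(u"قصد", word_type="verb")
    print(arepr(forms).replace('),', '),\n'))
    # the affix inventory alone; indexed=True keys forms by affix
    affixes = gen.generate_affix_list(word_type="noun", indexed=True)
    print(arepr(affixes).replace('],', '],\n'))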
def add_footer(self):
    """close the data set, used for ending xml, or sql"""
    text = "STOPWORDS="
    text += arepr(self.STOPWORDS).decode('utf8')
    if self.generate_all_forms:
        text += "\n\nSTOPWORDS_INDEX="
        text += arepr(self.STOPWORDS_INDEX).decode('utf8')
    # add newlines for more readability
    text = text.replace('}],', '}],\n')
    text = text.replace('],', '],\n')
    text = text.replace('),', '),\n')
    return text
def log(self, data, msg=""):
    """ display internal data """
    if not self.debug:
        return False
    print(msg)
    print(arepr(data))
def main(args):
    df = pd.read_csv(
        "samples/Arabic-patterns/Arabic-patterns-tabbed-v2.txt",
        encoding='utf8',
        delimiter='\t',
    )
    outfile = "output/Arabic-patterns-tabbed.csv"
    # preprocess columns
    for name in df.columns.values:
        df[name] = df[name].apply(preprocess)
    # convert transliteration to utf8
    df["pattern"] = df["pattern"].apply(arabtrans.tim2utf8)
    df.to_csv(outfile + "debug", sep='\t', encoding='utf-8')
    df["singularPattern"] = df["singularPattern"].apply(arabtrans.tim2utf8)
    df['rhyzome'] = df['pattern'].apply(extract_rhyzome)
    df['unvocalized'] = df['pattern'].apply(araby.strip_tashkeel)
    df['weak'] = df['rhyzome'].apply(classify_rhyzome)
    print(df.head())
    #~ generate_rooton_list()
    df.sort_values(by=['rhyzome'], ascending=True, inplace=True)
    df2 = df[['rhyzome', 'unvocalized', 'pattern', 'weak', 'examples']]
    # other available columns: 'singularPattern', 'type', 'nType', 'vType',
    # 'isBrokenPlural', 'hasBrokenPlural', 'hasFem', 'subOf'
    df2.to_csv(outfile, sep='\t', encoding='utf-8')
    rhyzomes = list(df['rhyzome'].unique())
    print(arepr(rhyzomes))
    return 0
def main(args):
    word = u"قَصْدٌ"
    noun_forms = generate_noun_forms(word)
    #~ print(arepr(noun_forms).replace('),', '),\n'))
    #~ print('************verb*****')
    word = u"قصد"
    verb_forms = generate_verb_forms(word)
    #~ print(arepr(verb_forms).replace('),', '),\n'))
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generate_noun_affix_list()
    print(arepr(noun_affixes).replace(',', ',\n'))
    print('VERB_AFFIX_LIST=')
    verb_affixes = generate_verb_affix_list()
    print(arepr(verb_affixes).replace(',', ',\n'))
    return 0
def test_generate_one(tuple_list):
    generator = alyahmor_genelex.genelex()
    for word, wtype, affixes in tuple_list:
        affixes = affixes.split("-")
        list_forms = generator.generate_by_affixes(word, word_type=wtype,
                                                   affixes=affixes)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
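# Example call for test_generate_one (hypothetical input: each tuple carries a
# "-"-separated affix string, split above, whose parts must be affixes known
# to alyahmor):
#   test_generate_one([(u"قصد", "verb", u"سَ-ُوا")])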
def test_rooter3(dataframe_result):
    """ test the rhyzome rooter on a dataframe of words and roots """
    from pyarabic.arabrepr import arepr
    # test with tashaphyne
    asl = abstractstemmer.customStemmer_roots_rhyzome()
    # debug in rhyzome rooter
    asl.rootdict.rhyzome_rooter.debug = True
    df = dataframe_result
    # avoid null roots
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        if not is_stop(word):
            word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                          araby.HAMZA + araby.ALEF, word)
            asl.light_stem(word)
            default_root = asl.get_root()
            starword = asl.get_starword()
            asl.segment(word)
            affixa_list = asl.get_affix_list()
            # filter valid affixes (list() so it can be iterated repeatedly)
            affixa_list = list(filter(asl.verify_affix, affixa_list))
            stems = [d['stem'] for d in affixa_list]
            roots = [d['root'] for d in affixa_list]
            print((u"**********%s*********" % word).encode('utf8'))
            print((u"Start Word : %s" % starword).encode('utf8'))
            print((u"Stems: " + u' '.join(stems)).encode('utf8'))
            print((u"Default roots: [%s] a %s" %
                   (default_root, u' '.join(roots))).encode('utf8'))
            print(arepr(affixa_list))
            root_result = asl.rootdict.choose_root(word, affixa_list,
                                                   debug=True)
        else:
            root_result = stop_root(word)
        if root_result in root_list:
            cpt += 1
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
def test():
    # read file
    filename = "samples/majdi-patterns.csv"
    outfile = "output/majdi-patterns.csv"
    try:
        df = pd.read_csv(filename,
                         delimiter='\t',
                         encoding="utf-8",
                         )
    except IOError:
        print("Can't open the given file %s" % filename)
        sys.exit()
    print(df.head())
    df['rhyzome'] = df['Pattern'].apply(extract_rhyzome)
    print('**********after rhyzome******')
    print(df.head(100))
    # save file as csv
    df.to_csv(outfile, sep='\t', encoding='utf-8')
    rhyzomes = list(df['rhyzome'].unique())
    # filter some rhyzomes: avoid patterns containing Heh, Meem or Noon
    rhyzomes = [r for r in rhyzomes
                if not (araby.HEH in r or araby.MEEM in r or araby.NOON in r)]
    print(u"****rhyzomes****")
    print(u"\n".join(rhyzomes).encode('utf8'))
    print(len(rhyzomes))
    reduced = []
    for r in rhyzomes:
        reduced.extend(make_weak_rhyzome(r))
    reduced = list(set(reduced))
    reduced = [x for x in reduced if x not in rhyzomes]
    print(u"****reduced****")
    print(u"\n".join(reduced).encode('utf8'))
    print(len(reduced))
    rhyzomes.extend(reduced)
    print('In wazns not in Rhyzomes')
    diff2 = [x for x in WAZNS if x not in rhyzomes]
    print(arepr(diff2))
    print('RHYZOMES=')
    print(arepr(rhyzomes))
def main(args):
    generator = genelex.genelex()
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generator.generate_affix_list(word_type="noun",
                                                 vocalized=False)
    print(arepr(noun_affixes).replace(',', ',\n'))
    print('VERB_AFFIX_LIST=')
    verb_affixes = generator.generate_affix_list(word_type="verb",
                                                 vocalized=False)
    print(arepr(verb_affixes).replace(',', ',\n'))
    # print prefixes and suffixes
    noun_prefixes, noun_suffixes = generator.generate_prefix_suffix_list(
        word_type="noun", vocalized=False)
    print('NOUN_PREFIX_LIST=')
    print(arepr(noun_prefixes).replace(',', ',\n'))
    print('NOUN_SUFFIX_LIST=')
    print(arepr(noun_suffixes).replace(',', ',\n'))
    verb_prefixes, verb_suffixes = generator.generate_prefix_suffix_list(
        word_type="verb", vocalized=False)
    print('VERB_PREFIX_LIST=')
    print(arepr(verb_prefixes).replace(',', ',\n'))
    print('VERB_SUFFIX_LIST=')
    print(arepr(verb_suffixes).replace(',', ',\n'))
    return 0
def test(tuple_list):
    generator = alyahmor_genelex.genelex()
    for word, wtype in tuple_list:
        print('************%s*****' % wtype)
        list_forms = generator.generate_forms(word, word_type=wtype)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
        list_forms = generator.generate_forms(word, word_type=wtype,
                                              vocalized=False)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
        list_forms = generator.generate_forms(word, word_type=wtype,
                                              indexed=True)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
        list_forms = generator.generate_affix_list(word_type=wtype,
                                                   indexed=True)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
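# Example driver (not from the original sources; assumes test(), test2() and
# test_affix() from these snippets live in one importable module, and the
# sample words are illustrative):
if __name__ == '__main__':
    samples = [(u"كتاب", "noun"), (u"قصد", "verb")]
    test(samples)
    test2(samples)
    test_affix()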
def print_tuple(self, vdict):
    """ convert tuple to string """
    if type(vdict) is list:
        print(arepr(vdict))
    if "text" in vdict:
        return vdict['text']
    else:
        return u'\t'.join([
            vdict["word"], vdict["triliteral"], vdict["root"],
            vdict["future_type"], vdict["transitive"],
            str(vdict["nb_trans"]), vdict["object_type"],
            vdict["reflexive_type"], vdict["tenses"], vdict["model"],
            str(vdict["nb_case"]), vdict["verb_cat"], vdict["suggest"]
        ])
def main(args):
    args = grabargs()
    filename = args.filename
    outfile = args.outfile
    try:
        myfile = open(filename)
    except IOError:
        print("Can't open file %s" % filename)
        sys.exit()
    lines = myfile.readlines()
    debug = True
    limit = 500
    generator = alyahmor.genelex.genelex()
    #~ words = araby.tokenize(text)
    tuple_list = [l.decode('utf8').strip().split('\t') for l in lines]
    for word, wtype in tuple_list:
        if wtype == "noun":
            print('************Noun*****')
            noun_forms = generator.generate_noun_forms(word)
            print(arepr(noun_forms).replace('),', '),\n'))
        if wtype == "verb":
            print('************verb*****')
            verb_forms = generator.generate_verb_forms(word)
            print(arepr(verb_forms).replace('),', '),\n'))
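# Input format note: main() above expects a UTF-8 file of tab-separated
# "word<TAB>wordtype" lines, one entry per line, for example (illustrative):
#   قصد     verb
#   كتاب    noun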
def stemming_verb(self, verb_in):
    """
    Stemming verb
    @param verb_in: given verb
    @type verb_in: unicode
    @return: stemmed words
    @rtype: list
    """
    if not verb_in:
        return None
    detailed_result = []
    verb_list = [verb_in, ] + self.get_verb_variants(verb_in)
    verb_list = list(set(verb_list))
    debug = self.debug
    # list of segmented words
    word_segmented_list = []
    for verb in verb_list:
        list_seg_comp = self.comp_stemmer.segment(verb)
        for seg in list_seg_comp:
            proclitic = verb[:seg[0]]
            stem = verb[seg[0]:seg[1]]
            enclitic = verb[seg[1]:]
            # حالة الفعل المتعدي لمفعولين
            # case of a doubly transitive verb
            if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                enclitic = firstsuffix
            list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
            # if enclitic, then transitive is ok
            transitive_comp = bool(enclitic)
            for stm in list_stem:
                word_seg = {
                    "verb": verb,
                    "pro": proclitic,
                    "enc": enclitic,
                    'stem_comp': stm,
                    'trans_comp': transitive_comp,
                }
                word_segmented_list.append(word_seg)
    if debug:
        print("after first level")
        print(arepr(verb_in))
        print(print_table(word_segmented_list))
    # second level for segmented word
    tmp_list = []
    for word_seg in word_segmented_list:
        verb2 = word_seg['stem_comp']
        # stem reduced verb: level two
        # segment the conjugated verb
        list_seg_conj = self.conj_stemmer.segment(verb2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                          SVC.VERBAL_CONJUGATION_AFFIX)
        # verify proclitics and enclitics
        # verify length of stem
        for seg_conj in list_seg_conj:
            if (seg_conj[1] - seg_conj[0]) <= 6:
                # word seg in level 2
                word_seg_l2 = word_seg.copy()
                word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                tmp_list.append(word_seg_l2)
    # verify compatibility between proclitic and affixes
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # verify compatibility between proclitics and affixes
        proclitic = word_seg['pro']
        enclitic = word_seg['enc']
        affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
        if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
            tmp_list.append(word_seg.copy())
    # verify existence of candidate verb by stamp
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        if self.exists_as_stamp(word_seg['stem_conj']):
            tmp_list.append(word_seg.copy())
    if debug:
        print("after second level")
        print(arepr(verb_in))
        print(print_table(tmp_list))
    # get infinitive of candidate verbs
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # get infinitive of candidate verb by stamp:
        # search the verb in the dictionary by stamp;
        # if the verb exists in the dictionary,
        # the transitivity is considered;
        # if it is trilateral, return its forms and tashkeel,
        # if not, return forms without tashkeel,
        # because the conjugator can vocalize it;
        # we could return the tashkeel if we didn't need the
        # conjugation step
        infverb_dict = self.__get_infinitive_verb_by_stem(
            word_seg['stem_conj'], word_seg['trans_comp'])
        if debug:
            print("infinitive candidate verbs")
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        # filter verbs
        infverb_dict = self.__verify_infinitive_verbs(
            word_seg['stem_conj'], infverb_dict)
        if debug:
            print("valid infinitive candidate verbs")
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        for item in infverb_dict:
            # the haraka is given by the dict
            word_seg_l3 = word_seg.copy()
            word_seg_l3['inf'] = item['verb']
            word_seg_l3['haraka'] = item['haraka']
            word_seg_l3['root'] = item.get('root', '')
            word_seg_l3['transitive'] = bool(item['transitive'] in ('y', 1))
            tmp_list.append(word_seg_l3)
    # conjugation step
    if debug:
        print("after lookup dict")
        print(arepr(verb_in))
        print(print_table(tmp_list))
    # get conjugation for every infinitive verb
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # conjugate the verb with affixes,
        # if one conjugation matches the given word, return it
        # تصريف الفعل مع الزوائد
        # إذا توافق التصريف مع الكلمة الناتجة
        # تعرض النتيجة
        one_correct_conj = self.__generate_possible_conjug(
            word_seg['inf'], word_seg['stem_comp'],
            word_seg['prefix'] + '-' + word_seg['suffix'],
            word_seg['haraka'], word_seg['pro'], word_seg['enc'],
            word_seg['transitive'])
        for conj in one_correct_conj:
            word_seg_l4 = word_seg.copy()
            word_seg_l4['conj'] = conj.copy()
            tmp_list.append(word_seg_l4)
    if debug:
        print("after generating conjugation")
        print(arepr(verb_in))
        conjs = [item['conj'] for item in tmp_list]
        print(print_table(conjs))
    # generate all resulting data
    word_segmented_list = tmp_list
    for word_seg in word_segmented_list:
        conj = word_seg['conj']
        vocal_tuple_list = self.vocalize(conj['vocalized'], word_seg['pro'],
                                         word_seg['enc'])
        tag_type = 'Verb'
        original_tags = "y" if conj['transitive'] else "n"
        for vocalized, semivocalized, __ in vocal_tuple_list:
            # prepare tags; note: proclitic/enclitic here still hold the last
            # values assigned in the compatibility loop above
            tags = self.prepare_tags(conj, proclitic, enclitic)
            detailed_result.append(
                wordcase.WordCase({
                    'word': word_seg['verb'],
                    'affix': (word_seg['pro'], word_seg['prefix'],
                              word_seg['suffix'], word_seg['enc']),
                    'stem': word_seg['stem_conj'],
                    'root': ar.normalize_hamza(word_seg.get('root', '')),
                    'original': conj['verb'],
                    'vocalized': vocalized,
                    'semivocalized': semivocalized,
                    'tags': tags,
                    'type': tag_type,
                    'number': conj['pronoun_tags'].get('number', ''),
                    'gender': conj['pronoun_tags'].get('gender', ''),
                    'person': conj['pronoun_tags'].get('person', ''),
                    'tense2': conj['tense_tags'].get('tense', ''),
                    'voice': conj['tense_tags'].get('voice', ''),
                    'mood': conj['tense_tags'].get('mood', ''),
                    'confirmed': conj['tense_tags'].get('confirmed', ''),
                    'transitive': conj['transitive'],
                    'tense': conj['tense'],
                    'pronoun': conj['pronoun'],
                    'freq': 'freqverb',
                    'originaltags': original_tags,
                    'syntax': '',
                }))
    return detailed_result
u"جاء مليونان وألفان وإثنا عشر", u"وجدت خمسمئة وثلاث وعشرون دينارا", u"خمسمئة وثلاث وعشرون دينارا", u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا", u"لم أجد شيئا", u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا", u'من ثلاثمئة وخمسين بلدا ', u'من ثلاثمئة وخمسين بلدا ', u'من أربعمئة وخمسين بلدا ', u'السلام عليكم 2014', ] #~ arepr = arabrepr.ArabicRepr() for txt in TEXTS: word_list = araby.tokenize(txt) positions_phrases = detect_number_phrases_position(word_list) print(positions_phrases) nb_phrases = extract_number_phrases(txt) tag_list = detect_numbers(word_list) print(tag_list) print(u" ".join(word_list)) print(zip(tag_list, word_list)) print('tashkeel') tashkeel = u" ".join(pre_tashkeel_number(word_list)) if sys.version_info < (3, 0): print(tashkeel.encode('utf8')) else: print(tashkeel) print(u'\t'.join(nb_phrases)) print("detect number word") print(arabrepr.arepr(detect_number_words(txt)))
    return []


def main(args):
    word = u"لعلهم"
    print(stop_stem(word))
    return 0


if __name__ == '__main__':
    import sys
    from pyarabic.arabrepr import arepr
    words = [
        (u'منكم', True),
        (u'ممكن', False),
        (u'عندما', True),
        (u'حينئذ', True),
    ]
    for w, rep in words:
        result = is_stop(w)
        if result != rep:
            print((u"Error %s is %s where it must be %s" %
                   (w, result, rep)).encode('utf8'))
    print(len(stopwords_list()))
    print(len(classed_stopwords_list()))
    print(arepr(stopword_forms(u'حتى')))
    print(arepr(stopword_forms(u'جميع')))
    print(arepr(stop_stem(u'لجميعهم')))
    print(arepr(stop_stem(u'لجم')))
    targets = [x.strip() for x in item[1:] if x.strip()]
    targets = [araby.strip_tashkeel(x.strip()) for x in targets if x]
    word_list = araby.tokenize(text1)
    tag_list2 = chunker.detect_chunks(word_list)
    result = chunker.extract_chunks(text1)
    equal, inequal = eval_score(targets, result)
    print("Equal", equal, inequal)
    tests['correct'] += equal
    tests['incorrect'] += inequal
    if inequal and debug:
        # debug
        print("ID" + str(key), text1.encode('utf8'))
        print("result")
        print(arepr(result))
        print("target")
        print(arepr(targets))
        #~ result2 = chunker.detect_chunks(word_list)
        #~ print(arepr(result2))
        result2 = chunker.detect_positions(word_list, debug=True)
        print(arepr(result2))
        #~ tuples = zip(tag_list2, word_list)
        #~ for tup in tuples:
        #~     print(repr(tup).decode('unicode-escape').encode('utf8'))

# tests
for item in test_texts:
    text1 = item[0]
def test2(tuple_list):
    generator = alyahmor_genelex.genelex()
    for word, wtype in tuple_list:
        list_forms = generator.generate_forms(word, word_type=wtype)
        print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
def display_all(self):
    """ display all contents of data base """
    print("aranasyn.cache: display all records in Thaalib Database")
    for curr in self.db.all('a', with_doc=True):
        print(curr['doc']['a'], arepr(curr['doc']['d']))
u"جاء مليونان وألفان وإثنا عشر", u"وجدت خمسمئة وثلاث وعشرون دينارا", u"خمسمئة وثلاث وعشرون دينارا", u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا", u"لم أجد شيئا", u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا", u'من ثلاثمئة وخمسين بلدا ', u'من ثلاثمئة وخمسين بلدا ', u'من أربعمئة وخمسين بلدا ', u'السلام عليكم 2014', ] #~ arepr = arabrepr.ArabicRepr() for txt in TEXTS: word_list = araby.tokenize(txt) positions_phrases = detect_number_phrases_position(word_list) print(positions_phrases) nb_phrases = extract_number_phrases(txt) tag_list = detect_numbers(word_list) print(tag_list) print(u" ".join(word_list)) print(zip(tag_list, word_list)) print('tashkeel') tashkeel = u" ".join(pre_tashkeel_number(word_list)) if sys.version_info < (3, 0): print(tashkeel.encode('utf8')) else: print(tashkeel) print(u'\t'.join(nb_phrases)) print("detect number word") print(arabrepr.arepr(detect_number_words(txt)))
def stemming_noun(self, noun_in):
    """
    Analyze word morphologically as noun
    @param noun_in: the input noun.
    @type noun_in: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    self.set_error_code('')
    if not noun_in:
        self.set_error_code('Empty word')
        return None
    debug = self.debug
    detailed_result = []
    noun_list = [noun_in, ] + self.get_noun_variants(noun_in)
    word_segmented_list = []
    for noun in noun_list:
        list_seg_comp = self.comp_stemmer.segment(noun)
        # filter
        list_seg_comp = self.verify_affix(noun, list_seg_comp,
                                          SNC.COMP_NOUN_AFFIXES)
        # treat multi vocalization enclitic
        for seg in list_seg_comp:
            proclitic_nm = noun[:seg[0]]
            stem = noun[seg[0]:seg[1]]
            enclitic_nm = noun[seg[1]:]
            # adjusting noun variants
            list_stem = [stem, ] + self.get_input_stem_variants(
                stem, enclitic_nm)
            # stem reduced noun: level two
            for stem in list_stem:
                word_seg = {
                    'noun': noun,
                    'stem_comp': stem,
                    'pro': proclitic_nm,
                    'enc': enclitic_nm,
                }
                word_segmented_list.append(word_seg)
    if not word_segmented_list:
        self.set_error_code("First level segmentation error")
    # level two
    tmp_list = []
    if debug:
        print("after first level")
        print(arepr(noun_in))
        print(print_table(word_segmented_list))
    for word_seg in word_segmented_list:
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
        # verify affix compatibility
        # filter
        list_seg_conj = self.verify_affix(word_seg['stem_comp'],
                                          list_seg_conj,
                                          SNC.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        for seg_conj in list_seg_conj:
            stem_conj = word_seg['stem_comp'][:seg_conj[1]]
            suffix = word_seg['stem_comp'][seg_conj[1]:]
            # normalize hamza before guessing different origins
            stem_conj = ar.normalize_hamza(stem_conj)
            stem_conj_list = self.get_stem_variants(stem_conj, suffix)
            # generate possible stems
            # add stripped letters to the stem to constitute possible nouns
            for stem in stem_conj_list:
                word_seg_l2 = word_seg.copy()
                word_seg_l2['stem_conj'] = stem
                word_seg_l2['suffix'] = suffix
                # affix tags contain prefix and suffix tags
                word_seg_l2['affix_tags'] = list(
                    set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]['tags']
                        + SNC.COMP_SUFFIX_LIST_TAGS[
                            word_seg_l2['enc']]['tags']
                        + SNC.CONJ_SUFFIX_LIST_TAGS[
                            word_seg_l2['suffix']]['tags']))
                tmp_list.append(word_seg_l2)
    if debug:
        print("after second level")
        print(arepr(noun_in))
        print(print_table(tmp_list))
    # lookup in dictionary
    if not tmp_list:
        self.set_error_code("Second level segmentation error")
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # search the noun in the dictionary
        # we can return the tashkeel
        inf_noun = word_seg['stem_conj']
        # get the noun and all its forms from the dict;
        # if the noun has a plural suffix, don't look it up in the
        # broken plural dictionary
        if inf_noun in self.cache_dict_search:
            infnoun_foundlist = self.cache_dict_search[inf_noun]
        else:
            infnoun_foundlist = self.lookup_dict(inf_noun)
            self.cache_dict_search[inf_noun] = infnoun_foundlist
        for noun_tuple in infnoun_foundlist:
            word_seg_l3 = word_seg.copy()
            word_seg_l3["original"] = noun_tuple['vocalized']
            word_seg_l3["noun_tuple"] = dict(noun_tuple)
            tmp_list.append(word_seg_l3)
    if debug:
        print("after lookup dict")
        print(arepr(noun_in))
        noun_tuples = [item['noun_tuple'] for item in tmp_list]
        print(print_table(noun_tuples))
    # test compatibility of noun_tuple with affixes and proclitics
    # and generate vocalized affixes and suffixes
    if not tmp_list:
        self.set_error_code("Not exists in dictionary")
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # test if the given word from the dictionary accepts the
        # tags given by affixes
        # دراسة توافق الزوائد مع خصائص الاسم،
        # مثلا هل يقبل الاسم التأنيث.
        # e.g. does the noun accept the feminine mark?
        if self.validate_tags(word_seg['noun_tuple'],
                              word_seg['affix_tags'], word_seg['pro'],
                              word_seg['enc'], word_seg['suffix']):
            # get all vocalized forms of suffixes
            for pro_voc in SNC.COMP_PREFIX_LIST_TAGS[
                    word_seg['pro']]['vocalized']:
                for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                        word_seg['enc']]['vocalized']:
                    for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                            word_seg['suffix']]['vocalized']:
                        # verify compatibility between proclitics and affixes
                        if self.__check_clitic_affix(
                                word_seg['noun_tuple'], pro_voc, enc_voc,
                                suf_voc):
                            # get affix tags
                            affix_tags_voc = \
                                SNC.COMP_PREFIX_LIST_TAGS[pro_voc]['tags'] \
                                + SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags'] \
                                + SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                            word_seg_l4 = word_seg.copy()
                            word_seg_l4['suf_voc'] = suf_voc
                            word_seg_l4['enc_voc'] = enc_voc
                            word_seg_l4['affix_tags'] = affix_tags_voc
                            tmp_list.append(word_seg_l4)
    if debug:
        print("after check compatibility")
        print(arepr(noun_in))
        noun_tuples = [item['noun_tuple'] for item in tmp_list]
        print(print_table(noun_tuples))
    # generate results
    if not tmp_list:
        self.set_error_code("Affixes not compatible")
    word_segmented_list = tmp_list
    for word_seg in word_segmented_list:
        # get vocalized and vocalized without inflection
        voca_tuple_list = self.vocalize(
            word_seg['noun_tuple']['vocalized'], word_seg['pro'],
            word_seg['suf_voc'], word_seg['enc_voc'])
        for vocalized, semi_vocalized, _ in voca_tuple_list:
            # add some tags from the dictionary entry, such as
            # mamnou3 min sarf and broken plural
            original_tags = []
            if word_seg['noun_tuple']['mankous'] == u"Tk":
                original_tags.append(u"منقوص")
            # if there are many cases, like feminine plural with
            # mansoub and majrour
            if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                    word_seg['suf_voc']]['cases']
            else:
                list_cases = ('', )
            for case in list_cases:
                voc_affix_case = word_seg['affix_tags'] + (case, )
                # filter empty
                voc_affix_case = [vac for vac in voc_affix_case if vac]
                detailed_result.append(
                    wordcase.WordCase({
                        'word': noun_in,
                        'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                                  word_seg['enc_voc']),
                        'stem': word_seg['stem_conj'],
                        'root': ar.normalize_hamza(
                            word_seg['noun_tuple'].get('root', '')),
                        'original': word_seg['noun_tuple']['vocalized'],
                        'vocalized': vocalized,
                        'semivocalized': semi_vocalized,
                        'tags': u':'.join(voc_affix_case),
                        'type': u':'.join(
                            ['Noun', word_seg['noun_tuple']['wordtype']]),
                        'number': word_seg['noun_tuple']['number'],
                        'gender': word_seg['noun_tuple']['gender'],
                        'freq': 'freqnoun',  # to note the frequency type
                        'originaltags': u':'.join(original_tags),
                        'syntax': '',
                    }))
    if not detailed_result:
        self.set_error_code("Forms are not generated")
    if debug:
        print("after generate result")
        print(len(detailed_result))
    return detailed_result
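# Minimal driver sketch for stemming_noun (assumptions: the enclosing class is
# the Qalsadi-style noun stemmer; the class name NounStemmer and its
# no-argument constructor are assumed here, and the word is illustrative).
if __name__ == '__main__':
    from pyarabic.arabrepr import arepr
    stemmer = NounStemmer()  # assumed constructor of the enclosing class
    stemmer.debug = True     # enables the arepr()/print_table() traces above
    for case in stemmer.stemming_noun(u"بالكتاب"):
        print(arepr(case))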