def assistanttashkeel(text):
    """
    Vocalize *text* through the vocalizer's ``assistanttashkeel`` method.

    A new ``TashkeelClass`` is created on every call, with its cache path set
    to the ``../tmp/`` directory relative to this module.

    Args:
        text: the text to vocalize.

    Returns:
        Whatever ``TashkeelClass.assistanttashkeel`` returns for *text*.
    """
    module_dir = os.path.dirname(__file__)
    cache_dir = os.path.join(module_dir, '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    return engine.assistanttashkeel(text)
def tashkeel2(text, lastmark):
    """
    Vocalize *text* and return the per-word suggestion structure.

    A new ``TashkeelClass`` is created on every call, with its cache path set
    to the ``../tmp/`` directory relative to this module.

    Args:
        text: the text to vocalize.
        lastmark: last-mark switch; the last mark is disabled when this is
            the string ``"0"`` or any falsy value.

    Returns:
        The result of ``TashkeelClass.tashkeel_ouput_html_suggest`` for *text*.
    """
    module_dir = os.path.dirname(__file__)
    cache_dir = os.path.join(module_dir, '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)

    # The last mark is kept only for a truthy value other than "0".
    keep_last_mark = lastmark != "0" and bool(lastmark)
    if not keep_last_mark:
        engine.disable_last_mark()

    return engine.tashkeel_ouput_html_suggest(text)
def tashkeel_text(text, lastmark=True):
    """
    Vocalize *text* and return the vocalized string, without suggestions.

    A new ``TashkeelClass`` is created on every call, with its cache path set
    to the ``../tmp/`` directory relative to this module.

    Args:
        text: the text to vocalize.
        lastmark: last-mark switch. The last mark is disabled when this is
            the string ``"0"``, the boolean ``False`` or the integer ``0``.
            Every other value (including the default ``True``, ``None`` and
            the empty string) keeps the last mark enabled.

    Returns:
        The result of ``TashkeelClass.tashkeel`` for *text*.
    """
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    # The default is a boolean, so a boolean/integer "off" value must be
    # honoured as well as the "0" string; previously only "0" had any effect
    # and lastmark=False was silently ignored.
    if lastmark in ("0", 0, False):
        vocalizer.disable_last_mark()
    return vocalizer.tashkeel(text)
def test():
    """
    Command-line driver: strip, reduce, vocalize, compare or evaluate tashkeel.

    Options are read with ``grabargs()``. The input is either the ``text``
    option (processed line by line, results printed to standard output) or the
    ``filename`` option (read as UTF-8, results written as UTF-8 to the
    ``outfile`` option, or to ``<filename>.Tashkeel.txt`` when none is given).
    ``text`` takes priority when both are supplied.

    For every input line, the first matching mode applies:

    * command ``strip``: marks are removed with ``araby.strip_tashkeel``;
    * ``compareto`` given: the line is compared with the matching line of
      that file and passed through unchanged;
    * ``evaluation`` set: the line is stripped, vocalized again and compared
      with the original line;
    * otherwise: the line is vocalized.

    With command ``reduce`` the result additionally goes through
    ``araby.reduceTashkeel``.

    The process exits through ``sys.exit`` when neither text nor file name is
    given, or when a file cannot be opened. All opened files are closed before
    the function returns or exits.
    """
    args = grabargs()

    filename = args.filename
    filename2 = args.compareto  # reference file used for comparison
    compare = bool(filename2)
    outfilename = args.outfile
    text = args.text
    if not text and not filename:
        print('Try: mishkal-console.py -h')
        sys.exit(0)

    # tashkeel command: anything other than "strip"/"reduce" means vocalize
    command = args.command
    strip_tashkeel = command == "strip"
    reducedTashkeel = command == "reduce"

    # general options
    limit = args.limit
    progress = args.progress
    verbose = args.verbose

    # options
    ignore = args.ignore
    cache = args.cache
    disableSyntax = args.syntax
    disableSemantic = args.semantic
    disableStat = args.stat
    enable_syn_train = args.train
    evaluation = args.evaluation

    # File handles are tracked so the ``finally`` clause can close whichever
    # ones were opened, including when leaving through sys.exit().
    myfile = None
    myfile2 = None
    outfile = None
    lines = []
    try:
        # Open files
        if not text:
            try:
                myfile = open(filename, encoding='utf8')
                print("input file:", filename)
                if not outfilename:
                    outfilename = filename + ".Tashkeel.txt"
                print("output file:", outfilename)
                # Explicit encoding: the locale default may be unable to
                # encode Arabic text.
                outfile = open(outfilename, "w", encoding='utf8')
            except OSError:
                print(" Can't Open the given File ", filename)
                sys.exit()
        else:
            lines = text.strip().split('\n')
        if compare:
            try:
                myfile2 = open(filename2, encoding='utf8')
                print("input file2:", filename2)
            except OSError:
                print(" Can't Open the given File ", filename2)
                sys.exit()

        # all things are well, import library
        myconsole = tashkeel_console.Tashkeel_console()
        myconsole.limit = limit
        if not limit:
            # Default limit: number of lines of the source actually processed.
            if not text:
                with open(filename, encoding='utf8') as counted:
                    limit = sum(1 for _ in counted)
            else:
                limit = len(lines)

        if not strip_tashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if cache:
                vocalizer.enable_cache()
                sys.stderr.write(" Mishkal use a cache")
            if ignore:
                vocalizer.disable_last_mark()
            if disableSemantic:
                vocalizer.disable_semantic_analysis()
            if disableSyntax:
                vocalizer.disable_syntaxic_analysis()
            if disableStat:
                vocalizer.disable_stat_tashkeel()
            if enable_syn_train:
                vocalizer.enable_syn_train()
            # if verbose option, then activate logger in ArabicVocalizer
            if verbose:
                vocalizer.enable_verbose()

        # first line to process
        if not text:
            line = myfile.readline()
        else:
            line = lines[0] if lines else None
        # first line to compare with
        if compare:
            line_base = myfile2.readline().strip()
        if evaluation:
            myconsole.header()

        # NOTE(review): in text mode an empty line is falsy and ends the loop,
        # whereas a blank line read from a file ("\n") does not — confirm
        # whether that difference is intended.
        while line and myconsole.counter <= limit:
            line = line.strip()
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            elif compare:
                myconsole.compare(line_base, line)
                myconsole.display_line_stat()
                result = line
                print("base :", line_base)
                print("input:", line)
            elif not evaluation:
                # vocalize line by line
                result = vocalizer.tashkeel(line)
                myconsole.total += len(araby.tokenize(line))
            else:
                # evaluation: strip the line, vocalize it again and compare
                # the outcome with the original line
                inputUnvocalizedLine = araby.strip_tashkeel(line)
                vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                    inputUnvocalizedLine)
                outputlist = [x.get("chosen", '') for x in vocalized_dict]
                result = u" ".join(outputlist)
                myconsole.compare(line, vocalized_dict)
                # display stat for every line
                myconsole.display_line_stat()

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)

            if text:
                print(result.strip('\n'), end='')
            else:
                if verbose:
                    print(result)
                # add line and new line to output file
                outfile.write(result)
                outfile.write("\n")
            if progress:
                # show progress bar
                myconsole.progress(compare)
            myconsole.counter += 1

            # get the next line
            if not text:
                line = myfile.readline()
            elif myconsole.counter < len(lines):
                # NOTE(review): indexing by the console counter presumes it
                # starts at 0 — verify against Tashkeel_console.
                line = lines[myconsole.counter]
            else:
                line = None
            # get the next line to compare
            if compare:
                line_base = myfile2.readline().strip()

        if progress:
            myconsole.footer()
    finally:
        for handle in (myfile, myfile2, outfile):
            if handle is not None:
                handle.close()
def compare_tashkeel(text):
    """
    Compare a vocalized text with its automatic vocalization, word by word.

    The given text is taken as the reference: its marks are stripped, the bare
    text is vocalized again, and each reference word is compared with the word
    chosen by the vocalizer.

    Args:
        text: the reference text, expected to be already vocalized.

    Returns:
        A two-item list:

        * an HTML fragment with one ``<span>`` per word, whose class is
          ``diff`` (words match), ``diff-mark`` (the reference matches the
          ``semi`` form of the suggestion), ``diff-word`` or ``diff-all``;
        * a summary string ``"correct:X%, incorrect:Y%"``.

        For a text without any word the fragment is empty and both
        percentages are reported as zero.

    Note:
        When the reference and the generated word lists differ in length, the
        paired words are printed and the process exits through ``sys.exit()``.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from html import escape

    def _attr(value):
        # Values end up inside single-quoted HTML attributes.
        return escape(str(value), quote=True)

    # the entered text is considered correctly vocalized
    correct_text = text.strip()
    unvocalized = araby.strip_tashkeel(correct_text)
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    suggestions = vocalizer.tashkeel_ouput_html_suggest(unvocalized)

    reference_words = list(vocalizer.analyzer.tokenize(correct_text))
    # Print the text itself; encoding it first would print a bytes repr.
    print(u"\t".join(reference_words))

    total = len(reference_words)
    if total != len(suggestions):
        print("lists haven't the same length", total, len(suggestions))
        for word, suggestion in zip(reference_words, suggestions):
            print(u"'%s'\t'%s'" % (word, suggestion.get('chosen', '')))
        sys.exit()

    summary_format = "correct:%0.2f%%, incorrect:%0.2f%%"
    if not total:
        # Nothing to compare: avoid dividing by zero below.
        return [u"", summary_format % (0.0, 0.0)]

    span_format = (u" <span id='diff' class='%s' original='%s' inflect='%s'"
                   u" link='%s' rule='%s'>%s</span>")
    correct = 0
    incorrect = 0
    pieces = []
    for word, suggestion in zip(reference_words, suggestions):
        chosen = suggestion['chosen']
        semi = suggestion['semi']  # word without inflection mark
        inflect = suggestion['inflect']
        link = suggestion['link']
        rule = suggestion['rule']

        if araby.vocalizedlike(word, chosen):
            correct += 1
            style = "diff"
            # A newline token is rendered as a line break.
            if chosen == "\n":
                shown = "<br/>"
            else:
                shown = escape(str(chosen), quote=False)
        else:
            incorrect += 1
            shown = escape(str(chosen), quote=False)
            if araby.vocalizedlike(word, semi):
                # only the last mark differs
                style = 'diff-mark'
            else:
                last_ref = word[-1:]
                last_out = chosen[-1:]
                ref_is_haraka = bool(araby.is_haraka(last_ref))
                out_is_haraka = bool(araby.is_haraka(last_out))
                same_last_mark = (ref_is_haraka and out_is_haraka
                                  and last_ref == last_out)
                # Same final haraka, or a final haraka on one side only.
                if same_last_mark or ref_is_haraka != out_is_haraka:
                    style = "diff-word"
                else:
                    style = 'diff-all'

        pieces.append(span_format % (
            style, _attr(word), _attr(inflect), _attr(link), _attr(rule),
            shown))

    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    return [u"".join(pieces), summary_format % (per_correct, per_incorrect)]