def do_vocalize_param(pagetitle, index, template, param, arabic, latin):
    """Try to vocalize ARABIC with the help of its transliteration LATIN.

    The work is delegated to ``ar_translit.tr_matching(arabic, latin, True,
    pagemsg)``; the first element of its result is taken as the vocalized
    text.  Every outcome is reported through ``msg`` with a prefix built from
    INDEX, PAGETITLE, ``template.name`` and PARAM.

    Returns the vocalized text when it is truthy, otherwise ``False`` (this
    includes the case where ``tr_matching`` raised, which is logged rather
    than propagated).

    NOTE(review): a result identical to ARABIC is still returned as-is
    rather than as False -- confirm that callers expect this.
    """
    def pagemsg(text):
        msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param, text))

    try:
        vocalized, _ = ar_translit.tr_matching(arabic, latin, True, pagemsg)
    except Exception as e:
        # Best effort: a matching failure is reported, not fatal.
        pagemsg("Trying to vocalize %s (%s): %s" % (arabic, latin, e))
        vocalized = None

    # Guard clause for the failure case (exception above or empty result).
    if not vocalized:
        pagemsg("Unable to vocalize %s (Latin %s)" % (arabic, latin))
        return False

    if vocalized != arabic:
        pagemsg("Would replace %s with vocalized %s (Latin %s)" % (arabic, vocalized, latin))
    else:
        pagemsg("No change in %s (Latin %s)" % (arabic, latin))
    return vocalized
def do_vocalize_param(pagetitle, index, template, param, arabic, latin):
    """Try to vocalize ARABIC with the help of its transliteration LATIN.

    The work is delegated to ``ar_translit.tr_matching(arabic, latin, True,
    pagemsg)``; the first element of its result is taken as the vocalized
    text.  Every outcome is reported through ``msg`` with a prefix built from
    INDEX, PAGETITLE, ``template.name`` and PARAM.

    Returns the vocalized text when it is truthy, otherwise ``False`` (this
    includes the case where ``tr_matching`` raised, which is logged rather
    than propagated).

    NOTE(review): a result identical to ARABIC is still returned as-is
    rather than as False -- confirm that callers expect this.
    """
    def pagemsg(text):
        msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param, text))

    try:
        vocalized, _ = ar_translit.tr_matching(arabic, latin, True, pagemsg)
    except Exception as e:
        # Best effort: a matching failure is reported, not fatal.
        pagemsg("Trying to vocalize %s (%s): %s" % (arabic, latin, e))
        vocalized = None

    # Guard clause for the failure case (exception above or empty result).
    if not vocalized:
        pagemsg("Unable to vocalize %s (Latin %s)" % (arabic, latin))
        return False

    if vocalized != arabic:
        pagemsg("Would replace %s with vocalized %s (Latin %s)" % (arabic, vocalized, latin))
    else:
        pagemsg("No change in %s (Latin %s)" % (arabic, latin))
    return vocalized
def do_canon_param(pagetitle, index, template, fromparam, toparam, paramtr,
                   arabic, latin, include_tempname_in_changelog=False):
    """Canonicalize an Arabic parameter value and its Latin transliteration.

    ARABIC is the current value of FROMPARAM and LATIN the current value of
    PARAMTR (possibly empty).  TOPARAM is the parameter name used in the
    changelog entry when it differs from FROMPARAM.  When
    INCLUDE_TEMPNAME_IN_CHANGELOG is true, the template name is prefixed to
    PARAMTR in changelog entries.  Progress and notes are reported through
    ``msg``; nothing is written to TEMPLATE here.

    Returns a tuple ``(canonarabic, canonlatin, actions)``:

    * ``canonarabic`` -- the canonicalized Arabic, or ``False`` when it is
      equal to ARABIC.
    * ``canonlatin`` -- ``True`` when LATIN equals the auto-transliteration
      of the canonical Arabic (a "remove redundant" action is recorded),
      ``False`` when the canonical Latin equals LATIN, the canonical Latin
      text when it differs, or ``""`` when LATIN was empty.
    * ``actions`` -- list of changelog strings describing the changes.

    If LATIN is exactly ``"-"`` nothing is done and ``(False, False, [])``
    is returned.
    """
    changelog = []
    tname = unicode(template.name)

    def pagemsg(text):
        msg("Page %s %s: %s.%s: %s" % (index, pagetitle, tname, fromparam, text))

    if show_template:
        pagemsg("Processing %s" % (unicode(template)))

    paramtrname = paramtr
    if include_tempname_in_changelog:
        paramtrname = "%s.%s" % (tname, paramtr)

    if latin == "-":
        pagemsg("Latin is -, taking no action")
        return False, False, []

    # ---- Compute canonarabic and canonlatin -------------------------------
    # Note the differing result order: tr_matching() yields (arabic, latin)
    # whereas canonicalize_latin_arabic() yields (latin, arabic).
    match_canon = False
    canonlatin = ""
    if not latin:
        _, canonarabic = ar_translit.canonicalize_latin_arabic(
            None, arabic, msgfun=pagemsg)
    else:
        try:
            canonarabic, canonlatin = ar_translit.tr_matching(
                arabic, latin, True, msgfun=pagemsg)
            match_canon = True
        except Exception as e:
            # Matching failed; fall back to canonicalizing both sides.
            pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" %
                    (arabic, latin, e, unicode(template)))
            canonlatin, canonarabic = ar_translit.canonicalize_latin_arabic(
                latin, arabic, msgfun=pagemsg)

    # Display forms used in log messages ("same" when nothing changed).
    newlatin = "same" if canonlatin == latin else canonlatin
    newarabic = "same" if canonarabic == arabic else canonarabic
    if latin or canonlatin:
        latintrtext = " (%s -> %s)" % (latin, newlatin)
    else:
        latintrtext = ""

    # ---- Auto-transliterate the canonical Arabic ---------------------------
    try:
        translit = ar_translit.tr(canonarabic, msgfun=pagemsg)
        if not translit:
            pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" %
                    (canonarabic, arabic, unicode(template)))
    except Exception as e:
        pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" %
                (canonarabic, arabic, e, unicode(template)))
        translit = None

    # ---- Report / record the Arabic change ---------------------------------
    if canonarabic == arabic:
        pagemsg("No change in Arabic %s%s" % (arabic, latintrtext))
        canonarabic = False
    else:
        if match_canon:
            operation, actionop, show_diff_string = "Vocalizing", "vocalize", False
        elif latin:
            operation, actionop, show_diff_string = "Cross-canoning", "cross-canon", True
        else:
            operation, actionop, show_diff_string = "Self-canoning", "self-canon", True

        diffmsg = " (%s)" % diff_string(arabic, canonarabic) if show_diff_string else ""
        pagemsg("%s Arabic %s -> %s%s%s: %s" %
                (operation, arabic, canonarabic, latintrtext, diffmsg,
                 unicode(template)))

        if fromparam == toparam:
            changelog.append("%s %s=%s -> %s" %
                             (actionop, fromparam, arabic, canonarabic))
        else:
            changelog.append("%s %s=%s -> %s=%s" %
                             (actionop, fromparam, arabic, toparam, canonarabic))

        # If the texts differ even with diacritics stripped, the change went
        # beyond vocalization: say so and list suspicious characters found
        # in the old text.
        rdcanonarabic = ar_translit.remove_diacritics(canonarabic)
        rdarabic = ar_translit.remove_diacritics(arabic)
        if rdarabic != rdcanonarabic:
            # NOTE(review): the plain " " test already covers the
            # startswith/endswith tests, so any space at all is reported; if
            # only doubled or edge spaces were meant, the first literal
            # should be two spaces -- confirm.
            suspects = [
                ("stray space", " " in rdarabic or rdarabic.startswith(" ")
                 or rdarabic.endswith(" ")),
                ("Latin", re.search("[A-Za-z]", nfd_form(rdarabic))),
                ("NBSP", u"\u00A0" in rdarabic),
                ("L2R/R2L", re.search(u"[\u200E\u200F]", rdarabic)),
                ("Farsi Yeh", u"ی" in rdarabic),
                ("Keheh", u"ک" in rdarabic),
                ("Arabic Pres-A", re.search(u"[\uFB50-\uFDCF]", rdarabic)),
                ("Arabic word ligatures", re.search(u"[\uFDF0-\uFDFF]", rdarabic)),
                ("Arabic Pres-B", re.search(u"[\uFE70-\uFEFF]", rdarabic)),
            ]
            msgs = [label for label, found in suspects if found]
            old_note = " (in old: %s)" % ", ".join(msgs) if msgs else ""
            diffmsg = diff_string(rdarabic, rdcanonarabic)
            pagemsg("NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s"
                    % (arabic, canonarabic, old_note, diffmsg, unicode(template)))

    # ---- Report / record the Latin change ----------------------------------
    if latin:
        if translit and (translit == canonlatin
                         # or translit == canonlatin + "un" or
                         #    translit == u"ʾ" + canonlatin or
                         #    translit == u"ʾ" + canonlatin + "un"
                         ):
            pagemsg("Removing redundant translit for %s -> %s%s" %
                    (arabic, newarabic, latintrtext))
            changelog.append("remove redundant %s=%s" % (paramtrname, latin))
            canonlatin = True
        else:
            if match_canon:
                operation, passive, actionop = (
                    "Match-canoning", "Match-canoned", "match-canon")
            else:
                operation, passive, actionop = (
                    "Cross-canoning", "Cross-canoned", "cross-canon")

            if translit:
                pagemsg("NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s" %
                        (passive, canonlatin, translit, unicode(template)))

            if canonlatin == latin:
                pagemsg("No change in Latin %s: Arabic %s -> %s (auto-translit %s)" %
                        (latin, arabic, newarabic, translit))
                canonlatin = False
            else:
                pagemsg("%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" %
                        (operation, latin, canonlatin, arabic, newarabic,
                         translit, unicode(template)))
                changelog.append("%s %s=%s -> %s" %
                                 (actionop, paramtrname, latin, canonlatin))

    return (canonarabic, canonlatin, changelog)
def do_canon_param(pagetitle, index, template, fromparam, toparam, paramtr,
                   arabic, latin, include_tempname_in_changelog=False):
    """Canonicalize an Arabic parameter value and its Latin transliteration.

    ARABIC is the current value of FROMPARAM and LATIN the current value of
    PARAMTR (possibly empty).  TOPARAM is the parameter name used in the
    changelog entry when it differs from FROMPARAM.  When
    INCLUDE_TEMPNAME_IN_CHANGELOG is true, the template name is prefixed to
    PARAMTR in changelog entries.  Progress and notes are reported through
    ``msg``; nothing is written to TEMPLATE here.

    Returns a tuple ``(canonarabic, canonlatin, actions)``:

    * ``canonarabic`` -- the canonicalized Arabic, or ``False`` when it is
      equal to ARABIC.
    * ``canonlatin`` -- ``True`` when LATIN equals the auto-transliteration
      of the canonical Arabic (a "remove redundant" action is recorded),
      ``False`` when the canonical Latin equals LATIN, the canonical Latin
      text when it differs, or ``""`` when LATIN was empty.
    * ``actions`` -- list of changelog strings describing the changes.

    If LATIN is exactly ``"-"`` nothing is done and ``(False, False, [])``
    is returned.
    """
    changelog = []
    tname = unicode(template.name)

    def pagemsg(text):
        msg("Page %s %s: %s.%s: %s" % (index, pagetitle, tname, fromparam, text))

    if show_template:
        pagemsg("Processing %s" % (unicode(template)))

    paramtrname = paramtr
    if include_tempname_in_changelog:
        paramtrname = "%s.%s" % (tname, paramtr)

    if latin == "-":
        pagemsg("Latin is -, taking no action")
        return False, False, []

    # ---- Compute canonarabic and canonlatin -------------------------------
    # Note the differing result order: tr_matching() yields (arabic, latin)
    # whereas canonicalize_latin_arabic() yields (latin, arabic).
    match_canon = False
    canonlatin = ""
    if not latin:
        _, canonarabic = ar_translit.canonicalize_latin_arabic(
            None, arabic, msgfun=pagemsg)
    else:
        try:
            canonarabic, canonlatin = ar_translit.tr_matching(
                arabic, latin, True, msgfun=pagemsg)
            match_canon = True
        except Exception as e:
            # Matching failed; fall back to canonicalizing both sides.
            pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" %
                    (arabic, latin, e, unicode(template)))
            canonlatin, canonarabic = ar_translit.canonicalize_latin_arabic(
                latin, arabic, msgfun=pagemsg)

    # Display forms used in log messages ("same" when nothing changed).
    newlatin = "same" if canonlatin == latin else canonlatin
    newarabic = "same" if canonarabic == arabic else canonarabic
    if latin or canonlatin:
        latintrtext = " (%s -> %s)" % (latin, newlatin)
    else:
        latintrtext = ""

    # ---- Auto-transliterate the canonical Arabic ---------------------------
    try:
        translit = ar_translit.tr(canonarabic, msgfun=pagemsg)
        if not translit:
            pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" %
                    (canonarabic, arabic, unicode(template)))
    except Exception as e:
        pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" %
                (canonarabic, arabic, e, unicode(template)))
        translit = None

    # ---- Report / record the Arabic change ---------------------------------
    if canonarabic == arabic:
        pagemsg("No change in Arabic %s%s" % (arabic, latintrtext))
        canonarabic = False
    else:
        if match_canon:
            operation, actionop, show_diff_string = "Vocalizing", "vocalize", False
        elif latin:
            operation, actionop, show_diff_string = "Cross-canoning", "cross-canon", True
        else:
            operation, actionop, show_diff_string = "Self-canoning", "self-canon", True

        diffmsg = " (%s)" % diff_string(arabic, canonarabic) if show_diff_string else ""
        pagemsg("%s Arabic %s -> %s%s%s: %s" %
                (operation, arabic, canonarabic, latintrtext, diffmsg,
                 unicode(template)))

        if fromparam == toparam:
            changelog.append("%s %s=%s -> %s" %
                             (actionop, fromparam, arabic, canonarabic))
        else:
            changelog.append("%s %s=%s -> %s=%s" %
                             (actionop, fromparam, arabic, toparam, canonarabic))

        # If the texts differ even with diacritics stripped, the change went
        # beyond vocalization: say so and list suspicious characters found
        # in the old text.
        rdcanonarabic = ar_translit.remove_diacritics(canonarabic)
        rdarabic = ar_translit.remove_diacritics(arabic)
        if rdarabic != rdcanonarabic:
            # NOTE(review): the plain " " test already covers the
            # startswith/endswith tests, so any space at all is reported; if
            # only doubled or edge spaces were meant, the first literal
            # should be two spaces -- confirm.
            suspects = [
                ("stray space", " " in rdarabic or rdarabic.startswith(" ")
                 or rdarabic.endswith(" ")),
                ("Latin", re.search("[A-Za-z]", nfd_form(rdarabic))),
                ("NBSP", u"\u00A0" in rdarabic),
                ("L2R/R2L", re.search(u"[\u200E\u200F]", rdarabic)),
                ("Farsi Yeh", u"ی" in rdarabic),
                ("Keheh", u"ک" in rdarabic),
                ("Arabic Pres-A", re.search(u"[\uFB50-\uFDCF]", rdarabic)),
                ("Arabic word ligatures", re.search(u"[\uFDF0-\uFDFF]", rdarabic)),
                ("Arabic Pres-B", re.search(u"[\uFE70-\uFEFF]", rdarabic)),
            ]
            msgs = [label for label, found in suspects if found]
            old_note = " (in old: %s)" % ", ".join(msgs) if msgs else ""
            diffmsg = diff_string(rdarabic, rdcanonarabic)
            pagemsg("NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s"
                    % (arabic, canonarabic, old_note, diffmsg, unicode(template)))

    # ---- Report / record the Latin change ----------------------------------
    if latin:
        if translit and (translit == canonlatin
                         # or translit == canonlatin + "un" or
                         #    translit == u"ʾ" + canonlatin or
                         #    translit == u"ʾ" + canonlatin + "un"
                         ):
            pagemsg("Removing redundant translit for %s -> %s%s" %
                    (arabic, newarabic, latintrtext))
            changelog.append("remove redundant %s=%s" % (paramtrname, latin))
            canonlatin = True
        else:
            if match_canon:
                operation, passive, actionop = (
                    "Match-canoning", "Match-canoned", "match-canon")
            else:
                operation, passive, actionop = (
                    "Cross-canoning", "Cross-canoned", "cross-canon")

            if translit:
                pagemsg("NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s" %
                        (passive, canonlatin, translit, unicode(template)))

            if canonlatin == latin:
                pagemsg("No change in Latin %s: Arabic %s -> %s (auto-translit %s)" %
                        (latin, arabic, newarabic, translit))
                canonlatin = False
            else:
                pagemsg("%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" %
                        (operation, latin, canonlatin, arabic, newarabic,
                         translit, unicode(template)))
                changelog.append("%s %s=%s -> %s" %
                                 (actionop, paramtrname, latin, canonlatin))

    return (canonarabic, canonlatin, changelog)