def process_page(index, page, save, verbose):
    """Remove ``sort=`` from every template whose name is in ``fr_head_templates``.

    Pages whose title contains a colon are skipped. When the page text
    changes, it is saved if ``save`` is true; otherwise the function only
    logs what it would have saved. ``verbose`` is accepted but not used.
    Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    original_text = unicode(page.text)
    wikicode = blib.parse_text(original_text)
    notes = []

    for template in wikicode.filter_templates():
        template_name = unicode(template.name)
        if template_name not in fr_head_templates:
            continue
        before = unicode(template)
        rmparam(template, "sort")
        after = unicode(template)
        if before != after:
            pagemsg("Replacing %s with %s" % (before, after))
            notes.append("remove sort= from {{%s}}" % template_name)

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    # Any textual change must have been recorded as a note above.
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Move the first positional argument of ``{{ru-IPA}}`` into ``phon=``.

    Templates that already carry a non-empty ``phon=`` are only reported.
    If the page text changed, it is saved when ``save`` is true (otherwise
    the would-be save is logged); if nothing changed, "Skipping" is logged.
    With ``verbose`` set, the full old and new page text are logged too.
    Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    original_text = unicode(page.text)
    wikicode = blib.parse(page)

    for template in wikicode.filter_templates():
        if unicode(template.name) != "ru-IPA":
            continue
        before = unicode(template)
        if getparam(template, "phon"):
            pagemsg("phon= already present: %s" % before)
            continue
        phon_value = getparam(template, "1")
        pagemsg("Adding phon=: %s" % before)
        rmparam(template, "1")
        template.add("phon", phon_value)
        pagemsg("Replaced %s with %s" % (before, unicode(template)))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        pagemsg("Skipping")
        return

    if verbose:
        pagemsg("Replacing <<%s>> with <<%s>>" % (original_text, updated_text))
    comment = "Add phon= to ru-IPA templates"
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def fix_cite_book_params(t):
    """Rename legacy parameters of a cite-book style template in place.

    Steps, in order:
      * if both ``origyear`` and ``year`` are non-blank, rename
        ``year`` -> ``year_published`` and ``origyear`` -> ``year``
        (unless ``year_published`` already has a value, which is reported);
      * ``origdate`` -> ``date`` and ``origmonth`` -> ``month``;
      * ``id`` -> ``isbn``, stripping a leading ISBN label from the value;
      * delegate page-number parameters to ``fix_page_params``.

    ``pagemsg``, ``move_param`` and ``fix_page_params`` are taken from the
    surrounding scope. Returns True if the template's text changed.
    """
    before = unicode(t)

    isbn_prefix = re.compile(
        r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)", re.I)

    def frob_isbn(idval):
        # Drop the ISBN label but keep any leading whitespace (group 1).
        if isbn_prefix.search(idval):
            return isbn_prefix.sub(r"\1", idval)
        # A bare value starting with a digit is accepted unchanged.
        if re.search(r"^[0-9]", idval.strip()):
            return idval
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
            idval.replace("\n", r"\n"))
        return None

    has_both_years = (getparam(t, "origyear").strip()
                      and getparam(t, "year").strip())
    if has_both_years:
        if not getparam(t, "year_published"):
            rmparam(t, "year_published")  # in case of blank param
            t.get("year").name = "year_published"
            t.get("origyear").name = "year"
            pagemsg("year -> year_published, origyear -> year")
        else:
            pagemsg("WARNING: Would set year_published= but is already present: %s" %
                unicode(t))

    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)

    return before != unicode(t)
def fix_quote_usenet_params(t):
    """Rename legacy parameters of a quote-usenet style template in place.

    When both ``monthday`` and ``year`` are non-blank they are merged into a
    single ``date`` parameter (joined with "/" if ``monthday`` looks like
    ``N/N``, otherwise with a space), unless ``date`` already has a value.
    Afterwards a fixed list of renames is applied through ``move_param``.

    ``pagemsg`` and ``move_param`` come from the surrounding scope.
    Returns True if the template's text changed.
    """
    before = unicode(t)

    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
        if not getparam(t, "date"):
            rmparam(t, "date")  # in case of blank param
            date_param = t.get("monthday")
            date_param.name = "date"
            numeric_monthday = re.search("^[0-9]+/[0-9]+$", monthday)
            date_format = "%s/%s" if numeric_monthday else "%s %s"
            date_param.value = date_format % (monthday, year)
            rmparam(t, "year")
            pagemsg("monthday/year -> date")
        else:
            pagemsg("WARNING: Would set date= but is already present: %s" % unicode(t))

    # Order matters: named params first, then positional ones from highest
    # to lowest.
    renames = (
        ("group", "newsgroup"),
        ("text", "passage"),
        ("6", "passage"),
        ("5", "url"),
        ("4", "newsgroup"),
        ("3", "title"),
        ("2", "author"),
        ("1", "date"),
    )
    for old_name, new_name in renames:
        move_param(t, old_name, new_name)

    return before != unicode(t)
def process_page(templates, index, page, save=False, verbose=False):
    """Shift the plural-stem argument one position to the left.

    For each template whose name is in ``templates``: if ``arg1_is_stress``
    accepts argument 1, the value in position 5 moves to position 4;
    otherwise the value in position 4 moves to position 3. Templates that
    contain a positional ``or``, ``-``, ``_`` or ``join:...`` argument are
    skipped with a warning, as are templates with values in both the old and
    new positions. The page is saved only if ``save`` is true. ``verbose``
    is accepted but not used. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def unsupported_warning(template):
        # Return the warning format for the first positional argument that
        # marks a multi-part template, or None when there is none.
        for param in template.params:
            if param.showkey:
                continue
            value = unicode(param.value)
            if value == "or":
                return "WARNING: Can't handle multi-decl templates: %s"
            if value in ("-", "_") or value.startswith("join:"):
                return "WARNING: Can't handle multi-word templates: %s"
        return None

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist")
        return

    wikicode = blib.parse(page)
    changed = False

    for template in wikicode.filter_templates():
        if unicode(template.name) not in templates:
            continue
        before = unicode(template)

        # Punt if multi-arg-set, can't handle yet
        warning = unsupported_warning(template)
        if warning:
            pagemsg(warning % unicode(template))
            continue

        if arg1_is_stress(getparam(template, "1")):
            old_position, new_position = "5", "4"
        else:
            old_position, new_position = "4", "3"

        plstem = getparam(template, old_position)
        if not plstem:
            continue
        if getparam(template, new_position):
            pagemsg("WARNING: Something wrong, found args in both positions %s and %s: %s" %
                (new_position, old_position, unicode(template)))
            continue
        rmparam(template, old_position)
        template.add(new_position, plstem)
        changed = True
        pagemsg("Replacing %s with %s" % (before, unicode(template)))

    if not changed:
        return
    comment = "Move plstem from 5th/4th argument to 4th/3rd"
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = unicode(wikicode)
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Fold ``past_m=`` of class-8 ``{{ru-conj}}`` calls into positional args.

    For ``{{ru-conj}}`` templates whose argument 2 starts with ``8a``/``8b``
    and which have a non-empty ``past_m``:
      * if it equals argument 3 (the stem), it is simply removed;
      * if the class starts with ``8b`` but not ``8b/`` and the unstressed
        ``past_m`` equals the stem, ``past_m`` replaces argument 3;
      * otherwise ``past_m`` is stored in argument 5.
    Templates with any parameter equal to ``or`` are skipped. The page is
    saved only if ``save`` is true. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        before = unicode(template)
        if unicode(template.name) != "ru-conj":
            continue
        conj_class = getparam(template, "2")
        if not re.search(r"^8[ab]", conj_class):
            continue
        if any(unicode(p.value) == "or" for p in template.params):
            pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(template))
            continue

        past_m = getparam(template, "past_m")
        if past_m:
            rmparam(template, "past_m")
            stem = getparam(template, "3")
            plain_8b = (conj_class.startswith("8b")
                        and not conj_class.startswith("8b/"))
            if stem == past_m:
                pagemsg("Stem %s and past_m same" % stem)
                notes.append("remove redundant past_m %s" % past_m)
            elif plain_8b and ru.make_unstressed(past_m) == stem:
                pagemsg("Class 8b/b and stem %s is unstressed version of past_m %s, replacing stem with past_m" % (
                    stem, past_m))
                template.add("3", past_m)
                notes.append("moving past_m %s to arg 3" % past_m)
            else:
                pagemsg("Stem %s and past_m %s are different, putting past_m in param 5" % (
                    stem, past_m))
                template.add("5", past_m)
                notes.append("moving past_m %s to arg 5" % past_m)

        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Disabled script: convert class-6 ``{{ru-conj}}`` calls to ``6°``.

    The function currently logs a warning and returns immediately. The code
    after the ``return`` is unreachable and kept as the legacy
    implementation: it rewrote argument 1 from ``6...`` to ``6°...`` for
    ``6b`` and for ``6a``/``6c`` carrying ``no_iotation``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable legacy implementation ----
    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        before = unicode(template)
        conj_class = getparam(template, "1")
        if unicode(template.name) == "ru-conj":
            add_degree_sign = False
            if re.search(r"^6[ac]", conj_class):
                if getparam(template, "no_iotation"):
                    rmparam(template, "no_iotation")
                    if conj_class.startswith("6a"):
                        notes.append(u"6a + no_iotation -> 6°a")
                    else:
                        notes.append(u"6c + no_iotation -> 6°c")
                    add_degree_sign = True
            elif re.search(r"^6b", conj_class):
                notes.append(u"6b -> 6°b")
                add_degree_sign = True
            if add_degree_sign:
                template.add("1", re.sub("^6", u"6°", conj_class))
        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Convert ``head=`` of ``{{ru-phrase}}`` into positional argument 1.

    A template that has ``tr=`` is reported with a warning. If it has
    ``head=`` but no ``1=``, ``head`` is removed and re-added as ``1``;
    ``tr`` is removed and re-added afterwards (only when non-empty) so it
    follows the new argument. Templates with both ``head=`` and ``1=`` are
    only reported. The page is saved only if ``save`` is true. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        if unicode(template.name) != "ru-phrase":
            continue
        if template.has("tr"):
            pagemsg("WARNING: Has tr=: %s" % unicode(template))
        if not template.has("head"):
            continue
        if template.has("1"):
            pagemsg("WARNING: Has both head= and 1=: %s" % unicode(template))
            continue

        notes.append("ru-phrase: convert head= to 1=")
        before = unicode(template)
        head_value = getparam(template, "head")
        rmparam(template, "head")
        translit = getparam(template, "tr")
        rmparam(template, "tr")
        template.add("1", head_value)
        if translit:
            template.add("tr", translit)
        pagemsg("Replacing %s with %s" % (before, unicode(template)))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Strip gender parameters from Russian adjective-form headwords.

    The page text is split on level-2 headers; inside the single
    ``==Russian==`` section, ``g``, ``g2``, ``g3`` and ``g4`` are removed
    from every ``{{head|ru|adjective form}}``. Pages with a colon in the
    title or with more than one Russian section are skipped. The page is
    saved only if ``save`` is true. Returns None.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)  # not used below

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    original_text = unicode(page.text)
    notes = []
    seen_russian = False

    # With one capture group, re.split() yields
    # [preamble, header1, body1, header2, body2, ...].
    sections = re.split("(^==[^=]*==\n)", original_text, 0, re.M)
    for body_index in xrange(2, len(sections), 2):
        if sections[body_index - 1] != "==Russian==\n":
            continue
        if seen_russian:
            pagemsg("WARNING: Found multiple Russian sections, skipping page")
            return
        seen_russian = True

        # Remove gender from adjective forms
        section_code = blib.parse_text(sections[body_index])
        for template in section_code.filter_templates():
            is_adj_form = (unicode(template.name) == "head"
                           and getparam(template, "1") == "ru"
                           and getparam(template, "2") == "adjective form")
            if not is_adj_form:
                continue
            before = unicode(template)
            for gender_param in ("g", "g2", "g3", "g4"):
                rmparam(template, gender_param)
            after = unicode(template)
            if before != after:
                pagemsg("Replaced %s with %s" % (before, after))
                notes.append("remove gender from adjective forms")
        sections[body_index] = unicode(section_code)

    updated_text = "".join(sections)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose, genders):
    """Replace the gender parameters of the page's single headword template.

    The page must contain exactly one ``{{ru-noun+}}`` or
    ``{{ru-proper noun+}}``; otherwise a warning is logged and nothing is
    done. ``g`` and ``g2``..``g5`` are removed, then ``genders`` are written
    as ``g``, ``g2``, ``g3``, ... in order. The page is saved whenever
    ``save`` is true, without checking whether the text actually changed.
    ``verbose`` is accepted but not used. Returns None.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub(".*:", "", pagetitle)  # not used below

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    wikicode = blib.parse(page)

    headword_names = ("ru-noun+", "ru-proper noun+")
    candidates = [t for t in wikicode.filter_templates()
                  if unicode(t.name) in headword_names]
    if len(candidates) > 1:
        pagemsg("WARNING: Multiple headword templates, skipping")
        return
    if not candidates:
        pagemsg("WARNING: No headword templates, skipping")
        return
    headword_template = candidates[0]

    before = unicode(headword_template)
    for stale in ("g", "g2", "g3", "g4", "g5"):
        rmparam(headword_template, stale)
    for position, gender in enumerate(genders, 1):
        param_name = "g" if position == 1 else "g%d" % position
        headword_template.add(param_name, gender)
    pagemsg("Replacing %s with %s" % (before, unicode(headword_template)))

    comment = "Fix headword gender, substituting new value %s" % ",".join(genders)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = unicode(wikicode)
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Fold ``suffix=ся`` of adjective declension templates into the lemma.

    Only pages whose title ends in "ся" are examined. For every
    ``{{ru-decl-adj}}`` / ``{{ru-adj-old}}`` with ``suffix=ся``, "ся" is
    appended to each comma-separated alternative in argument 1 and the
    ``suffix`` parameter is removed. The page is saved only if ``save`` is
    true; with ``verbose`` the full old and new text are logged.
    Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if not pagetitle.endswith(u"ся"):
        return

    text = unicode(page.text)
    notes = []
    parsed = blib.parse(page)

    for t in parsed.filter_templates():
        origt = unicode(t)
        if (unicode(t.name) in ["ru-decl-adj", "ru-adj-old"]
                and getparam(t, "suffix") == u"ся"):
            lemma = getparam(t, "1")
            # Append the suffix exactly once, before any trailing
            # whitespace. The previous re.sub("$", ...) matched both before
            # a final newline and at end of string, so a value ending in
            # "\n" got the suffix twice, and trailing spaces ended up
            # between the lemma and the suffix.
            core = lemma.rstrip()
            trailing_ws = lemma[len(core):]
            lemma = core.replace(",", u"ся,") + u"ся" + trailing_ws
            t.add("1", lemma)
            rmparam(t, "suffix")
            notes.append(u"move suffix=ся to lemma")
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    new_text = unicode(parsed)
    if new_text == text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change must have been recorded as a note above.
    assert notes
    comment = "; ".join(notes)
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
    else:
        pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Disabled script: merge argument 4 ("щ") of ``{{ru-conj-4a}}`` into 3.

    The function currently logs a warning and returns immediately. The code
    after the ``return`` is unreachable and kept as the legacy
    implementation.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable legacy implementation ----
    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        before = unicode(template)
        if unicode(template.name) == "ru-conj-4a":
            fourth = getparam(template, "4")
            if fourth == u"щ":
                template.add("3", getparam(template, "3") + fourth)
                rmparam(template, "4")
                notes.append(u"move param 4 (щ) to param 3")
            elif fourth:
                pagemsg("WARNING: Strange value %s for param 4" % fourth)
        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def move_param(t, fr, to, frob_from=None):
    """Rename parameter ``fr`` of template ``t`` to ``to``.

    * A blank ``fr`` is simply removed.
    * If ``frob_from`` is given it maps the old value to the new one; a
      falsy or blank result aborts the move without touching the template.
    * If ``to`` already has a non-blank value, a warning is logged and
      nothing is moved.

    ``pagemsg`` is taken from the surrounding scope. Returns None.
    """
    if not t.has(fr):
        return

    oldval = getparam(t, fr)
    if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return

    newval = oldval
    if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
            return

    if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
            % (fr, to, to, unicode(t)))
        return

    value_changed = oldval != newval
    rmparam(t, to)  # in case of blank param

    # If either old or new name is a number, use remove/add to automatically
    # set the showkey value properly; else it's safe to just change the name
    # of the param, which will preserve its location.
    involves_number = re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to)
    if involves_number:
        rmparam(t, fr)
        t.add(to, newval)
    else:
        source_param = t.get(fr)
        source_param.name = to
        if value_changed:
            source_param.value = newval

    if value_changed:
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
            newval.replace("\n", r"\n")))
    else:
        pagemsg("%s -> %s" % (fr, to))
def process_page(index, page, save, verbose):
    """Remove the ``adj=`` and ``shto=`` parameters from ``{{ru-ux}}``.

    Each removal is logged and recorded as an edit-summary note. The page is
    saved only if ``save`` is true; with ``verbose`` the full old and new
    text are logged. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    # (parameter, log message, edit-summary note)
    removals = (
        ("adj", "Removing adj=", "remove adj= from ru-ux"),
        ("shto", "Removing shto=", "remove shto= from ru-ux"),
    )

    for template in wikicode.filter_templates():
        if unicode(template.name) != "ru-ux":
            continue
        before = unicode(template)
        for param_name, log_text, note in removals:
            if template.has(param_name):
                pagemsg(log_text)
                notes.append(note)
                rmparam(template, param_name)
        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Disabled script: turn argument 4 of ru-conj-5c/6b into ``past_f=``.

    The function currently logs a warning and returns immediately. The code
    after the ``return`` is unreachable and kept as the legacy
    implementation.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable legacy implementation ----
    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        before = unicode(template)
        if unicode(template.name) in ("ru-conj-5c", "ru-conj-6b"):
            fourth = getparam(template, "4")
            if fourth:
                # Insert past_f= where 4= currently sits, then drop 4=.
                template.add("past_f", fourth, before="4")
                rmparam(template, "4")
                notes.append("Replace 4= with past_f=")
        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Disabled script: renumber trailing arguments of ``{{ru-conj}}``.

    The function currently logs a warning and returns immediately. The code
    after the ``return`` is unreachable and kept as the legacy
    implementation: for class ``6a`` it moved argument 6 to 4, and for class
    ``7b`` argument 7 to 6, padding missing lower positions with blanks.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable legacy implementation ----
    def pad_positionals(template, last):
        # Make sure positions 1..last exist so the new argument lands in the
        # intended slot.
        for position in xrange(1, last + 1):
            key = str(position)
            if not template.has(key):
                template.add(key, "")

    pagemsg("Processing")

    original_text = unicode(page.text)
    wikicode = blib.parse(page)
    notes = []

    for template in wikicode.filter_templates():
        before = unicode(template)
        if unicode(template.name) == "ru-conj":
            conjtype = getparam(template, "1")

            if conjtype.startswith("6a"):
                sixth = getparam(template, "6")
                if sixth:
                    rmparam(template, "6")
                    if not getparam(template, "5"):
                        rmparam(template, "5")
                    pad_positionals(template, 3)
                    template.add("4", sixth)
                    notes.append("move type 6a arg6 -> arg4")

            if conjtype.startswith("7b"):
                seventh = getparam(template, "7")
                if seventh:
                    rmparam(template, "7")
                    pad_positionals(template, 5)
                    template.add("6", seventh)
                    notes.append("move type 7b arg7 -> arg6")

        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    updated_text = unicode(wikicode)
    if updated_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, updated_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = updated_text
    page.save(comment=comment)
def process_text_on_page(index, pagename, text, verbs):
    """Convert ``{{head|en|verb}}`` on a multiword page into ``{{en-verb}}``.

    Parameters:
      index, pagename: used for log messages; ``pagename`` is also split on
        spaces to obtain the words of the expression.
      text: page text, parsed with ``blib.parse_text``.
      verbs: mapping looked up by the whole ``pagename`` (when
        ``args.mode == "full-conj"``) or by its first word (otherwise); the
        looked-up value becomes (part of) argument 1 of ``{{en-verb}}``.

    Returns ``(new_text, notes)``, or None (bare ``return``) when no entry
    is found in ``verbs``. Outside full-conj mode, ``pagename.split(" ", 1)``
    is unpacked into two names, so a page name without a space raises
    ValueError.

    NOTE(review): the source was whitespace-collapsed; nesting below was
    reconstructed from the control flow and should be checked against the
    original file.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    # Defined but not called anywhere in this function.
    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []

    if args.mode == "full-conj":
        # The conjugation spec is keyed by the full page name.
        if pagename not in verbs:
            pagemsg("WARNING: Couldn't find entry for pagename")
            return
        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
            tn = tname(t)
            origt = unicode(t)
            if tn == "head" and getparam(t, "1") == "en" and getparam(
                    t, "2") == "verb":
                if getparam(t, "3"):
                    pagemsg("WARNING: Already has 3=, not touching: %s" %
                            unicode(t))
                    continue
                blib.set_template_name(t, "en-verb")
                t.add("1", verbs[pagename])
                rmparam(t, "2")
                notes.append(
                    "convert {{head|en|verb}} of multiword expression to {{en-verb}}"
                )
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    else:
        # The conjugation spec is keyed by the first word only.
        first, rest = pagename.split(" ", 1)
        if first not in verbs:
            pagemsg("WARNING: Couldn't find entry for first=%s" % first)
            return
        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
            tn = tname(t)
            origt = unicode(t)
            if tn == "head" and getparam(t, "1") == "en" and getparam(
                    t, "2") == "verb":
                if getparam(t, "3"):
                    pagemsg("WARNING: Already has 3=, not touching: %s" %
                            unicode(t))
                    continue
                blib.set_template_name(t, "en-verb")
                # `done` records that argument 1 (and head=) have been
                # handled by the "plural" branch, so the generic fallback
                # below must not run.
                done = False
                words = pagename.split(" ")
                plural = False
                for word in words:
                    if singularizable(word):
                        plural = True
                        break
                if plural:
                    if verbs[first].startswith("<"):
                        # Spec is an angle-bracket suffix: build a linked
                        # argument 1 and compare its bracket-less form with
                        # any existing head=.
                        restwords = []
                        for word in words[1:]:
                            restwords.append(link(word))
                        param1 = "[[%s]]%s %s" % (first, verbs[first],
                                                  " ".join(restwords))
                        head_from_param = re.sub("<.*?>", "", param1)
                        existing_head = getparam(t, "head")
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            pagemsg("Removing existing head %s" % existing_head)
                            rmparam(t, "head")
                            t.add("1", param1)
                            done = True
                        elif canon_existing_head != existing_head:
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                % (canon_existing_head, head_from_param, origt))
                        elif existing_head:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                        else:
                            t.add("1", param1)
                            done = True
                    else:
                        t.add("1", verbs[first])
                        headwords = []
                        for word in words:
                            if not headwords:  # first word
                                headwords.append("[[" + word + "]]")
                            else:
                                headwords.append(link(word))
                        head_from_param = " ".join(headwords)
                        existing_head = getparam(t, "head")
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            pagemsg("Removing existing head %s" % existing_head)
                            rmparam(t, "head")
                        elif canon_existing_head != existing_head:
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                % (canon_existing_head, head_from_param, origt))
                        elif existing_head:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                        else:
                            t.add("head", head_from_param)
                        # NOTE(review): placed at branch level because
                        # argument 1 was already added at the top of this
                        # branch; the collapsed source would also parse with
                        # this line inside the `else` above -- confirm.
                        done = True
                if not done:
                    # Generic fallback: expected head links every word
                    # except "the".
                    existing_head = getparam(t, "head")
                    if existing_head:
                        head_from_param = " ".join(
                            "[[%s]]" % word if word != "the" else word
                            for word in pagename.split(" "))
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            pagemsg("Removing existing head %s" % existing_head)
                            rmparam(t, "head")
                        elif canon_existing_head != existing_head:
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                % (canon_existing_head, head_from_param, origt))
                        else:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                    if verbs[first].startswith("<"):
                        t.add("1", "%s%s %s" % (first, verbs[first], rest))
                    else:
                        t.add("1", verbs[first])
                rmparam(t, "2")
                notes.append(
                    "convert {{head|en|verb}} of multiword expression to {{en-verb}}"
                )
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return unicode(parsed), notes
def process_page(index, page, direc, delete_bad, verbose):
    """Disabled script: convert ``3olda`` ``{{ru-conj}}`` calls to ``direc``.

    The function currently logs a warning and returns immediately;
    everything after the first ``return`` is unreachable and kept as the
    legacy implementation. That code set argument 1 of each plain ``3olda``
    template to ``direc`` (after removing arguments 4-6) and, when
    ``delete_bad`` was set, looked for past-tense form pages generated by
    the old arguments but not by the new ones, deleting those that passed a
    series of sanity checks.

    NOTE(review): the unreachable code reads ``save`` and ``site``, neither
    of which is a parameter or local of this function -- they would have to
    be module-level names. Nesting was reconstructed from a
    whitespace-collapsed source and should be confirmed.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable legacy implementation ----
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    direc = direc.replace("3oa", u"3°a")
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) in ["ru-conj"]:
            conjtype = getparam(t, "1")
            if not conjtype.startswith("3olda"):
                continue
            if conjtype.startswith("3olda") and conjtype != "3olda":
                pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
                continue
            # Generate the forms produced by the template as it stands.
            tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            oldargs = blib.split_generate_args(result)
            rmparam(t, "6")
            rmparam(t, "5")
            rmparam(t, "4")
            t.add("1", direc)
            # Generate the forms again with the rewritten template.
            tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            if delete_bad:
                newargs = blib.split_generate_args(result)
                for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
                             "past_f_short", "past_n_short", "past_pl_short"]:
                    oldforms = re.split(",", oldargs[form]) if form in oldargs else []
                    newforms = re.split(",", newargs[form]) if form in newargs else []
                    for oldform in oldforms:
                        # Only forms that disappeared are deletion
                        # candidates.
                        if oldform not in newforms:
                            formpagename = rulib.remove_accents(oldform)
                            formpage = pywikibot.Page(site, formpagename)
                            if not formpage.exists():
                                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
                            elif formpagename == pagetitle:
                                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
                            else:
                                # NOTE: rebinds `text`, which above held the
                                # text of the page being processed.
                                text = unicode(formpage.text)
                                if "Etymology 1" in text:
                                    pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                                elif "----" in text:
                                    # (sic) "skippin" is the message as
                                    # emitted at runtime.
                                    pagemsg("WARNING: Multiple languages apparently in form, skippin form %s" % formpagename)
                                else:
                                    numinfls = len(re.findall(r"\{\{inflection of\|", text))
                                    if numinfls < 1:
                                        pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                                    elif numinfls > 1:
                                        pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                                    else:
                                        comment = "Delete erroneously created long form of %s" % pagetitle
                                        pagemsg("Existing text for form %s: [[%s]]" % (
                                            formpagename, text))
                                        if save:
                                            formpage.delete(comment)
                                        else:
                                            pagemsg("Would delete page %s with comment=%s" % (formpagename, comment))
            notes.append("fix 3olda -> %s" % direc)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
    return unicode(parsed), notes
def rewrite_one_page_arz_headword(page, index, text):
    """Rewrite ``{{arz-noun}}`` and ``{{arz-adj}}`` calls to the new layout.

    All recognised parameters are pulled out of the template and re-added
    in a fixed order under their new names (``head`` becomes ``1``; for
    nouns ``g`` becomes ``2``, ``2``/``3`` become ``pl``/``pltr``; for
    adjectives ``pwv``/``p`` become ``pl``, ``ptr`` becomes ``pltr`` and
    ``fwv``/``f`` become ``f``).

    ``page`` and ``index`` are not used. ``text`` must provide
    ``filter_templates()`` and is modified in place. Returns
    ``(text, comment)``, where the comment lists one entry per rewritten
    template.
    """
    def take(template, name):
        # Read a parameter's value, then remove the parameter.
        value = getparam(template, name)
        rmparam(template, name)
        return value

    def take_first(template, preferred, fallback):
        # Read the preferred parameter, falling back to the other when it is
        # empty; both parameters are removed afterwards.
        value = getparam(template, preferred) or getparam(template, fallback)
        rmparam(template, preferred)
        rmparam(template, fallback)
        return value

    def add_if_set(template, pairs):
        for name, value in pairs:
            if value:
                addparam(template, name, value)

    temps_changed = []
    for t in text.filter_templates():
        if unicode(t.name) == "arz-noun":
            head = take(t, "head")
            tr = take(t, "tr")
            sort = take(t, "sort")
            g = take(t, "g")
            g2 = take(t, "g2")
            pl = take(t, "2")
            pltr = take(t, "3")

            # Head and first gender are always written, even when empty.
            addparam(t, "1", head)
            addparam(t, "2", g)
            add_if_set(t, [("g2", g2), ("tr", tr), ("pl", pl),
                           ("pltr", pltr), ("sort", sort)])
            temps_changed.append("arz-noun")
        elif unicode(t.name) == "arz-adj":
            head = take(t, "head")
            tr = take(t, "tr")
            sort = take(t, "sort")
            pl = take_first(t, "pwv", "p")
            pltr = take(t, "ptr")
            f = take_first(t, "fwv", "f")
            ftr = take(t, "ftr")

            addparam(t, "1", head)
            add_if_set(t, [("tr", tr), ("f", f), ("ftr", ftr), ("pl", pl),
                           ("pltr", pltr), ("sort", sort)])
            temps_changed.append("arz-adj")

    return text, "rewrite %s to new style" % ", ".join(temps_changed)
def process_page(page, index, do_fix):
    """Check (and optionally remove) suspicious past passive participles.

    For every ``{{ru-conj}}`` / ``{{ru-conj-old}}`` on the page, the verb
    forms are generated through ``ru-generate-verb-forms`` and each
    participle listed under ``past_pasv_part`` and ``ppp`` is checked:
    ending, first letter against the page title, the ``-нный`` ending
    against the infinitive ending, and finally against ``form_ppp``. Forms
    that fail are logged and, when ``do_fix`` is true, removed from the
    template's ``<base>``, ``<base>2`` ... ``<base>9`` parameters, with the
    remaining values renumbered.

    As visible here, the function only mutates the parsed templates and
    appends to ``notes``; it neither saves the page nor returns a value.

    NOTE(review): ``verbose`` (used in ``expand_text``) is not a parameter
    or local -- presumably a module-level name; confirm. Nesting was
    reconstructed from a whitespace-collapsed source.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"]:
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
                continue
            conjtype = getparam(t, "2")
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
            else:
                tempcall = re.sub(r"\{\{ru-conj-old", "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = blib.split_generate_args(result)
            for base in ["past_pasv_part", "ppp"]:
                forms_to_remove = []
                # NOTE: indexes args[base] directly, so a missing key would
                # raise KeyError.
                if args[base] == "-":
                    continue
                for form in re.split(",", args[base]):
                    origform = form
                    # Drop anything from "//" onwards before checking.
                    form = re.sub("//.*", "", form)
                    fix_form = False
                    if not re.search(ur"([аяеё]́?нный|тый)$", form):
                        pagemsg(
                            "WARNING: Past passive participle doesn't end correctly: %s"
                            % form)
                        fix_form = True
                    unstressed_page = rulib.make_unstressed_ru(pagetitle)
                    unstressed_form = rulib.make_unstressed_ru(form)
                    # `warned` suppresses the rule-based check below once a
                    # wrong-aspect warning has been issued.
                    warned = False
                    if unstressed_form[0] != unstressed_page[0]:
                        pagemsg(
                            "WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s"
                            % form)
                        warned = True
                        fix_form = True
                    if form.endswith(u"нный"):
                        if pagetitle.endswith(u"ать"):
                            good_ending = u"анный"
                        elif pagetitle.endswith(u"ять"):
                            good_ending = u"янный"
                        else:
                            good_ending = u"енный"
                        if not unstressed_form.endswith(good_ending):
                            pagemsg(
                                "WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s"
                                % form)
                            warned = True
                            fix_form = True
                    if not warned:
                        correct_form = form_ppp(conjtype, pagetitle, args)
                        if correct_form and unstressed_form != correct_form:
                            pagemsg(
                                "WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s"
                                % (unstressed_form, correct_form))
                            fix_form = True
                    if fix_form:
                        forms_to_remove.append(origform)
                if forms_to_remove and do_fix:
                    # Collect the current non-empty values of
                    # base, base2, ..., base9.
                    curvals = []
                    for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                        val = getparam(t, base + i)
                        if val:
                            curvals.append(val)
                    newvals = [x for x in curvals if x not in forms_to_remove]
                    if len(curvals) - len(newvals) != len(forms_to_remove):
                        pagemsg(
                            "WARNING: Something wrong, couldn't remove all PPP forms %s"
                            % ",".join(forms_to_remove))
                    curindex = 1
                    origt = unicode(t)
                    # Write the surviving values back contiguously ...
                    for newval in newvals:
                        t.add(base + ("" if curindex == 1 else str(curindex)), newval)
                        curindex += 1
                    # ... and remove the now-unused higher-numbered slots.
                    for i in xrange(curindex, 10):
                        rmparam(t, base + ("" if i == 1 else str(i)))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                    notes.append("removed bad past pasv part(s) %s"
                                 % ",".join(forms_to_remove))
def process_page(page, index, parsed):
    """Replace ``{{ru-decl-noun-see}}`` with a full ``{{ru-noun-table}}``.

    The page must contain exactly one headword template (``ru-noun+`` or
    ``ru-proper noun+``) and exactly one ``ru-decl-noun-see``. The latter's
    parameters are replaced with a copy of the headword's parameters and it
    is renamed to ``ru-noun-table``; for proper nouns the ``n`` parameter is
    adjusted so both templates resolve to the same number.

    The ``parsed`` argument is ignored: the page is re-parsed here.
    Returns ``(new_text, comment)`` on success and None when the page is
    skipped or template expansion fails.
    """
    global args
    verbose = args.verbose
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    parsed = blib.parse(page)

    headword_template = None
    see_template = None
    for candidate in parsed.filter_templates():
        candidate_name = unicode(candidate.name)
        if candidate_name in ("ru-noun+", "ru-proper noun+"):
            if headword_template:
                pagemsg("WARNING: Multiple headword templates, skipping")
                return
            headword_template = candidate
        if candidate_name == "ru-decl-noun-see":
            if see_template:
                pagemsg(
                    "WARNING: Multiple ru-decl-noun-see templates, skipping")
                return
            see_template = candidate

    if not headword_template:
        pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
        return
    if not see_template:
        pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
        return

    # Make the declension template a copy of the headword's parameters.
    del see_template.params[:]
    for headword_param in headword_template.params:
        see_template.add(headword_param.name, headword_param.value)
    see_template.name = "ru-noun-table"

    if unicode(headword_template.name) == "ru-proper noun+":
        # Proper nouns default to n=sg while ru-noun-table defaults to
        # n=both, so expand the templates to learn the effective value of n
        # and set it explicitly on ru-noun-table where the defaults differ.

        # Step 1: expand the headword with |ndef=sg, mirroring the
        # proper-noun default (ru-generate-noun-args would otherwise default
        # to both).
        headword_call = re.sub(r"^\{\{ru-proper noun\+",
            "{{ru-generate-noun-args", unicode(headword_template))
        headword_call = re.sub(r"\}\}$", "|ndef=sg}}", headword_call)
        headword_expansion = expand_text(headword_call)
        if not headword_expansion:
            pagemsg("WARNING: Error generating ru-proper noun+ args")
            return None

        # Step 2: read the effective number.
        headword_n = blib.split_generate_args(headword_expansion)["n"]

        if headword_n == "s":
            # Step 3: singular must always be spelled out on ru-noun-table.
            see_template.add("n", "sg")
        elif headword_n != "p":
            # Step 4 (value "p") needs no action: both templates default to
            # plural only when the lemma is plural, and otherwise n=pl is
            # already present on both.
            #
            # Step 5: "both" must have been explicit on the headword but is
            # normally the ru-noun-table default. Drop n=, re-expand, and
            # restore n=both only if the default turns out to be different.
            assert headword_n == "b"
            rmparam(see_template, "n")
            table_call = re.sub(r"^\{\{ru-noun-table",
                "{{ru-generate-noun-args", unicode(see_template))
            table_expansion = expand_text(table_call)
            if not table_expansion:
                pagemsg("WARNING: Error generating ru-noun-table args")
                return None
            if blib.split_generate_args(table_expansion)["n"] != "b":
                see_template.add("n", "both")

    comment = ("Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)"
        % unicode(headword_template.name))
    return unicode(parsed), comment
def process_page(index, page, direc, save, verbose): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("WARNING: Script no longer applies and would need fixing up") return pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] origdirec = direc for t in parsed.filter_templates(): origt = unicode(t) direc = origdirec if unicode(t.name) in ["ru-conj-7b"]: rmparam(t, "past_m") rmparam(t, "past_f") rmparam(t, "past_n") rmparam(t, "past_pl") rmparam(t, "notes") rmparam(t, "past_adv_part") rmparam(t, "past_adv_part2") rmparam(t, "past_adv_part_short") #ppps = blib.fetch_param_chain(t, "past_pasv_part", "past_pasv_part") #blib.remove_param_chain(t, "past_pasv_part", "past_pasv_part") presstem = getparam(t, "3") rmparam(t, "5") rmparam(t, "4") rmparam(t, "3") npp = "npp" in direc direc = direc.replace("npp", "") yo = u"ё" in direc direc = direc.replace(u"ё", "") direc = re.sub("7b/?", "", direc) if re.search(u"е́?[^аэыоуяеиёю]*$", presstem): if not yo: pagemsg(u"Something wrong, е-stem present and no ё directive") if npp: presstem = ru.make_ending_stressed(presstem) else: presstem = re.sub(u"е́?([^аэыоуяеиёю]*)$", ur"ё\1", presstem) else: presstem = ru.make_ending_stressed(presstem) pap = getparam(t, "past_actv_part") pred_pap = presstem + u"ший" if direc not in ["b", "b(9)"] and re.search(u"[дт]$", presstem): pred_pap = re.sub(u"[дт]$", "", presstem) + u"вший" if pap: if pap == pred_pap: pagemsg("Removing past_actv_part=%s because same as predicted" % pap) rmparam(t, "past_actv_part") else: pagemsg("Not removing unpredictable past_actv_part=%s (predicted %s)" % (pap, pred_pap)) for param in t.params: if not re.search("^([0-9]+$|past_pasv_part)", unicode(param.name)): pagemsg("Found additional named param %s" % unicode(param)) t.add("3", presstem) if direc: t.add("4", "") t.add("5", direc) blib.sort_params(t) #blib.set_param_chain(t, ppps, "past_pasv_part", "past_pasv_part") 
notes.append("set class-7b verb to directive %s%s" % (direc, npp and u" (no ё in present stem)" or "")) newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) new_text = unicode(parsed) if new_text != text: if verbose: pagemsg("Replacing <%s> with <%s>" % (text, new_text)) assert notes comment = "; ".join(notes) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment)
def process_text_on_page(index, pagetitle, text):
    """Convert legacy Polish adjective declension templates to {{pl-decl-adj-auto}}.

    Handles {{pl-decl-adj-ki}}, {{pl-decl-adj-y}} / {{pl-adj-y}},
    {{pl-decl-adj-i}} and {{pl-decl-adj-owy}}.  The stem in 1= is replaced by
    the full lemma (stem plus the ending implied by the old template name or,
    for -i, by 2=), and the template is renamed.  For -ki, a non-empty 2= is
    carried over as olddat=.

    A warning is logged when the reconstructed lemma does not equal the page
    title.  Titles containing a colon are outside the main namespace and can
    never equal the lemma, so the comparison is made only for mainspace
    titles.  (The original guard was inverted and warned only on
    non-mainspace pages.)

    Returns ``(new text, notes)`` where ``notes`` lists one entry per
    converted template.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    check_title = ":" not in pagetitle

    notes = []
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)

        def getp(param):
            return getparam(t, param)

        if tn == "pl-decl-adj-ki":
            param1 = getp("1")
            param2 = getp("2")
            blib.set_template_name(t, "pl-decl-adj-auto")
            rmparam(t, "2")
            rmparam(t, "1")
            if check_title and pagetitle != param1 + "ki":
                pagemsg(
                    "WARNING: Param 1=%s doesn't agree with pagetitle: %s" %
                    (param1, origt))
            t.add("1", param1 + "ki")
            if param2:
                t.add("olddat", param2)
            notes.append("Convert {{pl-decl-adj-ki}} to {{pl-decl-adj-auto}}")
        elif tn in ["pl-decl-adj-y", "pl-adj-y"]:
            if getp("head"):
                # An explicit head= cannot be expressed in the new template.
                pagemsg("WARNING: Saw head=, not changing: %s" % origt)
            else:
                param1 = getp("1")
                blib.set_template_name(t, "pl-decl-adj-auto")
                rmparam(t, "2")
                rmparam(t, "1")
                if check_title and pagetitle != param1 + "y":
                    pagemsg(
                        "WARNING: Param 1=%s doesn't agree with pagetitle: %s"
                        % (param1, origt))
                t.add("1", param1 + "y")
                notes.append("Convert {{%s}} to {{pl-decl-adj-auto}}" % tn)
        elif tn == "pl-decl-adj-i":
            param1 = getp("1")
            param2 = getp("2")
            blib.set_template_name(t, "pl-decl-adj-auto")
            rmparam(t, "2")
            rmparam(t, "1")
            if param1:
                # 2= selects the consonant that precedes the final -i.
                if param2 in ["g", "gi"]:
                    should_pagetitle = param1 + "gi"
                elif param2 in ["l", "li"]:
                    should_pagetitle = param1 + "li"
                else:
                    should_pagetitle = param1 + "i"
                if check_title and pagetitle != should_pagetitle:
                    pagemsg(
                        "WARNING: Param 1=%s doesn't agree with pagetitle (pagetitle should be %s): %s"
                        % (param1, should_pagetitle, origt))
                t.add("1", should_pagetitle)
            notes.append("Convert {{pl-decl-adj-i}} to {{pl-decl-adj-auto}}")
        elif tn == "pl-decl-adj-owy":
            param1 = getp("1")
            blib.set_template_name(t, "pl-decl-adj-auto")
            rmparam(t, "2")
            rmparam(t, "1")
            if check_title and pagetitle != param1 + "owy":
                pagemsg(
                    "WARNING: Param 1=%s doesn't agree with pagetitle: %s" %
                    (param1, origt))
            t.add("1", param1 + "owy")
            notes.append("Convert {{pl-decl-adj-owy}} to {{pl-decl-adj-auto}}")
    return unicode(parsed), notes
def process_page(index, page, direc): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("WARNING: Script no longer applies and would need fixing up") return pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] origdirec = direc for t in parsed.filter_templates(): origt = unicode(t) direc = origdirec if unicode(t.name) in ["ru-conj-7b"]: rmparam(t, "past_m") rmparam(t, "past_f") rmparam(t, "past_n") rmparam(t, "past_pl") rmparam(t, "notes") rmparam(t, "past_adv_part") rmparam(t, "past_adv_part2") rmparam(t, "past_adv_part_short") #ppps = blib.fetch_param_chain(t, "past_pasv_part", "past_pasv_part") #blib.remove_param_chain(t, "past_pasv_part", "past_pasv_part") presstem = getparam(t, "3") rmparam(t, "5") rmparam(t, "4") rmparam(t, "3") npp = "npp" in direc direc = direc.replace("npp", "") yo = u"ё" in direc direc = direc.replace(u"ё", "") direc = re.sub("7b/?", "", direc) if re.search(u"е́?[^аэыоуяеиёю]*$", presstem): if not yo: pagemsg(u"Something wrong, е-stem present and no ё directive") if npp: presstem = rulib.make_ending_stressed_ru(presstem) else: presstem = re.sub(u"е́?([^аэыоуяеиёю]*)$", ur"ё\1", presstem) else: presstem = rulib.make_ending_stressed_ru(presstem) pap = getparam(t, "past_actv_part") pred_pap = presstem + u"ший" if direc not in ["b", "b(9)"] and re.search(u"[дт]$", presstem): pred_pap = re.sub(u"[дт]$", "", presstem) + u"вший" if pap: if pap == pred_pap: pagemsg("Removing past_actv_part=%s because same as predicted" % pap) rmparam(t, "past_actv_part") else: pagemsg("Not removing unpredictable past_actv_part=%s (predicted %s)" % (pap, pred_pap)) for param in t.params: if not re.search("^([0-9]+$|past_pasv_part)", unicode(param.name)): pagemsg("Found additional named param %s" % unicode(param)) t.add("3", presstem) if direc: t.add("4", "") t.add("5", direc) blib.sort_params(t) #blib.set_param_chain(t, ppps, "past_pasv_part", "past_pasv_part") 
notes.append("set class-7b verb to directive %s%s" % (direc, npp and u" (no ё in present stem)" or "")) newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) return unicode(parsed), notes
def process_page(page, index, parsed):
    """Convert legacy Latin participle headword templates to {{la-part}}.

    Handles {{la-future participle}}, {{la-perfect participle}},
    {{la-gerundive}} (lemma = base + "us") and {{la-present participle}}
    (lemma built from the base in 1= and the ending in 2=).  A template is
    left untouched, with a warning, when a required parameter is empty, the
    ending is not recognized, or it carries parameters other than 1= (and 2=
    where that is permitted).

    Returns ``(new page text, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    pagemsg("Processing")
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        allow_2 = False
        lemma = None

        if tn == "la-present participle":
            base = getparam(t, "1")
            ending = getparam(t, "2")
            if not base:
                pagemsg("WARNING: Empty param 1: %s" % origt)
                continue
            if not ending:
                pagemsg("WARNING: Empty param 2: %s" % origt)
                continue
            if ending == "ans":
                lemma = base + u"āns"
            elif ending == "ens":
                lemma = base + u"ēns"
            elif ending == "iens":
                lemma = u"%siēns/%seunt" % (base, base)
            else:
                pagemsg("WARNING: Unrecognized param 2: %s" % origt)
                continue
            allow_2 = True
        elif tn in ["la-future participle", "la-perfect participle",
                    "la-gerundive"]:
            base = getparam(t, "1")
            if tn == "la-gerundive":
                # A gerundive may repeat its base in 2=; that is accepted only
                # when it equals 1= once macrons are ignored, and 2= then wins.
                param2 = getparam(t, "2")
                if param2 and (lalib.remove_macrons(base) !=
                               lalib.remove_macrons(param2)):
                    pagemsg("WARNING: Unrecognized param 2: %s" % origt)
                    continue
                if param2:
                    allow_2 = True
                    base = param2
            if not base:
                pagemsg("WARNING: Empty param 1: %s" % origt)
                continue
            lemma = base + "us"

        if not lemma:
            continue

        # Refuse to convert when any unexpected parameter is present.
        permitted = ["1", "2"] if allow_2 else ["1"]
        unexpected = [p for p in t.params
                      if unicode(p.name).strip() not in permitted]
        for p in unexpected:
            pagemsg("WARNING: Unrecognized param %s=%s: %s" % (
                unicode(p.name), p.value, origt))
        if unexpected:
            continue

        rmparam(t, "2")
        t.add("1", lemma)
        blib.set_template_name(t, "la-part")
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append(u"convert {{%s}} to {{la-part}}" % tn)
    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Replace the Brazil-specific place templates with generic {{place}}.

    Four template families are handled (municipality, state, state capital
    and national capital, each under two alternative names).  The old 2=
    value, when present, is re-added as t= on the new template.

    Returns ``(new page text, notes)`` with one note per changed template.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)

        def add_translation(value):
            # The former 2= (read before being overwritten) becomes t=.
            if value:
                t.add("t", value)

        if tn in ("place:Brazil/municipality", "place:municipality of Brazil"):
            state = getparam(t, "state")
            trans = getparam(t, "2")
            blib.set_template_name(t, "place")
            rmparam(t, "state")
            t.add("2", "municipality")
            t.add("3", "s/%s" % state)
            t.add("4", "c/Brazil")
            add_translation(trans)
        elif tn in ("place:Brazil/state", "place:state of Brazil"):
            region = getparam(t, "region")
            capital = getparam(t, "capital")
            trans = getparam(t, "2")
            blib.set_template_name(t, "place")
            rmparam(t, "region")
            rmparam(t, "capital")
            t.add("2", "state")
            t.add("3", "r/%s" % region)
            t.add("4", "c/Brazil")
            t.add("capital", capital)
            add_translation(trans)
        elif tn in ("place:Brazil/state capital",
                    "place:state capital of Brazil"):
            state = getparam(t, "state")
            trans = getparam(t, "2")
            blib.set_template_name(t, "place")
            rmparam(t, "state")
            t.add("2", "municipality/state capital")
            t.add("3", "s/%s" % state)
            t.add("4", "c/Brazil")
            add_translation(trans)
        elif tn in ("place:Brazil/capital", "place:capital of Brazil"):
            trans = getparam(t, "2")
            blib.set_template_name(t, "place")
            t.add("2", "municipality/capital city")
            t.add("3", "c/Brazil")
            t.add("4", ";")
            t.add("5", "state capital")
            t.add("6", "s/Distrito Federal")
            t.add("7", "c/Brazil")
            add_translation(trans)

        newt = unicode(t)
        if newt != origt:
            notes.append("replace {{%s}} with {{place}}" % tn)
            pagemsg("Replaced %s with %s" % (origt, newt))
    return unicode(parsed), notes
def process_text_on_page(pagetitle, index, text):
    """Convert Luxembourgish non-lemma adjective templates to {{inflection of}}.

    Only subsections headed Adjective, Numeral, Ordinal Numeral or Participle
    are examined.  For each candidate template the lemma is read from the
    template, the page title is matched against lemma variants plus the known
    endings in ``positive_ending_tags``, and on a unique match the template is
    rewritten as {{inflection of|lb|<lemma>||<tags...>}} (keeping tr=).

    Returns ``(new text, notes)``.  If a matched template has parameters
    beyond the language, lemma and tr=, ``(None, None)`` is returned and the
    whole page is abandoned.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    # Odd indices hold section headers, even indices the section bodies.
    subsections = re.split("(^==+[^=\n]+==+\n)", text, 0, re.M)
    for j in xrange(2, len(subsections), 2):
        if not re.search("==(Adjective|Numeral|Ordinal Numeral|Participle)==",
                         subsections[j - 1]):
            continue
        parsed = blib.parse_text(subsections[j])
        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)

            # Work out which params hold the language code and the lemma.
            if tn in rename_templates_without_lang:
                lemma = getparam(t, "1")
                langparam = None
                lemmaparam = "1"
            elif (tn in rename_templates_with_lang and t.has("lang")
                  and getparam(t, "lang") == "lb"):
                lemma = getparam(t, "1")
                langparam = "lang"
                lemmaparam = "1"
            elif (tn in rename_templates_with_lang and not t.has("lang")
                  and getparam(t, "1") == "lb"):
                lemma = getparam(t, "2")
                langparam = "1"
                lemmaparam = "2"
            else:
                continue

            lemmas_to_try = [lemma]
            if lemma.endswith("e"):
                # lemma with a schwa
                lemmas_to_try.append(lemma[:-1])
            if lemma == "gutt":
                lemmas_to_try.append("gudd")

            ending_sets_to_try = [positive_ending_tags]
            endings_to_try = []
            for ending_sets in ending_sets_to_try:
                for ending, tag_sets in ending_sets.iteritems():
                    if pagetitle.endswith(ending):
                        endings_to_try.append((ending, tag_sets))
            if not endings_to_try:
                pagemsg(
                    "WARNING: Can't identify ending of non-lemma form, skipping"
                )
                continue

            found_combinations = []
            for ending_to_try, tag_sets in endings_to_try:
                for lemma_to_try in lemmas_to_try:
                    if lemma_to_try + ending_to_try == pagetitle:
                        found_combinations.append(
                            (lemma_to_try, ending_to_try, tag_sets))
            if not found_combinations:
                pagemsg(
                    "WARNING: Can't match lemma %s with page title (tried lemma variants %s and endings %s), skipping"
                    % (lemma, "/".join(lemmas_to_try), "/".join(
                        ending_to_try
                        for ending_to_try, tag_sets in endings_to_try)))
                continue
            if len(found_combinations) > 1:
                # Report each matching lemma+ending pair.  (The original
                # formatted the full candidate lists here instead of the
                # per-combination values.)
                pagemsg(
                    "WARNING: Found multiple possible matching endings for lemma %s (found possibilities %s), skipping"
                    % (lemma, "/".join(
                        "%s+%s" % (lemma_to_try, ending_to_try)
                        for lemma_to_try, ending_to_try, tag_sets in
                        found_combinations)))
                continue
            lemma_to_try, ending_to_try, tag_sets = found_combinations[0]

            # Erase all params.
            if langparam:
                rmparam(t, langparam)
            elif getparam(t, "lang") == "lb":
                # Sometimes |lang=lb redundantly occurs; remove it if so
                rmparam(t, "lang")
            rmparam(t, lemmaparam)
            tr = getparam(t, "tr")
            rmparam(t, "tr")
            if len(t.params) > 0:
                pagemsg(
                    "WARNING: Original template %s has extra params, skipping"
                    % origt)
                return None, None

            # Set new name
            blib.set_template_name(t, "inflection of")
            # Put back new params.
            t.add("1", "lb")
            t.add("2", lemma)
            if tr:
                t.add("tr", tr)
            t.add("3", "")
            # Tag sets are joined with ";" and then split back into individual
            # tags, each of which gets its own numbered param from 4= onwards.
            nextparam = 4
            for tag in "|;|".join(tag_sets).split("|"):
                t.add(str(nextparam), tag)
                nextparam += 1
            notes.append("replace %s with %s" % (origt, unicode(t)))
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
        subsections[j] = unicode(parsed)

    text = "".join(subsections)
    return text, notes
def process_text_on_page(index, pagetitle, text):
    """Clean up Hindi headword translits and cross-check them against {{hi-ndecl}}.

    For every template in ``hindi_head_templates``:
      * tr=/tr2=... values are canonicalized with ``canonicalize_tr``;
      * a sole tr= equal to the automatic transliteration of the head is
        removed as redundant;
      * remaining (non-redundant) translits are remembered so that the next
        {{hi-ndecl}} can be checked for a matching ``//respelling``.

    Discrepancies are only logged; the text changes are limited to removing
    or canonicalizing tr= parameters.

    Returns ``(new text, notes)``.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []
    pagemsg("Processing")
    parsed = blib.parse_text(text)

    # State carried from the most recent headword template to later templates.
    head_template_tr = None
    head_auto_tr = None
    noun_head_template = None
    saw_ndecl = False

    def warn_if_missing_declension():
        # A noun headword with a manual translit should have been followed by
        # a declension template carrying the phonetic respelling.
        if noun_head_template and head_template_tr and not saw_ndecl:
            pagemsg(
                "WARNING: Missing declension for noun needing phonetic respelling, headtr=%s, autotr=%s: %s"
                % (",".join(head_template_tr), ",".join(head_auto_tr),
                   unicode(noun_head_template)))

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in hindi_head_templates:
            warn_if_missing_declension()
            if tn in ["hi-noun", "hi-proper noun"]:
                noun_head_template = t
            else:
                noun_head_template = None
            saw_ndecl = False
            head_template_tr = []
            head_auto_tr = []

            multi_trs = False
            for i in range(2, 10):
                if getparam(t, "tr%s" % i):
                    multi_trs = True
                    # We might have tr=some special translit and tr2=the
                    # default one, and in that case we don't want to remove
                    # tr2= even though it appears redundant.
                    pagemsg(
                        "More than one translit, not removing any redundant ones: %s"
                        % unicode(t))
                    break

            for i in range(1, 10):
                trparam = "tr" if i == 1 else "tr%s" % i
                origtr = getparam(t, trparam)
                tr = canonicalize_tr(origtr)
                if not tr:
                    continue
                headparam = "head" if i == 1 else "head%s" % i
                head = getparam(t, headparam)
                if head:
                    head = blib.remove_links(head)
                else:
                    head = pagetitle
                autotr = expand_text("{{xlit|hi|%s}}" % head)
                if autotr is None:
                    continue
                if autotr == tr and not multi_trs:
                    # With no tr2..tr9 present, only tr= itself can get here.
                    assert i == 1
                    pagemsg(
                        "WARNING: Removing redundant translit tr=%s for head %s"
                        % (tr, head))
                    rmparam(t, "tr")
                    notes.append("remove redundant tr=%s from {{%s}}" %
                                 (tr, tn))
                else:
                    head_template_tr.append(tr)
                    head_auto_tr.append(autotr)
                    pagemsg(
                        "Page has non-redundant translit %s=%s vs. auto tr=%s in {{%s}}"
                        % (trparam, tr, autotr, tn))
                    if origtr != tr:
                        # Log the canonical translit as the target.  (The
                        # original interpolated the template name here.)
                        pagemsg("Canonicalizing %s=%s to %s: %s" %
                                (trparam, origtr, tr, unicode(t)))
                        t.add(trparam, tr)
                        notes.append(
                            "canonicalize %s=%s to %s in {{%s}}" %
                            (trparam, origtr, tr, tn))
            if unicode(t) != origt:
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

        if tn == "hi-ndecl":
            saw_ndecl = True
            decl = getparam(t, "1")
            # Phonetic respellings follow "//" inside the declension spec.
            phon_respellings = re.findall("//([^<>, -]*)", decl)
            if head_template_tr is None:
                pagemsg("WARNING: Saw {{hi-ndecl}} before any headwords: %s" %
                        unicode(t))
            else:
                respelling_tr = [
                    expand_text("{{xlit|hi|%s}}" % x) for x in phon_respellings
                ]
                if None in respelling_tr:
                    pagemsg(
                        "WARNING: Error during phonetic respelling translit, skipping"
                    )
                    continue
                respelling_tr = [x.replace(".", "") for x in respelling_tr]
                for phon_respelling in phon_respellings:
                    if u"॰" in phon_respelling:
                        pagemsg(u"WARNING: Saw ॰ in phon_respelling %s in %s" %
                                (phon_respelling, unicode(t)))
                if head_template_tr and not phon_respellings:
                    pagemsg(
                        "WARNING: Missing phonetic respelling in %s, headtr=%s, autotr=%s"
                        % (unicode(t), ",".join(head_template_tr),
                           ",".join(head_auto_tr)))
                elif phon_respellings and not head_template_tr:
                    pagemsg(
                        "WARNING: Extra phonetic respelling %s (translit %s) in %s, no head tr"
                        % (",".join(phon_respellings),
                           ",".join(respelling_tr), unicode(t)))
                elif set(respelling_tr) != set(head_template_tr):
                    pagemsg(
                        "WARNING: Phonetic respelling %s (translit %s) in %s differs from head translit %s, auto translit %s"
                        % (",".join(phon_respellings),
                           ",".join(respelling_tr), unicode(t),
                           ",".join(head_template_tr),
                           ",".join(head_auto_tr)))
                elif phon_respellings:
                    pagemsg(
                        "Phonetic respelling %s (translit %s) in %s agrees with head translit %s, auto translit %s"
                        % (",".join(phon_respellings),
                           ",".join(respelling_tr), unicode(t),
                           ",".join(head_template_tr),
                           ",".join(head_auto_tr)))

    # The last headword on the page has no following headword to trigger the
    # check, so run it once more at the end.
    warn_if_missing_declension()
    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Convert {{autocat}} and the affix category templates to {{auto cat}}.

    {{autocat}} is simply renamed.  {{prefix cat}}, {{suffix cat}},
    {{circumfix cat}}, {{infix cat}} and {{interfix cat}} are converted only
    when everything they specify manually (language, affix type, term, id,
    pos) agrees with what can be parsed out of the category page title; any
    disagreement is logged as a warning and the template is left alone.
    After conversion only alt=, tr=, sort= and sc= are re-added.

    Returns ``(new text, notes)``.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []
    pagemsg("Processing")
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "autocat":
            blib.set_template_name(t, "auto cat")
            notes.append("{{autocat}} -> {{auto cat}}")
        elif tn in [
                "prefix cat", "suffix cat", "circumfix cat", "infix cat",
                "interfix cat"
        ]:
            # Derive language name, part of speech, affix type and term (with
            # an optional parenthesized ID) from the category title.
            m = re.search("^Category:(.*) ([a-z]+) ([a-z]+fix)ed with (.*)$",
                          pagetitle)
            if not m:
                pagemsg("WARNING: Can't parse page title")
                continue
            langname, pos, affixtype, term_and_id = m.groups()
            m = re.search(r"^(.*?) \((.*)\)$", term_and_id)
            if m:
                term, id = m.groups()
            else:
                term, id = term_and_id, ""
            # Manually specified values, to be checked against the title.
            t_lang = getparam(t, "1")
            t_term = getparam(t, "2")
            t_alt = getparam(t, "3")
            t_pos = getparam(t, "pos")
            t_id = getparam(t, "id")
            t_tr = getparam(t, "tr")
            t_sort = getparam(t, "sort")
            t_sc = getparam(t, "sc")
            if langname not in blib.languages_byCanonicalName:
                pagemsg("WARNING: Unrecognized language name: %s" % langname)
                continue
            if blib.languages_byCanonicalName[langname]["code"] != t_lang:
                pagemsg(
                    "WARNING: Auto-determined code %s for language name %s != manually specified %s"
                    % (blib.languages_byCanonicalName[langname]["code"],
                       langname, t_lang))
                continue
            # The template name minus its trailing " cat" is the affix type.
            if tn[:-4] != affixtype:
                pagemsg(
                    "WARNING: Auto-determined affix type %s != manually specified %s"
                    % (affixtype, tn[:-4]))
                continue

            def add_missing_hyphens(alt):
                # Return `alt` with the leading/trailing hyphen of the
                # title-derived `term` added when `alt` lacks it.  A leading
                # "*" on either string is set aside before comparing.
                hyph_c = "([" + possible_hyphens + "])"
                m = re.search(r"^(\*)(.*)$", alt)
                if m:
                    althyp, altbase = m.groups()
                else:
                    althyp, altbase = "", alt
                m = re.search(r"^(\*)(.*)$", term)
                if m:
                    termhyp, termbase = m.groups()
                else:
                    termhyp, termbase = "", term
                if affixtype == "suffix":
                    m = re.search("^" + hyph_c, termbase)
                    if m:
                        initial_hyphen = m.group(1)
                        if not altbase.startswith(initial_hyphen):
                            alt = althyp + initial_hyphen + altbase
                elif affixtype == "prefix":
                    m = re.search(hyph_c + "$", termbase)
                    if m:
                        final_hyphen = m.group(1)
                        if not altbase.endswith(final_hyphen):
                            alt = althyp + altbase + final_hyphen
                elif affixtype in ["infix", "interfix"]:
                    m = re.search("^" + hyph_c + ".*" + hyph_c + "$", termbase)
                    if m:
                        initial_hyphen, final_hyphen = m.groups()
                        if not altbase.startswith(initial_hyphen):
                            altbase = initial_hyphen + altbase
                        if not altbase.endswith(final_hyphen):
                            altbase = altbase + final_hyphen
                        alt = althyp + altbase
                return alt

            orig_t_term = t_term
            t_term = add_missing_hyphens(t_term)
            already_checked_t_alt = False
            if t_term != term:
                # The manual term differs from the title; accept it only if
                # its entry name equals the title term, and then treat it as
                # the display form (alt).
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, t_term))
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified term %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_term, orig_t_term, manual_entry_name, term))
                    continue
                if t_alt:
                    pagemsg(
                        "WARNING: Manually specified term %s has extra diacritics and alt=%s also specified, skipping"
                        % (t_term, t_alt))
                    continue
                t_alt = t_term
                already_checked_t_alt = True
            if t_id != id:
                pagemsg(
                    "WARNING: Auto-determined ID %s != manually specified %s"
                    % (id, t_id))
                continue
            # Title pos "words" matches a blank/word/words manual pos; any
            # other title pos must equal the manual pos or its -s/-es plural.
            if (pos == "words" and t_pos not in ["", "word", "words"]
                    or pos != "words" and t_pos != pos and t_pos + "s" != pos
                    and (not t_pos.endswith("x") or t_pos + "es" != pos)):
                pagemsg(
                    "WARNING: Auto-determined pos %s doesn't match manually specified %s"
                    % (pos, t_pos))
                continue
            if t_alt and not already_checked_t_alt:
                orig_t_alt = t_alt
                t_alt = add_missing_hyphens(t_alt)
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, t_alt))
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified alt %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_alt, orig_t_alt, manual_entry_name, term))
                    continue
            if t_sort:
                # Keep sort= only if it yields a different sort key from the
                # one computed from the title term.
                auto_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, term))
                autosort = expand_text(
                    "{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}"
                    % (t_lang, auto_entry_name))
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, add_missing_hyphens(t_sort)))
                manual_sort = expand_text(
                    "{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}"
                    % (t_lang, manual_entry_name))
                if manual_sort != autosort:
                    pagemsg(
                        "Keeping sort key %s because canonicalized sort key %s based on it not same as canonicalized sort key %s based on term %s"
                        % (t_sort, manual_sort, autosort, term))
                else:
                    pagemsg(
                        "Discarding sort key %s because canonicalized sort key %s based on it same as canonicalized sort key based on term %s"
                        % (t_sort, manual_sort, term))
                    t_sort = ""
            # Bail out if the template has any param not handled above.
            must_continue = False
            all_existing_params = [
                "1", "2", "3", "tr", "pos", "id", "tr", "sc", "sort"
            ]
            for param in t.params:
                pn = pname(param)
                if pn not in all_existing_params:
                    pagemsg(
                        "WARNING: Unrecognized param %s=%s in affix cat: %s" %
                        (pn, unicode(param.value), unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue
            # Strip everything, rename, then re-add only the surviving params.
            for param in all_existing_params:
                rmparam(t, param)
            blib.set_template_name(t, "auto cat")
            if t_alt:
                if t_alt == term:
                    pagemsg(
                        "Not adding alt=%s because it's the same as the term"
                        % t_alt)
                else:
                    t.add("alt", t_alt)
            if t_tr:
                t.add("tr", t_tr)
            if t_sort:
                t.add("sort", t_sort)
            if t_sc:
                t.add("sc", t_sc)
            notes.append("convert {{%s}} to {{auto cat}}" % tn)
        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Normalize the translit argument of Sanskrit noun declension templates.

    Each {{sa-decl-noun-*}} template is paired with the most recent
    {{sa-noun}} headword.  The headword's tr= (or, failing that, the
    automatic transliteration of the page title) supplies the translit:

      * {{sa-decl-noun-m/f/n}}: a missing or Devanagari 1= is replaced by the
        translit; an unaccented 1= is replaced when the headword translit is
        accented; hyphens and surrounding spaces are removed from 1=.
      * templates listed in ``old_template_to_gender`` are converted to
        {{sa-decl-noun-<gender>}} with the translit as their only parameter.

    Gender disagreements and other oddities are logged and the template is
    skipped.

    Returns ``(new text, notes)``, or None when the page has no relevant
    templates or is outside the main namespace.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []
    if "sa-noun" not in text and "sa-decl-noun" not in text:
        return
    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return
    pagemsg("Processing")
    parsed = blib.parse_text(text)

    headt = None
    saw_decl = False
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "sa-noun":
            pagemsg("Saw headt=%s" % unicode(t))
            if headt and not saw_decl:
                pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" %
                        (unicode(headt), unicode(t)))
            headt = t
            saw_decl = False
            continue
        if tn in ["sa-decl-noun", "sa-decl"]:
            pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" %
                    (tn, unicode(t), headt and unicode(headt) or None))
            continue
        if tn.startswith("sa-decl-noun-"):
            pagemsg("Saw declt=%s" % unicode(t))
            if not headt:
                pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
                continue
            saw_decl = True

            # Determine the translit to use and whether it carries an accent.
            tr = getparam(headt, "tr")
            accented_tr = False
            if not tr:
                tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
                pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" %
                        (unicode(headt), tr, unicode(t)))
            else:
                if "-" in tr:
                    pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" %
                            (tr, unicode(headt), unicode(t)))
                    tr = tr.replace("-", "")
                # Decompose so accents are separate code points, but fold
                # s + acute back into the letter ś so it doesn't count.
                decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
                if AC not in decomptr and GR not in decomptr:
                    pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" %
                            (tr, unicode(headt), unicode(t)))
                else:
                    accented_tr = True

            # Headword genders, with "-p"/"bysense" stripped and combined
            # codes such as "mf" split into their components.
            genders = blib.fetch_param_chain(headt, "g")
            genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
            genders = [g for gs in genders for g in (
                ["m", "f"] if gs in ["mf", "fm"] else
                ["m", "n"] if gs in ["mn", "nm"] else
                [gs]
            )]

            if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
                tg = tn[-1]
                if tg not in genders:
                    pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
                        tg, ",".join(genders), unicode(headt), unicode(t)))
                    continue
                decltr = getparam(t, "1")
                if not decltr:
                    if not accented_tr:
                        pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" %
                                (tn, tr, unicode(headt), unicode(t)))
                        t.add("1", tr)
                        notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
                    else:
                        pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" %
                                (tn, tr, unicode(headt), unicode(t)))
                        t.add("1", tr)
                        notes.append("add accented translit %s to {{%s}}" % (tr, tn))
                elif re.search(u"[\u0900-\u097F]", decltr):
                    # translit is actually Devanagari
                    # NOTE: the template name fills the {{%s}} slot and the
                    # translit the trailing %s; the original passed these two
                    # arguments in the opposite order.
                    if not accented_tr:
                        pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" %
                                (tn, tr, unicode(headt), unicode(t)))
                        t.add("1", tr)
                        notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
                    else:
                        pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" %
                                (tn, tr, unicode(headt), unicode(t)))
                        t.add("1", tr)
                        notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
                else:
                    decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
                    subbed = False
                    if AC not in decompdecltr and GR not in decompdecltr:
                        if accented_tr:
                            pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s" %
                                    (decltr, tr, unicode(headt), unicode(t)))
                            t.add("1", tr)
                            notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (decltr, tr, tn))
                            subbed = True
                        else:
                            pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s" %
                                    (decltr, unicode(headt), unicode(t)))
                    if not subbed and "-" in decltr:
                        pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" %
                                (decltr, unicode(headt), unicode(t)))
                        notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
                        decltr = decltr.replace("-", "")
                        t.add("1", decltr)
                        subbed = True
                    stripped_decltr = decltr.strip()
                    if "\n" not in decltr and stripped_decltr != decltr:
                        pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" %
                                (decltr, unicode(headt), unicode(t)))
                        notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (decltr, tn))
                        decltr = stripped_decltr
                        t.add("1", decltr)
                        subbed = True
                continue

            if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
                pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" %
                        (unicode(headt), unicode(t)))
                continue

            if tn in old_template_to_gender:
                must_continue = False
                for param in t.params:
                    pn = pname(param)
                    if pn not in ["1", "2", "3", "4", "n"]:
                        pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" %
                                (pn, unicode(param.value), unicode(t), unicode(headt)))
                        must_continue = True
                        break
                if must_continue:
                    continue
                g = old_template_to_gender[tn]
                if g not in genders:
                    pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
                        g, ",".join(genders), unicode(headt), unicode(t)))
                    continue
                blib.set_template_name(t, "sa-decl-noun-%s" % g)
                rmparam(t, "n")
                rmparam(t, "4")
                rmparam(t, "3")
                rmparam(t, "2")
                t.add("1", tr)
                notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
            else:
                pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))

        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    # Only complain when the last headword really had no declension after it,
    # mirroring the check made when a second {{sa-noun}} is encountered.  (The
    # original warned whenever any headword had been seen.)
    if headt and not saw_decl:
        pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))
    return unicode(parsed), notes
def infer_one_page_decls_1(page, index, text):
    """Rewrite adjective declension templates in ``text`` to the inferred new form.

    ``text`` is a parsed wikicode object (it must provide
    ``filter_templates()``) and is modified in place.  For every template
    whose name is in ``decl_templates``:

      * if ``infer_decl`` yields no arguments, stem and declension are merely
        combined via ``combine_stem`` and trailing blank numbered params are
        dropped; the result must then pass ``compare_results``, otherwise
        ``(None, None)`` is returned for the whole page;
      * otherwise all numbered params 1-15 and the short_* params are removed
        and replaced by the inferred arguments.

    Returns ``(text, change comment)``.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

    for tempname in decl_templates:
        for t in text.filter_templates():
            if unicode(t.name).strip() != tempname:
                continue
            orig_template = unicode(t)
            args = infer_decl(t, pagemsg)
            if not args:
                # At least combine stem and declension, blanking decl when
                # possible.
                stem, decl = combine_stem(getparam(t, "1"), getparam(t, "2"))
                t.add("1", stem)
                t.add("2", decl)
                # Remove any trailing blank arguments.
                for i in xrange(15, 0, -1):
                    if not getparam(t, i):
                        rmparam(t, i)
                    else:
                        break
                new_template = unicode(t)
                if orig_template != new_template:
                    if not compare_results(orig_template, new_template, pagemsg):
                        return None, None
            else:
                for i in xrange(15, 0, -1):
                    rmparam(t, i)
                rmparam(t, "short_m")
                rmparam(t, "short_f")
                rmparam(t, "short_n")
                rmparam(t, "short_p")
                t.name = tempname
                i = 1
                for arg in args:
                    if "=" in arg:
                        # Split on the first "=" only, so that a value which
                        # itself contains "=" does not break the unpacking.
                        name, value = arg.split("=", 1)
                        t.add(name, value)
                    else:
                        t.add(i, arg)
                        i += 1
            new_template = unicode(t)
            if orig_template != new_template:
                if verbose:
                    pagemsg("Replacing %s with %s" % (orig_template, new_template))

    return text, "Convert adj decl to new form and infer short-accent pattern"
def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
    """Return the headword(s) of Latin headword template ``t`` as a list.

    t: the parsed headword template.
    pagename: page title; used as the fallback headword and, when
        ``expand_text`` is not supplied, as the context for expansion.
    pagemsg: callable taking one message string, used for warnings.
    expand_text: optional callable that expands a template call string; when
        omitted, a default based on ``blib.expand_text`` is used.

    How the headword is found depends on the template name: an explicit
    lemma= chain, param 1=, a head= chain, the page name, or the linked
    nominative/first-person form obtained by expanding the corresponding
    la-generate-*-forms template.  A falsy result falls back to ``pagename``,
    and a non-list result is wrapped in a one-element list.
    """
    if not expand_text:

        def expand_text(tempcall):
            return blib.expand_text(tempcall, pagename, pagemsg, False)

    tn = tname(t)
    if tn in [
            "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det",
            "la-pronoun"
    ]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            retval = getparam(t, "1")
            # These characters in 1= mean it is not a plain lemma, so the
            # forms are generated and the lemma is read from the result.
            if "<" in retval or "((" in retval or " " in retval or "-" in retval:
                # Work on a fresh copy so that `t` itself is not modified.
                generate_template = blib.parse_text(
                    unicode(t)).filter_templates()[0]
                blib.set_template_name(generate_template,
                                       "la-generate-adj-forms")
                blib.remove_param_chain(generate_template, "comp", "comp")
                blib.remove_param_chain(generate_template, "sup", "sup")
                blib.remove_param_chain(generate_template, "adv", "adv")
                blib.remove_param_chain(generate_template, "lemma", "lemma")
                rmparam(generate_template, "type")
                # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
                rmparam(generate_template, "indecl")
                rmparam(generate_template, "id")
                rmparam(generate_template, "pos")
                result = expand_text(unicode(generate_template))
                if not result:
                    pagemsg("WARNING: Error generating forms, skipping")
                    retval = ""
                else:
                    args = blib.split_generate_args(result)
                    if "linked_nom_sg_m" in args:
                        retval = args["linked_nom_sg_m"]
                    elif "linked_nom_pl_m" in args:
                        retval = args["linked_nom_pl_m"]
                    else:
                        pagemsg(
                            "WARNING: Can't locate lemma in {{la-generate-adj-forms}} result: generate_template=%s, result=%s"
                            % (unicode(generate_template), result))
                        retval = ""
                    # Multiple forms come back comma-separated.
                    retval = retval.split(",")
            else:
                # Keep only the part of 1= before the first slash.
                retval = re.sub("/.*", "", retval)
    elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            # Work on a fresh copy so that `t` itself is not modified.
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-noun-forms")
            blib.remove_param_chain(generate_template, "lemma", "lemma")
            blib.remove_param_chain(generate_template, "m", "m")
            blib.remove_param_chain(generate_template, "f", "f")
            blib.remove_param_chain(generate_template, "g", "g")
            rmparam(generate_template, "type")
            # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
            rmparam(generate_template, "indecl")
            rmparam(generate_template, "id")
            rmparam(generate_template, "pos")
            result = expand_text(unicode(generate_template))
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                retval = ""
            else:
                args = blib.split_generate_args(result)
                if "linked_nom_sg" in args:
                    retval = args["linked_nom_sg"]
                elif "linked_nom_pl" in args:
                    retval = args["linked_nom_pl"]
                else:
                    pagemsg(
                        "WARNING: Can't locate lemma in {{la-generate-noun-forms}} result: generate_template=%s, result=%s"
                        % (unicode(generate_template), result))
                    retval = ""
                # Multiple forms come back comma-separated.
                retval = retval.split(",")
    elif tn in ["la-verb", "la-suffix-verb"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            # Work on a fresh copy so that `t` itself is not modified.
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-verb-forms")
            rmparam(generate_template, "id")
            result = expand_text(unicode(generate_template))
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                retval = ""
            else:
                args = blib.split_generate_args(result)
                # Use the first of these slots that the result provides.
                for slot in [
                        "linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
                        "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"
                ]:
                    if slot in args:
                        retval = args[slot]
                        break
                else:  # no break
                    pagemsg(
                        "WARNING: Can't locate lemma in {{la-generate-verb-forms}} result: generate_template=%s, result=%s"
                        % (unicode(generate_template), result))
                    retval = ""
                # Multiple forms come back comma-separated.
                retval = retval.split(",")
    elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
            tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
        retval = getparam(t, "1")
    elif tn == "la-letter":
        retval = pagename
    elif tn in ["head", "la-prep"]:
        retval = blib.fetch_param_chain(t, "head", "head")
    elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
        retval = blib.fetch_param_chain(t, "1", "head")
    else:
        pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
        retval = ""
    # Fall back to the page name when nothing (empty string or empty list)
    # was found, then normalize the result to a list.
    retval = retval or pagename
    if type(retval) is not list:
        retval = [retval]
    return retval
def process_page_section(index, page, section, verbose):
    """Make the Russian noun headword template in SECTION agree with its decl table.

    SECTION is parsed for a single {{ru-noun-table}} (and optionally a single
    {{ru-noun-old}}) plus a single {{ru-noun+}} / {{ru-proper noun+}} headword.
    The declension template is cleaned of g=/m=/f= params, and the headword is
    either overwritten from the declension params or (when only the headword
    has links) copied onto the declension template.

    Returns None when the section is skipped because of a problem; otherwise
    a 5-tuple: (new section text, ru_noun_table_cleaned,
    ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed),
    where the last four are 0/1 flags.
    """
    pagetitle = unicode(page.title())
    # NOTE(review): subpagetitle is only referenced by the commented-out code
    # near the end of this function.
    subpagetitle = re.sub("^.*:", "", pagetitle)
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None
    parsed = blib.parse_text(section)
    noun_table_templates = []
    noun_old_templates = []
    # A {{ru-decl-noun-see}} anywhere in the section causes the section to be
    # skipped outright.
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None
    # Collect the declension templates.
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)
    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if len(noun_table_templates) < 1:
        if noun_old_templates:
            pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
                ", ".join(unicode(x) for x in noun_old_templates))
        # Nothing to do: return the section unchanged with all flags zero.
        return unicode(parsed), 0, 0, 0, 0
    # Old-style headword templates (without the trailing "+") are not handled.
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            pagemsg("Found ru-noun or ru-proper noun, skipping")
            return None
    headword_templates = []
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            headword_templates.append(t)
    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, 0
    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
    headword_template = headword_templates[0]
    # NOTE(review): decl_templates is not read again within this function.
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]
    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))
    orig_headword_template = unicode(headword_template)
    orig_noun_table_template = unicode(noun_table_template)
    # Remember the headword-only params (g*, m*, f*, notrcat) so they can be
    # restored if the headword's params are rebuilt below.
    genders = blib.fetch_param_chain(headword_template, "g", "g")
    masculines = blib.fetch_param_chain(headword_template, "m", "m")
    feminines = blib.fetch_param_chain(headword_template, "f", "f")
    notrcat = getparam(headword_template, "notrcat")
    # Build a copy of the headword, named {{ru-noun+}}, without those
    # headword-only params, for comparison against the declension template.
    filtered_headword_params = []
    for param in headword_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
            pass
        else:
            filtered_headword_params.append((param.name, param.value))
    filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
    for name, value in filtered_headword_params:
        filtered_headword_template.add(name, value)
    ru_noun_table_cleaned = 0
    ru_noun_table_link_copied = 0
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    # Strip any g*/m*/f* params from the declension template by rebuilding
    # its parameter list without them.
    new_decl_params = []
    for param in noun_table_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name):
            pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
                unicode(noun_table_template))
        else:
            new_decl_params.append((param.name, param.value))
    del noun_table_template.params[:]
    for name, value in new_decl_params:
        noun_table_template.add(name, value)
    if orig_noun_table_template != unicode(noun_table_template):
        ru_noun_table_cleaned = 1
    # Work on a detached copy of the declension params; this copy is what may
    # become the new headword, and n= adjustments are applied only to it.
    modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
    for param in noun_table_template.params:
        modified_noun_table_template.add(param.name, param.value)
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if unicode(headword_template.name) == "ru-proper noun+":
        # Expand {{ru-generate-noun-args}} with the declension's params to
        # find out the actual number ("n") of the noun.
        generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
            unicode(noun_table_template))
        generate_result = expand_text(generate_template)
        if not generate_result:
            pagemsg("WARNING: Error generating noun args, skipping")
            return None
        args = ru.split_generate_args(generate_result)
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
            pagemsg("Adding n=both to headword template")
            modified_noun_table_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            # Re-expand with ndef=sg appended and any explicit n=s... removed.
            generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = ru.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
                pagemsg("Removing n=sg from headword template")
                rmparam(modified_noun_table_template, "n")
            else:
                pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
                    ndef_args["n"])
    # Compare the would-be headword (decl params under the name ru-noun+)
    # against the existing headword stripped of headword-only params.
    new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
        unicode(modified_noun_table_template))
    existing_filtered_headword_template = unicode(filtered_headword_template)
    change_existing_headword = False
    if existing_filtered_headword_template != new_headword_template:
        if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
            # The headword has links and the declension does not: never erase
            # links; copy them to the declension if that is the only difference.
            if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
                pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
                del noun_table_template.params[:]
                for param in filtered_headword_template.params:
                    noun_table_template.add(param.name, param.value)
                ru_noun_table_link_copied = 1
                ru_noun_table_cleaned = 0
            else:
                pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
                    % (existing_filtered_headword_template, new_headword_template))
                return None
        else:
            pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
                % (existing_filtered_headword_template, new_headword_template))
            change_existing_headword = True
    # `lemmas` is not defined in this function; it is read from an enclosing
    # scope. When it is non-empty, only pages listed in it have their
    # headword overwritten.
    if change_existing_headword and (not lemmas or pagetitle in lemmas):
        del headword_template.params[:]
        for param in modified_noun_table_template.params:
            headword_template.add(param.name, param.value)
        # Restore the headword-only params saved earlier.
        blib.set_param_chain(headword_template, genders, "g", "g")
        blib.set_param_chain(headword_template, masculines, "m", "m")
        blib.set_param_chain(headword_template, feminines, "f", "f")
        if notrcat:
            headword_template.add("notrcat", notrcat)
    #genders = runoun.check_old_noun_headword_forms(headword_template, args,
    #    subpagetitle, pagemsg)
    #if genders == None:
    #  return None
    #new_params = []
    #for param in noun_table_template.params:
    #  new_params.append((param.name, param.value))
    #params_to_preserve = runoun.fix_old_headword_params(headword_template,
    #    new_params, genders, pagemsg)
    #if params_to_preserve == None:
    #  return None
    # Log whatever actually changed and set the per-template-type flags.
    new_noun_table_template = unicode(noun_table_template)
    if new_noun_table_template != orig_noun_table_template:
        pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
            new_noun_table_template))
    new_headword_template = unicode(headword_template)
    if new_headword_template != orig_headword_template:
        pagemsg("Replacing headword %s with %s" % (orig_headword_template,
            new_headword_template))
        if unicode(headword_template.name) == "ru-noun+":
            ru_noun_changed = 1
        else:
            ru_proper_noun_changed = 1
    return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def process_page(page, index, parsed):
    """Rewrite French {{RQ:Don Quixote}} as {{RQ:Cervantes Viardot Don Quichotte}}.

    For every {{RQ:Don Quixote}} template whose lang= is "fr", the template is
    renamed, lang= is dropped, and the parameters are rebuilt in this order:
    numbered params, volume= (volume "2" becomes "II", missing becomes "I"),
    chapter=, text= (from text= or passage=), t= (from t= or translation=),
    then the remaining named params.

    Returns (parsed, notes), where notes holds one entry per converted template.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    for t in parsed.filter_templates():
        if tname(t) != "RQ:Don Quixote":
            continue
        if getparam(t, "lang").strip() != "fr":
            continue

        before = unicode(t)
        blib.set_template_name(t, "RQ:Cervantes Viardot Don Quichotte")
        rmparam(t, "lang")

        def take(param_name):
            # Read the stripped value of a param, then delete the param.
            stripped = getparam(t, param_name).strip()
            rmparam(t, param_name)
            return stripped

        volume = take("volume")
        if volume == "2":
            volume = "II"
        elif not volume:
            volume = "I"
        chapter = take("chapter")
        text_value = take("text")
        passage_value = take("passage")
        quoted = text_value or passage_value
        t_value = take("t")
        translation_value = take("translation")
        translation = t_value or translation_value

        # Split whatever is left into numbered and named params.
        numbered = []
        named = []
        for param in t.params:
            key = unicode(param.name)
            bucket = numbered if re.search("^[0-9]+$", key) else named
            bucket.append((key, param.value, param.showkey))

        # Rebuild the template from scratch in the desired order.
        del t.params[:]
        for key, value, showkey in numbered:
            t.add(key, value, showkey=showkey, preserve_spacing=False)
        t.add("volume", volume)
        if chapter:
            t.add("chapter", chapter)
        if quoted:
            t.add("text", quoted)
        if translation:
            t.add("t", translation)
        for key, value, showkey in named:
            t.add(key, value, showkey=showkey, preserve_spacing=False)

        notes.append(
            "Replace {{RQ:Don Quixote}} with {{RQ:Cervantes Viardot Don Quichotte}}"
        )
        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    return parsed, notes
def process_page(page, index):
    """Generate the non-lemma forms of a Latin noun page and process each form page.

    Finds the Latin section of PAGE, requires exactly one {{la-noun}} or
    {{la-proper noun}} headword in it, expands {{la-generate-noun-forms}} with
    the headword's params (minus lemma/m/f/g/type/indecl/id/pos), and then runs
    process_form() through blib.do_edit() on the page of each distinct
    generated form, in order of the macron-less form.

    Forms containing "[" or "|", forms equal to the page title once macrons
    are removed, and duplicates (ignoring macrons) are skipped. Returns None;
    problems are reported through pagemsg() and abort processing of the page.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return
    # Only the section body (third element) is used here.
    _, _, secbody, _, _ = retval

    parsed = blib.parse_text(secbody)
    saw_noun = None
    saw_proper_noun = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-noun":
            if saw_noun:
                pagemsg(
                    "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_noun), unicode(t)))
                return
            saw_noun = t
        elif tn == "la-proper noun":
            if saw_proper_noun:
                pagemsg(
                    "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_proper_noun), unicode(t)))
                return
            saw_proper_noun = t
    if saw_noun and saw_proper_noun:
        pagemsg(
            "WARNING: Saw both noun and proper noun, can't correct header/headword"
        )
        return
    if not saw_noun and not saw_proper_noun:
        pagemsg(
            "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
        )
        return

    pos = "pn" if saw_proper_noun else "n"
    ht = saw_proper_noun or saw_noun
    if getparam(ht, "indecl"):
        pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
        return

    # Re-parse a copy of the headword so the page's own template is untouched,
    # then turn it into a {{la-generate-noun-forms}} call.
    generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
    blib.set_template_name(generate_template, "la-generate-noun-forms")
    blib.remove_param_chain(generate_template, "lemma", "lemma")
    blib.remove_param_chain(generate_template, "m", "m")
    blib.remove_param_chain(generate_template, "f", "f")
    blib.remove_param_chain(generate_template, "g", "g")
    rmparam(generate_template, "type")
    rmparam(generate_template, "indecl")
    rmparam(generate_template, "id")
    rmparam(generate_template, "pos")
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return
    tempargs = blib.split_generate_args(result)

    forms_seen = set()
    slots_and_forms_to_process = []
    for slot, formarg in tempargs.iteritems():
        for form in formarg.split(","):
            if "[" in form or "|" in form:
                continue
            form_no_macrons = lalib.remove_macrons(form)
            if form_no_macrons == pagetitle:
                continue
            if form_no_macrons in forms_seen:
                continue
            forms_seen.add(form_no_macrons)
            slots_and_forms_to_process.append((slot, form))

    def make_handler(slot, form):
        # Bind slot/form per iteration so the handler never sees values from
        # a later loop iteration (late-binding closure pitfall).
        def handler(page, index, parsed):
            return process_form(page, index, slot, form, pos)
        return handler

    # Use a separate counter name: rebinding `index` here would also change
    # the value read by the pagemsg()/expand_text() closures defined above.
    for form_index, (slot, form) in blib.iter_items(
            sorted(slots_and_forms_to_process,
                   key=lambda x: lalib.remove_macrons(x[1]))):
        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                     form_index, make_handler(slot, form),
                     save=args.save, verbose=args.verbose, diff=args.diff)
def process_page(index, page, save, verbose):
    """Disabled converter from {{ru-conj-TYPE|...}} to {{ru-conj|TYPE|...}}.

    The function currently only logs a warning and returns: the script is
    marked as no longer applicable. The conversion logic after the early
    return is kept for reference and never runs.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable from here on (kept deliberately, see docstring). ----
    pagemsg("Processing")
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    # Position of the "variant" argument for each conjugation type that has one.
    variant_argno_by_type = {}
    for argno, conjtypes in (
        (3, ["3oa", "4a", "4b", "4c", "6a", "6c", "11a", "16a", "16b",
             u"irreg-дать", u"irreg-клясть", u"irreg-быть"]),
        (4, ["5a", "5b", "5c", "6b", "9a", "9b", "11b", "14a", "14b", "14c"]),
        (5, ["7b"]),
        (6, ["7a"]),
    ):
        for known_type in conjtypes:
            variant_argno_by_type[known_type] = argno

    for t in parsed.filter_templates():
        before = unicode(t)
        template_name = unicode(t.name)
        if template_name.startswith("ru-conj-") and template_name != "ru-conj-verb-see":
            conjtype = re.search("^ru-conj-(.*)$", template_name).group(1)
            t.name = "ru-conj"
            variant = None
            varargno = variant_argno_by_type.get(conjtype)
            if varargno:
                variant = getparam(t, str(varargno))
                if re.search("^[abc]", variant):
                    variant = "/" + variant
                later_args_present = (
                    getparam(t, str(varargno + 1)) or
                    getparam(t, str(varargno + 2)) or
                    getparam(t, str(varargno + 3)))
                if later_args_present:
                    # Blank the slot instead of deleting it so later
                    # positional args keep their numbers.
                    t.add(str(varargno), "")
                else:
                    rmparam(t, str(varargno))
                conjtype = conjtype + variant
            notes.append("ru-conj-* -> ru-conj, moving params up by one%s" %
                         (" (and move variant spec)" if variant else ""))
            # Shift numbered params up by one, highest first, starting from
            # the highest non-empty one.
            seen_value = False
            for argno in xrange(20, 0, -1):
                val = getparam(t, str(argno))
                if val:
                    seen_value = True
                if seen_value:
                    t.add(str(argno + 1), val)
            t.add("1", conjtype)
            blib.sort_params(t)
        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == text:
        return
    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
    else:
        pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Remove redundant head= / 1= params from French headword templates.

    A param is redundant when link_text() of its value equals link_text() of
    the page title. head= is checked on templates in fr_head_templates and 1=
    on templates in fr_head_or_1_templates. Non-redundant values are kept and
    additionally compared against the page title (apostrophe variants
    normalized, links removed), with a warning on mismatch.

    Pages with a colon in the title are skipped. The page is saved only when
    SAVE is true; otherwise the would-be edit summary is logged.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    text = unicode(page.text)

    def check_bad_head(text, arg):
        # Warn when the value, with links removed and apostrophes normalized,
        # differs from the similarly normalized page title.
        canontext = re.sub(u"[׳’]", "'", blib.remove_links(text))
        canonpagetitle = re.sub(u"[׳’]", "'", pagetitle)
        if canontext == canonpagetitle:
            return
        pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
                (arg, canontext, canonpagetitle, arg, text))

    notes = []
    parsed = blib.parse_text(text)

    # (template set, param, log message when removing, edit note, log message
    # when keeping)
    redundancy_checks = (
        (fr_head_templates, "head",
         "Removing redundant head=%s",
         "remove redundant head= from {{%s}}",
         "Not removing non-redundant head=%s"),
        (fr_head_or_1_templates, "1",
         "Removing redundant 1=%s",
         "remove redundant 1= from {{%s}}",
         "Not removing non-redundant 1=%s"),
    )

    for t in parsed.filter_templates():
        before = unicode(t)
        name = unicode(t.name)
        for template_names, param, removing_fmt, note_fmt, keeping_fmt in redundancy_checks:
            if name not in template_names:
                continue
            head = getparam(t, param)
            if not head:
                continue
            if link_text(pagetitle) == link_text(head):
                pagemsg(removing_fmt % head)
                rmparam(t, param)
                notes.append(note_fmt % name)
            else:
                pagemsg(keeping_fmt % head)
                check_bad_head(head, param)
        after = unicode(t)
        if before != after:
            pagemsg("Replacing %s with %s" % (before, after))

    newtext = unicode(parsed)
    if newtext == text:
        return
    assert notes
    comment = "; ".join(notes)
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = newtext
        page.save(comment=comment)
    else:
        pagemsg("Would save with comment = %s" % comment)
def rewrite_one_page_ru_decl_adj(page, index, text):
    """Convert old Russian adjective declension templates to {{ru-decl-adj}}.

    TEXT is a parsed page whose templates are modified in place.
    {{ru-adj-table}} is simply renamed. Templates whose (normalized) name is a
    key of ending_for_ru_adj get param 1 set from the old param 2 (except
    ru-adj13), param 2 set to the ending from that table, params 3-8 removed,
    and, for types with short forms, the short forms re-added with feminine
    and neuter swapped.

    Returns (text, comment); comment is None when nothing was converted.
    """
    pagename = unicode(page.title())
    converted_names = []

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def drop_params_8_to_3(template):
        # Remove the old positional params, highest first.
        for num in ("8", "7", "6", "5", "4", "3"):
            rmparam(template, num)

    for t in text.filter_templates():
        def current_name():
            return unicode(t.name).strip()

        origname = current_name()
        origtemplate = unicode(t)

        if origname == "ru-adj-table":
            t.name = "ru-decl-adj"
            converted_names.append(origname)
            continue

        # Normalize alternative template names to ru-adjN.
        if re.match("^ru-adjective[0-9]", current_name()):
            t.name = current_name().replace("ru-adjective", "ru-adj")
        if current_name() == "ru-passive participle decl":
            t.name = "ru-adj1"

        suffix = None
        if current_name() == "ru-adj3-sja":
            suffix = u"ся"
            t.name = "ru-adj3"
        elif current_name() == "ru-adj5-suffix":
            # Read param 8 before it is removed below.
            suffix = "-" + getparam(t, "8")
            t.name = "ru-adj5"

        decl_name = current_name()
        if decl_name not in ending_for_ru_adj:
            continue

        has_short_forms = decl_name not in [
            "ru-adj13", "ru-adj7", "ru-adj8", "ru-adj9", "ru-adj12"]

        if decl_name != "ru-adj13":
            addparam(t, "1", getparam(t, "2").strip())
        addparam(t, "2", ending_for_ru_adj[decl_name])

        if has_short_forms:
            mshort = clean(getparam(t, "3"))
            if mshort and re.search(u"[аяоеыи]$", remove_diacritics(mshort)):
                pagemsg("WARNING: short masculine %s doesn't have right ending" % mshort)
            fshort = clean(getparam(t, "4"))
            if fshort and not re.search(u"[ая]$", remove_diacritics(fshort)):
                pagemsg("WARNING: short feminine %s doesn't have right ending" % fshort)
            nshort = clean(getparam(t, "5"))
            if nshort and not re.search(u"[ое]$", remove_diacritics(nshort)):
                pagemsg("WARNING: short neuter %s doesn't have right ending" % nshort)
            pshort = clean(getparam(t, "6"))
            if pshort and not re.search(u"[ыи]$", remove_diacritics(pshort)):
                pagemsg("WARNING: short plural %s doesn't have right ending" % pshort)

        drop_params_8_to_3(t)

        if has_short_forms:
            # Note that fshort and nshort get reversed.
            for num, short_form in (("3", mshort), ("4", nshort),
                                    ("5", fshort), ("6", pshort)):
                if short_form:
                    addparam(t, num, short_form)

        if suffix:
            addparam(t, "suffix", suffix)
        t.name = "ru-decl-adj"
        pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
        converted_names.append(origname)

    comment = None
    if converted_names:
        comment = "convert %s -> ru-decl-adj" % ", ".join(converted_names)
    return text, comment
def process_page(index, page, romaji_to_keep):
    """Clean up parameters of Japanese headword templates on PAGE.

    For {{ja-noun}}, {{ja-adj}}, {{ja-verb}} and {{ja-pos}}:
      * an old script code in 1= is removed and the remaining numbered params
        are shifted down by one;
      * hira= and kata= are turned into numbered params, placed after the
        existing non-blank numbered params and before all named params;
      * rom= is removed unless the page title is in ROMAJI_TO_KEEP;
      * hidx= is removed.

    Returns (new page text, notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    # (param, log format, note format) for the kana params that get moved.
    kana_moves = (
        ("hira", "Moving hira=%s to %s=: %s", "move hira= to %s= in %s"),
        ("kata", "Moving kata=%s to %s=: %s", "move kata= to %s= in %s"),
    )

    for t in parsed.filter_templates():
        template_name = unicode(t.name)
        if template_name not in ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]:
            continue
        before = unicode(t)

        # Remove old script code
        script_code = getparam(t, "1")
        if script_code in ["r", "h", "ka", "k", "s", "ky", "kk"]:
            pagemsg("Removing 1=%s: %s" % (script_code, unicode(t)))
            notes.append("remove 1=%s from %s" % (script_code, template_name))
            rmparam(t, "1")
            for param in t.params:
                key = unicode(param.name)
                if re.search(r"^[0-9]+$", key):
                    param.name = str(int(key) - 1)
                    param.showkey = False

        # Convert hira= and/or kata= to numbered param. The template is
        # rebuilt so that numbered params always precede the named ones.
        if t.has("hira") or t.has("kata"):
            positional_values = []
            keyword_params = []
            for param in t.params:
                key = unicode(param.name)
                if re.search(r"^[0-9]+$", key):
                    # Blank numbered params are dropped.
                    value = unicode(param.value)
                    if value:
                        positional_values.append(value)
                elif key not in ["hira", "kata"]:
                    keyword_params.append((key, param.value))
            for kana_param, log_fmt, note_fmt in kana_moves:
                kana = getparam(t, kana_param)
                if not kana:
                    continue
                positional_values.append(kana)
                pagemsg(log_fmt % (kana, len(positional_values), unicode(t)))
                notes.append(note_fmt % (len(positional_values), template_name))
            del t.params[:]
            for position, value in enumerate(positional_values, 1):
                t.add(str(position), value)
            for key, value in keyword_params:
                t.add(key, value)

        # Remove rom= if not in list of pages to keep rom=
        if t.has("rom"):
            if pagetitle in romaji_to_keep:
                pagemsg("Keeping rom=%s because in romaji_to_keep: %s" % (
                    getparam(t, "rom"), unicode(t)))
            else:
                pagemsg("Removing rom=%s: %s" % (getparam(t, "rom"), unicode(t)))
                rmparam(t, "rom")
                notes.append("remove rom= from %s" % template_name)

        # Remove hidx=
        if t.has("hidx"):
            pagemsg("Removing hidx=%s: %s" % (getparam(t, "hidx"), unicode(t)))
            rmparam(t, "hidx")
            notes.append("remove hidx= from %s" % template_name)

        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    return unicode(parsed), notes
def process_page(index, page, save, verbose, fix_missing_plurals):
    """Replace {{head|fr|POS|...}} with the dedicated French headword template.

    Handles nouns ({{fr-noun}}), proper nouns ({{fr-proper noun}}), invariable
    adjectives ({{fr-adj|inv=y}}) and several simple parts of speech
    (adjective/verb forms, interjections, prepositions, prefixes, suffixes).
    A template is left untouched, with a warning, when it has parameters the
    conversion does not recognize or when a required gender/plural is missing.
    With FIX_MISSING_PLURALS, nouns lacking a plural are converted anyway and
    the edit note is flagged "(NEEDS REVIEW)".

    Pages with a colon in the title are skipped. The page is saved only when
    SAVE is true; otherwise the would-be edit summary is logged.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def has_unrecognized(template, is_allowed):
        # True if any param name of the template is rejected by is_allowed.
        return any(not is_allowed(unicode(p.name)) for p in template.params)

    def remove_all(template, param_names):
        for param_name in param_names:
            rmparam(template, param_name)

    pagemsg("Processing")
    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    text = unicode(page.text)
    notes = []
    parsed = blib.parse_text(text)

    # New template name for the parts of speech converted without gender logic.
    simple_template_for_headtype = {
        "adjective form": "fr-adj-form",
        "verb form": "fr-verb-form",
        "verb forms": "fr-verb-form",
        "interjection": "fr-intj",
        "preposition": "fr-prep",
        "prefix": "fr-prefix",
        "prefixes": "fr-prefix",
        "suffix": "fr-suffix",
        "suffixes": "fr-suffix",
    }

    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) != "head" or getparam(t, "1") != "fr":
            continue
        headtype = getparam(t, "2")
        fixed_plural_warning = False

        if headtype == "noun":
            head = getparam(t, "head")
            g = getparam(t, "g")
            g2 = getparam(t, "g2")
            plural = getparam(t, "4") if getparam(t, "3") == "plural" else ""
            if has_unrecognized(
                    t, lambda pn: pn in ["1", "2", "head", "g", "g2", "sort"]
                    or (plural and pn in ["3", "4"])):
                pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                        unicode(t))
                continue
            if not g:
                pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
                continue
            # A feminine noun without plural is accepted if the page also has
            # a French {{feminine noun of}} template.
            found_feminine_noun = (
                g == "f" and not g2 and not plural and
                any(unicode(tt.name) == "feminine noun of" and
                    getparam(tt, "lang") == "fr"
                    for tt in parsed.filter_templates()))
            if found_feminine_noun:
                pagemsg("Found 'feminine noun of', assuming countable")
            elif g not in ["m-p", "f-p"] and not plural:
                if not fix_missing_plurals:
                    pagemsg("WARNING: No plural given in %s, skipping" %
                            unicode(t))
                    continue
                pagemsg("WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW" %
                        unicode(t))
                fixed_plural_warning = True
            remove_all(t, ("4", "3", "2", "1", "head", "g", "g2", "sort"))
            t.name = "fr-noun"
            if head:
                t.add("head", head)
            t.add("1", g)
            if g2:
                t.add("g2", g2)
            if plural:
                t.add("2", plural)

        elif headtype in ["proper noun", "proper nouns"]:
            head = getparam(t, "head")
            g = getparam(t, "g")
            g2 = getparam(t, "g2")
            # The gender may be given positionally in 3= instead of g=.
            remove_3 = False
            if not g and getparam(t, "3") in ["m", "f", "m-p", "f-p"]:
                g = getparam(t, "3")
                remove_3 = True
            if has_unrecognized(
                    t, lambda pn: pn in ["1", "2", "head", "g", "g2", "sort"]
                    or (remove_3 and pn in ["3"])):
                pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                        unicode(t))
                continue
            if not g:
                pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
                continue
            remove_all(t, ("3", "2", "1", "head", "g", "g2", "sort"))
            t.name = "fr-proper noun"
            if head:
                t.add("head", head)
            t.add("1", g)
            if g2:
                t.add("g2", g2)

        elif headtype in ["adjective", "adjectives"]:
            if getparam(t, "3") not in ["invariable", "invariant"]:
                pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                        unicode(t))
            else:
                # Whatever is left after deleting the expected params is
                # treated as unrecognized.
                leftover = dict((unicode(p.name), unicode(p.value))
                                for p in t.params)
                for expected in ("1", "2", "3"):
                    del leftover[expected]
                if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
                    del leftover["g"]
                    del leftover["g2"]
                if leftover:
                    pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                            unicode(t))
                else:
                    remove_all(t, ("g2", "g", "3", "2", "1"))
                    t.name = "fr-adj"
                    t.add("inv", "y")

        elif headtype in ["adjective form", "verb form", "verb forms",
                          "interjection", "preposition", "prefix", "prefixes",
                          "suffix", "suffixes"]:
            headtype_supports_g = headtype in [
                "adjective form", "suffix", "suffixes"]
            head = getparam(t, "head")
            if has_unrecognized(
                    t, lambda pn: pn in ["1", "2", "head", "sort"]
                    or (headtype_supports_g and pn == "g")):
                pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                        unicode(t))
                continue
            remove_all(t, ("sort", "head", "2", "1"))
            t.name = simple_template_for_headtype[headtype]
            if head:
                t.add("head", head)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replacing %s with %s" % (origt, newt))
            notes.append("replaced {{head|fr|%s}} with {{%s}}%s" % (
                headtype, unicode(t.name),
                " (NEEDS REVIEW)" if fixed_plural_warning else ""))

    newtext = unicode(parsed)
    if newtext == text:
        return
    assert notes
    comment = "; ".join(notes)
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = newtext
        blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg,
                            "save page")
    else:
        pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed, move_dot, rename):
    """Move .= out of Hebrew form-of templates and/or rename those templates.

    With MOVE_DOT, every template in all_he_form_of_templates that has a
    non-empty .= param is replaced textually by the same template without .=
    followed by the value of .=. With RENAME, templates listed in
    all_he_form_of_template_map are renamed, possibly gaining an extra
    2= / named spec and nocap=1, while lang= and sc= are dropped.

    Returns (text, notes), or (None, None) if the page title has a colon and
    is not in one of the recognized namespaces.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []
    text = unicode(page.text)

    in_allowed_namespace = re.search(
        "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):",
        pagetitle)
    if ":" in pagetitle and not in_allowed_namespace:
        pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
        return None, None

    if move_dot:
        # First collect (old text, new text) pairs, then apply them to the
        # raw page text with string replacement.
        replacements = []
        for t in parsed.filter_templates():
            if tname(t) not in all_he_form_of_templates:
                continue
            dot = getparam(t, ".")
            if not dot:
                continue
            with_dot_param = unicode(t)
            rmparam(t, ".")
            replacements.append((with_dot_param, unicode(t) + dot))

        for curr_template, repl_template in replacements:
            if curr_template not in text:
                pagemsg("WARNING: Unable to locate template: %s" % curr_template)
                continue
            if repl_template in text:
                pagemsg("WARNING: Already found template with period: %s" % repl_template)
                continue
            newtext = text.replace(curr_template, repl_template)
            # Sanity check: the length change must be a whole multiple of the
            # per-replacement length change.
            newtext_text_diff = len(newtext) - len(text)
            repl_curr_diff = len(repl_template) - len(curr_template)
            ratio = float(newtext_text_diff) / repl_curr_diff
            occurrences = int(ratio)
            if ratio != occurrences:
                pagemsg("WARNING: Something wrong, length mismatch during replacement: Expected length change=%s, actual=%s, ratio=%.2f, curr=%s, repl=%s" %
                        (repl_curr_diff, newtext_text_diff, ratio, curr_template,
                         repl_template))
            elif occurrences > 1:
                pagemsg("WARNING: Replaced %s occurrences of curr=%s with repl=%s" %
                        (occurrences, curr_template, repl_template))
            text = newtext
            notes.append("move .= outside of {{he-*}} template")

    if rename:
        # Re-parse, since the text may have been changed above.
        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn in all_he_form_of_template_map:
                newname, add_nocap = all_he_form_of_template_map[tn]
                add_nocap_msg = "|nocap=1" if add_nocap else ""
                # The mapped name may carry an extra spec after a "|".
                newspecs = None
                if "|" in newname:
                    newname, newspecs = newname.split("|")
                blib.set_template_name(t, newname)

                # Fetch all params.
                if newname == "he-infinitive of":
                    unexpected_numbered = ["3", "4"]
                else:
                    unexpected_numbered = ["2", "3", "4"]
                kept_params = []
                old_1 = getparam(t, "1")
                for param in t.params:
                    raw_name = unicode(param.name)
                    stripped_name = raw_name.strip()
                    if stripped_name in ["1", "lang", "sc"]:
                        continue
                    if stripped_name in unexpected_numbered:
                        errandmsg("WARNING: Found %s= in %s" % (stripped_name, origt))
                    kept_params.append((raw_name, param.value, param.showkey))

                # Erase all params.
                del t.params[:]

                # Put back basic params
                t.add("1", old_1)
                if newname == "he-verb form of":
                    assert newspecs
                    t.add("2", newspecs)
                    notes.append("rename {{%s}} to {{%s|{{{1}}}|%s%s}}" %
                                 (tn, newname, newspecs, add_nocap_msg))
                elif newname == "he-noun form of" and newspecs:
                    newparam, newval = newspecs.split("=")
                    t.add(newparam, newval)
                    notes.append("rename {{%s}} to {{%s|{{{1}}}|%s=%s%s}}" %
                                 (tn, newname, newparam, newval, add_nocap_msg))
                else:
                    notes.append("rename {{%s}} to {{%s%s}}" %
                                 (tn, newname, add_nocap_msg))

                # Put remaining parameters in order.
                rename_pgn = newname == "he-noun form of" and newspecs
                for name, value, showkey in kept_params:
                    # More hacking for 'he-form of sing cons': p -> pp, g -> pg, n -> pn
                    if rename_pgn and name in ["p", "g", "n"]:
                        name = "p" + name
                    t.add(name, value, showkey=showkey, preserve_spacing=False)

                # Finally add nocap=1 if requested.
                if add_nocap:
                    t.add("nocap", "1")

            if unicode(t) != origt:
                pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
        text = unicode(parsed)

    return text, notes
def process_page(index, page, save, verbose):
    """Canonicalize Russian short-form adjective 'inflection of' entries on PAGE.

    Within the single ==Russian== section of the page:
      * existing {{inflection of|lang=ru|...}} templates whose numbered params
        from 3 upward describe a short form are rewritten to the canonical
        tag order short|<m/f/n>|s or short|p;
      * raw-text definitions such as "masculine short form of [[lemma]]" are
        converted into {{inflection of|lang=ru|lemma||short|<gender>|s}};
      * remaining lines mentioning "short" that could not be converted
        trigger a warning.

    Pages with a colon in the title or with more than one Russian section are
    skipped. The page is saved only when SAVE is true; otherwise the would-be
    edit summary is logged. Returns None.
    """
    pagetitle = unicode(page.title())
    # NOTE(review): subpagetitle is not read again within this function.
    subpagetitle = re.sub("^.*:", "", pagetitle)
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
    pagemsg("Processing")
    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return
    text = unicode(page.text)
    notes = []
    # Flags accumulated across the whole page; they drive the final
    # "Skipping" message.
    already_canonicalized = False
    found_short_inflection_of = False
    warned_about_short = False
    foundrussian = False
    # Split on level-2 headers; because the pattern has a capturing group,
    # headers land at odd indices and section bodies at even indices.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    for j in xrange(2, len(sections), 2):
        if sections[j-1] == "==Russian==\n":
            if foundrussian:
                pagemsg("WARNING: Found multiple Russian sections, skipping page")
                return
            foundrussian = True
            # Try to canonicalize existing 'inflection of'
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    # Fetch the numbered params starting with 3
                    numbered_params = []
                    for i in xrange(3,20):
                        numbered_params.append(getparam(t, str(i)))
                    # Drop trailing empty values.
                    while len(numbered_params) > 0 and not numbered_params[-1]:
                        del numbered_params[-1]
                    # Now canonicalize
                    numparamstr = "/".join(numbered_params)
                    canon_params = []
                    # This loop runs at most once; it exists only so that
                    # `break` can be used to leave after the first match.
                    while True:
                        # Gender + singular + short in any order, or gender +
                        # short with the singular left implicit.
                        m = (re.search(r"^([mfn])/(?:s|\(singular\))/short(?: form|)$", numparamstr) or
                             re.search(r"^(?:s|\(singular\))/([mfn])/short(?: form|)$", numparamstr) or
                             re.search(r"^short(?: form|)/([mfn])/(?:s|\(singular\))$", numparamstr) or
                             re.search(r"^short(?: form|)/(?:s|\(singular\))/([mfn])$", numparamstr) or
                             re.search(r"^([mfn])/short(?: form|)/(?:s|\(singular\))$", numparamstr) or
                             re.search(r"^(?:s|\(singular\))/short(?: form|)/([mfn])$", numparamstr) or
                             re.search(r"^([mfn])/short(?: form|)$", numparamstr) or
                             re.search(r"^short(?: form|)/([mfn])$", numparamstr)
                             )
                        if m:
                            found_short_inflection_of = True
                            canon_params = ["short", m.group(1), "s"]
                            break
                        # Plural + short in either order.
                        m = (re.search(r"^(?:p|\(plural\))/short(?: form|)$", numparamstr) or
                             re.search(r"^short(?: form|)/(?:p|\(plural\))$", numparamstr)
                             )
                        if m:
                            found_short_inflection_of = True
                            canon_params = ["short", "p"]
                            break
                        # A short-form tag is present but in a combination
                        # not recognized above.
                        if "short" in numbered_params or "short form" in numbered_params:
                            found_short_inflection_of = True
                            warned_about_short = True
                            pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
                                unicode(t))
                        break
                    if canon_params:
                        origt = unicode(t)
                        # Fetch param 1 and param 2. Erase all numbered params.
                        # Put back param 1 and param 2 (this will put them after lang=ru),
                        # then the replacements for the higher params.
                        param1 = getparam(t, "1")
                        param2 = getparam(t, "2")
                        for i in xrange(19,0,-1):
                            rmparam(t, str(i))
                        t.add("1", param1)
                        t.add("2", param2)
                        for i, param in enumerate(canon_params):
                            t.add(str(i+3), param)
                        newt = unicode(t)
                        if origt != newt:
                            pagemsg("Replaced %s with %s" % (origt, newt))
                            notes.append("canonicalized 'inflection of' for %s" %
                                "/".join(canon_params))
                        else:
                            pagemsg("Apparently already canonicalized: %s" % newt)
                            already_canonicalized = True
            sections[j] = unicode(parsed)
            # Try to add 'inflection of' to raw-specified singular inflection
            def add_sing_inflection_of(m):
                # Substitution callback: group 1 is the prefix ("# " or "("),
                # group 2 the gender word, group 3 the linked lemma.
                prefix = m.group(1)
                gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
                    "neuter":"n", "neutral":"n"}[m.group(2).lower()]
                lemma = m.group(3)
                retval = prefix + "{{inflection of|lang=ru|%s||short|%s|s}}" % (lemma, gender)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for short/%s/s" % gender)
                return retval
            newsec = re.sub(r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*",
                add_sing_inflection_of, sections[j], 0, re.I)
            if newsec != sections[j]:
                found_short_inflection_of = True
                sections[j] = newsec
            # "short" occurs in the section but nothing above recognized it:
            # log the first line containing it for manual review.
            if "short" in sections[j] and not found_short_inflection_of:
                m = re.search("^(.*short.*)$", sections[j], re.M)
                warned_about_short = True
                pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
                    (m and m.group(1) or "Can't get line?"))
    new_text = "".join(sections)
    if new_text != text:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (text, new_text))
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = new_text
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)
    if not notes and not already_canonicalized:
        pagemsg("Skipping, no short form found%s" % (
            warned_about_short and " (warning issued)" or " (no warning)"))
def process_page(page, index, parsed):
    """Canonicalize Old English verb headword templates on one page.

    Two rewrites are performed on the templates found in the page text:

    * ``{{head|ang|verb}}`` (or ``verbs``) carrying only ``1=``, ``2=`` and
      ``head=`` is renamed to ``{{ang-verb}}``, with ``head=`` moved to ``1=``.
    * An existing ``{{ang-verb}}`` has ``head=`` moved to ``1=``; ``head2=``
      and ``head3=`` are removed and re-added after it.

    The incoming ``parsed`` argument is ignored; the page text is re-parsed.

    Returns a ``(parsed, notes)`` tuple: the modified parse tree and the list
    of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if (tn == "head" and getparam(t, "1") == "ang"
                and getparam(t, "2") in ["verb", "verbs"]):
            for param in t.params:
                pn = pname(param)
                if pn not in ["1", "2", "head"]:
                    pagemsg("WARNING: head|ang|verb with extra params: %s"
                            % unicode(t))
                    break
            else:
                # No break: only convertible params are present.
                blib.set_template_name(t, "ang-verb")
                rmparam(t, "1")
                rmparam(t, "2")
                notes.append("convert {{head|ang|verb}} into {{ang-verb}}")
                head = getparam(t, "head")
                if head:
                    t.add("1", head)
                rmparam(t, "head")
        elif tn == "ang-verb":
            head = getparam(t, "head")
            head2 = getparam(t, "head2")
            head3 = getparam(t, "head3")
            rmparam(t, "head")
            rmparam(t, "head2")
            rmparam(t, "head3")
            if head:
                t.add("1", head)
            if head2:
                t.add("head2", head2)
            if head3:
                t.add("head3", head3)
            # Only record the note when the template really changed, so that
            # an already-canonical {{ang-verb}} does not add a false entry to
            # the edit summary.
            if unicode(t) != origt:
                notes.append("move head= to 1= in {{ang-verb}}")

        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return parsed, notes
def fix_up_section(sectext):
    # Replace manually specified French {{IPA}} pronunciations in `sectext`
    # with {{fr-IPA}} when the manual pronunciation agrees with the one
    # generated by the fr-pron module.
    #
    # The function relies on `pagemsg`, `pagetitle`, `notes` and `expand_text`
    # from an enclosing scope that is not visible in this block. Templates in
    # the local parse tree are modified in place; no return statement is
    # visible here.
    parsed = blib.parse_text(sectext)

    pronun_templates = []
    verb_templates = []
    nonverb_templates = []
    # First pass: classify the headword templates in the section as verb or
    # non-verb; the result selects the pos= argument used below.
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in french_nonverb_head_templates:
            nonverb_templates.append(t)
        elif tn in french_verb_head_templates:
            verb_templates.append(t)
        elif tn == "head":
            if getparam(t, "1").strip() != "fr":
                pagemsg(
                    "WARNING: Saw wrong-language {{head}} template: %s" %
                    unicode(t))
            else:
                pos = getparam(t, "2").strip()
                if pos in french_verb_head_pos:
                    verb_templates.append(t)
                else:
                    nonverb_templates.append(t)
    if verb_templates and nonverb_templates:
        pagemsg(
            "WARNING: Saw both verb template(s) %s and non-verb template(s) %s, using pos=vnv"
            % (",".join(unicode(x) for x in verb_templates), ",".join(
                unicode(x) for x in nonverb_templates)))
    if not verb_templates and not nonverb_templates:
        pagemsg("WARNING: Didn't see any French templates")
    # Second pass: examine each {{IPA}} template.
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "IPA":
            # Find the full source line containing the template, for logging
            # and for the accent-spec check at the end.
            m = re.search("^.*?%s.*$" % re.escape(unicode(t)), sectext, re.M)
            if not m:
                pagemsg(
                    "WARNING: Couldn't find template %s in section text" %
                    unicode(t))
                line = "(unknown)"
            else:
                line = m.group(0)
            # With lang= the pronunciations start at 1=; otherwise 1= is the
            # language code and the pronunciations start at 2=.
            if t.has("lang"):
                first_param = 1
                lang = getparam(t, "lang")
            else:
                first_param = 2
                lang = getparam(t, "1")
            if lang != "fr":
                pagemsg(
                    "WARNING: Saw wrong-language {{IPA}} template: %s in line <%s>"
                    % (unicode(t), line))
                continue
            pron = getparam(t, str(first_param))
            if not pron:
                pagemsg(
                    "WARNING: No pronun in {{IPA}} template: %s in line <%s>"
                    % (unicode(t), line))
                continue
            # Only templates with exactly one pronunciation are handled.
            if getparam(t, str(first_param + 1)) or getparam(
                    t, str(first_param + 2)) or getparam(
                        t, str(first_param + 3)):
                pagemsg(
                    "WARNING: Multiple pronuns in {{IPA}} template: %s in line <%s>"
                    % (unicode(t), line))
                continue
            pos_val = ("vnv" if verb_templates and nonverb_templates else
                       "v" if verb_templates else "")
            pos_arg = "|pos=%s" % pos_val if pos_val else ""
            #autopron = expand_text("{{#invoke:User:Benwing2/fr-pron|show|%s%s}}" % (
            autopron = expand_text("{{#invoke:fr-pron|show|%s%s}}" %
                                   (pagetitle, pos_arg))
            # A falsy expansion result means there is nothing to compare with.
            if not autopron:
                continue
            # Strip enclosing /.../ or [...] and normalize r to ʁ before
            # comparing with the generated pronunciation.
            pron = re.sub("^/(.*)/$", r"\1", pron)
            pron = re.sub(r"^\[(.*)\]$", r"\1", pron)
            pron = pron.strip()
            pron = pron.replace("r", u"ʁ")
            # account for various common errors in Dawnraybot's generated pronunciations:
            # #1
            if pagetitle.endswith("rez") and pron.endswith(u"ʁɔe"):
                pron = re.sub(u"ʁɔe$", u"ʁe", pron)
            # #2
            if re.search("ai(s|t|ent)$", pagetitle) and pron.endswith(u"e"):
                pron = re.sub(u"e$", u"ɛ", pron)
            # #3
            if pos_val == "v" and pagetitle.endswith(
                    "ai") and pron.endswith(u"ɛ"):
                pron = re.sub(u"ɛ$", u"e", pron)
            # If the manual pronunciation has no syllable dots, drop them from
            # the generated one so the comparison ignores syllabification.
            if "." not in pron:
                autopron = autopron.replace(".", "")
            # Accept manual a where the generated pronunciation has ɑ, word
            # finally or before final m/t.
            if autopron.endswith(u"ɑ") and pron.endswith("a"):
                autopron = autopron[:-1] + "a"
            if re.search(ur"ɑ[mt]$", autopron) and re.search(
                    u"a[mt]$", pron):
                autopron = re.sub(ur"ɑ([mt])$", r"a\1", autopron)
            for i in xrange(2):
                # {{fr-IPA}} deletes schwa in the sequence V.Cə.CV esp. in the
                # sequence V.Cə.ʁV in verbs, whereas the bot-generated pronunciation
                # doesn't. We have separate cases depending on the identity of C,
                # which may go before or after the syllable break. Do it twice in
                # case it occurs twice in a row in a single word.
                pron = re.sub(
                    ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([jlmnɲwʃʒ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                    r"\1\2.\3", pron)
                pron = re.sub(
                    ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([szfvtdpbkɡ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                    r"\1.\2\3", pron)
            # {{fr-IPA}} converts sequences of Crj and Clj to Cri.j and Cli.j,
            # which is correct, but Dawnraybot doesn't do that.
            pron = re.sub(u"([szfvtdpbkɡ][ʁl])j", r"\1i.j", pron)
            # Only a syllable-division-only mismatch is tolerated; every other
            # kind of disagreement is logged and the template is left alone.
            allow_mismatch = False
            if pron != autopron:
                tempcall = "{{fr-IPA%s}}" % pos_arg
                if pron.replace(u"ɑ", "a") == autopron.replace(u"ɑ", "a"):
                    pagemsg(
                        u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɑ vs. a only: line <%s>"
                        % (unicode(t), tempcall, autopron, pron, line))
                elif re.sub(u"ɛ(.)", r"e\1", pron) == re.sub(u"ɛ(.)", r"e\1",
                                                              autopron):
                    pagemsg(
                        u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɛ vs. e only: line <%s>"
                        % (unicode(t), tempcall, autopron, pron, line))
                elif pron.replace(".", "") == autopron.replace(".", ""):
                    pagemsg(
                        "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable division only: line <%s>"
                        % (unicode(t), tempcall, autopron, pron, line))
                    allow_mismatch = True
                elif pron.replace(".", "").replace(" ", "") == autopron.replace(
                        ".", "").replace(" ", ""):
                    pagemsg(
                        "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable/word division only: line <%s>"
                        % (unicode(t), tempcall, autopron, pron, line))
                else:
                    pagemsg(
                        "WARNING: Can't replace %s with %s because auto-generated pron %s doesn't match %s: line <%s>"
                        % (unicode(t), tempcall, autopron, pron, line))
                if not allow_mismatch:
                    continue
            # Convert the template: drop the language and the manual
            # pronunciation, rename to fr-IPA, and add pos= when needed.
            origt = unicode(t)
            rmparam(t, "lang")
            rmparam(t, "1")
            rmparam(t, str(first_param))
            blib.set_template_name(t, "fr-IPA")
            if pos_val:
                t.add("pos", pos_val)
            notes.append(
                "replace manually-specified {{IPA|fr}} pronun with {{fr-IPA}}"
            )
            pagemsg("Replaced %s with %s: line <%s>" %
                    (origt, unicode(t), line))
            # An {{a|...}} accent qualifier on the same line is flagged.
            if "{{a|" in line:
                pagemsg(
                    "WARNING: Replaced %s with %s on a line with an accent spec: line <%s>"
                    % (origt, unicode(t), line))
def process_page(page, index, parsed):
    """Canonicalize Bulgarian verb and adjective headword templates.

    * ``{{head|bg|verb(s)}}`` / ``{{head|bg|adjective(s)}}`` carrying only
      ``1=``, ``2=``, ``head=`` (and ``g=m`` for adjectives) is converted to
      ``{{bg-verb}}`` / ``{{bg-adj}}`` with the headword in ``1=``.
    * An existing ``{{bg-verb}}`` / ``{{bg-adj}}`` has ``head=`` moved to
      ``1=`` and ``a=`` moved to ``2=``; the remaining non-blank params are
      re-added after them.

    The incoming ``parsed`` argument is ignored; the page text is re-parsed.

    Returns a ``(parsed, notes)`` tuple: the modified parse tree and the list
    of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if (tn == "head" and getparam(t, "1") == "bg"
                and getparam(t, "2") in ["verb", "verbs",
                                         "adjective", "adjectives"]):
            pos = getparam(t, "2")
            if pos in ["verb", "verbs"]:
                newtn = "bg-verb"
            else:
                newtn = "bg-adj"
            for param in t.params:
                # Named param_name/param_val so the module-level pname()
                # helper is not shadowed.
                param_name = unicode(param.name).strip()
                param_val = unicode(param.value).strip()
                if (param_name not in ["1", "2", "head", "g"] or
                        param_name == "g" and
                        (newtn != "bg-adj" or param_val != "m")):
                    pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s"
                            % (pos, param_name, param_val, origt))
                    break
            else:
                # No break: only convertible params are present.
                rmparam(t, "1")
                rmparam(t, "2")
                rmparam(t, "g")
                head = getparam(t, "head")
                rmparam(t, "head")
                blib.set_template_name(t, newtn)
                t.add("1",
                      rulib.remove_monosyllabic_accents(head or pagetitle))
                notes.append("convert {{head|bg|%s}} into {{%s}}"
                             % (pos, newtn))
        elif tn == "bg-verb" or tn == "bg-adj":
            # Notes are collected here and committed only if the template
            # text really changed, so an already-canonical template does not
            # add false entries to the edit summary.
            pending_notes = []
            if tn == "bg-adj":
                g = getparam(t, "g")
                if g and g != "m":
                    pagemsg("WARNING: Saw g=%s in %s" % (g, origt))
                    continue
                if t.has("g"):
                    rmparam(t, "g")
                    pending_notes.append("remove g=%s from {{%s}}" % (g, tn))
            head = getparam(t, "head") or getparam(t, "1")
            rmparam(t, "head")
            rmparam(t, "1")
            a = getparam(t, "a") or getparam(t, "2")
            rmparam(t, "a")
            rmparam(t, "2")
            if a in ["impf-pf", "pf-impf", "dual", "ip", "both"]:
                a = "both"
            elif a and a not in ["impf", "pf"]:
                # The unrecognized value is kept as-is and only flagged.
                pagemsg("WARNING: Unrecognized aspect %s in %s" % (a, origt))
            saved_params = []
            for param in t.params:
                param_name = unicode(param.name).strip()
                param_val = unicode(param.value).strip()
                if not param_val:
                    continue
                saved_params.append((param_name, param_val, param.showkey))
            # Erase all params.
            del t.params[:]
            # Put back new params.
            t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
            pending_notes.append("move head= to 1= in {{%s}}" % tn)
            if a:
                t.add("2", a)
                pending_notes.append("move a= to 2= in {{%s}}" % tn)
            for param_name, param_val, showkey in saved_params:
                t.add(param_name, param_val, showkey=showkey,
                      preserve_spacing=False)
            if unicode(t) != origt:
                notes.extend(pending_notes)

        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return parsed, notes
def process_page(page, index, parsed):
    """Canonicalize Bulgarian noun and proper-noun headword templates.

    * ``{{head|bg|noun}}`` / ``{{head|bg|proper noun}}`` (singular or plural
      POS name) with only recognized params is converted to ``{{bg-noun}}`` /
      ``{{bg-proper noun}}``: headword in ``1=``, genders in ``2=``/``g2=``...,
      and masculine/feminine equivalents in ``m=``/``f=`` chains.
    * An existing ``{{bg-noun}}`` / ``{{bg-proper noun}}`` has ``head=``/``sg=``
      moved to ``1=`` and all gender params folded into the ``2=``/``g2=``...
      chain, with combined codes such as ``mf`` split into single genders.

    The incoming ``parsed`` argument is ignored; the page text is re-parsed.

    Returns a ``(parsed, notes)`` tuple: the modified parse tree and the list
    of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if (tn == "head" and getparam(t, "1") == "bg"
                and getparam(t, "2") in ["noun", "nouns",
                                         "proper noun", "proper nouns"]):
            pos = getparam(t, "2")
            for param in t.params:
                param_name = unicode(param.name).strip()
                param_val = unicode(param.value).strip()
                if (param_name not in ["1", "2", "head", "g", "g2", "g3",
                                       "3", "4", "5", "6", "7", "8", "9",
                                       "10"] or
                        param_name == "3" and
                        param_val not in ["masculine", "feminine"] or
                        param_name in ["5", "7", "9"] and param_val != "or"):
                    pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s"
                            % (pos, param_name, param_val, origt))
                    break
            else:
                # No break: only convertible params are present.
                rmparam(t, "1")
                rmparam(t, "2")
                masc = []
                fem = []
                head = getparam(t, "head")
                rmparam(t, "head")
                genders = []

                def process_gender(g):
                    if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
                        genders.append(g)
                    else:
                        pagemsg("WARNING: Unrecognized gender '%s'" % g)

                for gparam in ["g", "g2", "g3"]:
                    gval = getparam(t, gparam)
                    if gval:
                        process_gender(gval)
                    rmparam(t, gparam)

                def handle_mf(array):
                    # 3= is the label, 4= the first term; further terms
                    # follow as "or"/term pairs starting at 5=.
                    array.append(getparam(t, "4"))
                    rmparam(t, "3")
                    rmparam(t, "4")
                    i = 5
                    while getparam(t, str(i)) == "or":
                        array.append(getparam(t, str(i + 1)))
                        rmparam(t, str(i))
                        rmparam(t, str(i + 1))
                        i += 2

                if getparam(t, "3") == "masculine":
                    handle_mf(masc)
                if getparam(t, "3") == "feminine":
                    handle_mf(fem)

                if pos in ["noun", "nouns"]:
                    newtn = "bg-noun"
                else:
                    newtn = "bg-proper noun"
                blib.set_template_name(t, newtn)
                t.add("1", head or pagetitle)
                blib.set_param_chain(t, genders, "2", "g")
                if masc:
                    blib.set_param_chain(t, masc, "m", "m")
                if fem:
                    blib.set_param_chain(t, fem, "f", "f")
                notes.append("convert {{head|bg|%s}} into {{%s}}"
                             % (pos, newtn))
        elif tn in ["bg-noun", "bg-proper noun"]:
            g = None
            cur1 = getparam(t, "1")
            if cur1 in ["m", "f"]:
                # Older form with the gender in 1=.
                g = cur1
            elif re.search("[a-zA-Z]", cur1):
                pagemsg("WARNING: Saw Latin in 1=%s in %s" % (cur1, origt))
                continue
            head = getparam(t, "head") or getparam(t, "sg")
            rmparam(t, "head")
            rmparam(t, "sg")
            genders = []

            def process_gender(g):
                if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
                    genders.append(g)
                elif g in ["mf", "fm"]:
                    genders.append("m")
                    genders.append("f")
                elif g in ["mn", "nm"]:
                    genders.append("m")
                    genders.append("n")
                elif g in ["fn", "nf"]:
                    genders.append("f")
                    genders.append("n")
                elif g in ["mfn", "fmn", "mnf", "nmf", "fnm", "nfm"]:
                    genders.append("m")
                    genders.append("f")
                    genders.append("n")
                else:
                    pagemsg("WARNING: Unrecognized gender '%s'" % g)

            if g:
                process_gender(g)
                rmparam(t, "1")
            g = getparam(t, "2")
            if g:
                process_gender(g)
            # Remove 2= once it has been folded into `genders`. Otherwise it
            # is saved below and re-added after set_param_chain(), overwriting
            # the canonical value (e.g. 2=mf would end up as 2=mf|g2=f).
            rmparam(t, "2")
            for gparam in ["g", "g2", "g3"]:
                gval = getparam(t, gparam)
                if gval:
                    process_gender(gval)
                rmparam(t, gparam)
            saved_params = []
            for param in t.params:
                param_name = unicode(param.name).strip()
                param_val = unicode(param.value).strip()
                if not param_val:
                    continue
                saved_params.append((param_name, param_val, param.showkey))
            # Erase all params.
            del t.params[:]
            # Put back new params. A non-gender 1= that was left in place is
            # among saved_params and so replaces the value added here.
            t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
            blib.set_param_chain(t, genders, "2", "g")
            for param_name, param_val, showkey in saved_params:
                t.add(param_name, param_val, showkey=showkey,
                      preserve_spacing=False)
            if origt != unicode(t):
                notes.append("move head=/sg= to 1=, g= to 2= in {{%s}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return parsed, notes
def process_page(page, index, parsed):
    """Rename legacy reference/usenet templates on one page and fix params.

    The page text is split on section headers and each section body is
    processed separately:

    * In a References section, and inside ``<ref>`` tags elsewhere,
      ``reference-journal``/``reference-news`` become ``cite-journal`` and
      ``reference-book`` becomes ``cite-book``.
    * Elsewhere, templates named in ``simple_replace`` are renamed,
      ``reference-journal``/``reference-news`` become ``quote-journal``,
      ``reference-book`` becomes ``quote-book``, and
      ``cite-usenet``/``quote-usenet`` become ``quote-newsgroup``.

    Parameters are adjusted along the way (see the nested helpers).

    The incoming ``parsed`` argument is ignored; each section is re-parsed.

    Returns ``(new_text, notes)``, or ``None`` if the page does not exist or
    is in a namespace that is not handled.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist")
        return

    if ":" in pagetitle and not re.search(
            "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):",
            pagetitle):
        pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
        return

    text = unicode(page.text)
    notes = []

    # Even indices hold section bodies, odd indices the header lines.
    subsections = re.split("(^==.*==\n)", text, 0, re.M)

    def move_param(t, fr, to, frob_from=None):
        """Rename param `fr` to `to`, optionally transforming its value.

        A blank `fr` is just removed. If `frob_from` returns a blank value
        nothing is done. A non-blank existing `to` blocks the move.
        """
        if t.has(fr):
            oldval = getparam(t, fr)
            if not oldval.strip():
                rmparam(t, fr)
                pagemsg("Removing blank param %s" % fr)
                return
            if frob_from:
                newval = frob_from(oldval)
                if not newval or not newval.strip():
                    return
            else:
                newval = oldval

            if getparam(t, to).strip():
                pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
                        % (fr, to, to, unicode(t)))
            elif oldval != newval:
                rmparam(t, to)  # in case of blank param
                # If either old or new name is a number, use remove/add to
                # automatically set the showkey value properly; else it's
                # safe to just change the name of the param, which will
                # preserve its location.
                if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
                    rmparam(t, fr)
                    t.add(to, newval)
                else:
                    tfr = t.get(fr)
                    tfr.name = to
                    tfr.value = newval
                pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"),
                                            to, newval.replace("\n", r"\n")))
            else:
                rmparam(t, to)  # in case of blank param
                # See comment above.
                if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
                    rmparam(t, fr)
                    t.add(to, newval)
                else:
                    t.get(fr).name = to
                pagemsg("%s -> %s" % (fr, to))

    def fix_page_params(t):
        """Strip 'p.'/'pp.' from page=/pages= and pick the right param name.

        Returns True if the template text changed.
        """
        origt = unicode(t)
        for param in ["page", "pages"]:
            pageval = getparam(t, param)
            if re.search(r"^\s*pp?\.\s*", pageval):
                pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
                t.add(param, pageval)
                notes.append("remove p(p). from %s=" % param)
                pagemsg("remove p(p). from %s=" % param)
        if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
            move_param(t, "pages", "page")
        # Unicode literal: as a byte string the en/em dashes in the character
        # class are UTF-8 bytes and cannot match dashes in unicode text.
        # NOTE(review): this only matches a number followed by a trailing
        # dash; a full range such as "12-15" may have been intended - confirm.
        if re.search(u"^[0-9]+[-\u2013\u2014]$", getparam(t, "page").strip()):
            move_param(t, "page", "pages")
        return origt != unicode(t)

    def fix_cite_book_params(t):
        """Map legacy book params to current names.

        Returns True if the template text changed.
        """
        origt = unicode(t)
        if getparam(t, "origyear").strip() and getparam(t, "year").strip():
            if getparam(t, "year_published"):
                pagemsg("WARNING: Would set year_published= but is already present: %s"
                        % unicode(t))
            else:
                rmparam(t, "year_published")  # in case of blank param
                t.get("year").name = "year_published"
                t.get("origyear").name = "year"
                pagemsg("year -> year_published, origyear -> year")
        move_param(t, "origdate", "date")
        move_param(t, "origmonth", "month")

        def frob_isbn(idval):
            # Strip a leading ISBN label; accept a bare value that starts
            # with a digit; otherwise refuse the move.
            isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
            if re.search(isbn_re, idval, re.I):
                return re.sub(isbn_re, r"\1", idval, 0, re.I)
            elif re.search(r"^[0-9]", idval.strip()):
                return idval
            else:
                pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '"
                        % idval.replace("\n", r"\n"))
                return None

        move_param(t, "id", "isbn", frob_isbn)
        fix_page_params(t)
        return origt != unicode(t)

    def fix_cite_usenet_params(t):
        """Rename group= and link=. Returns True if the template changed."""
        origt = unicode(t)
        move_param(t, "group", "newsgroup")
        move_param(t, "link", "url")
        return origt != unicode(t)

    def fix_quote_usenet_params(t):
        """Merge monthday=/year= into date= and name the numbered params.

        Returns True if the template text changed.
        """
        origt = unicode(t)
        monthday = getparam(t, "monthday").strip()
        year = getparam(t, "year").strip()
        if monthday and year:
            if getparam(t, "date"):
                pagemsg("WARNING: Would set date= but is already present: %s"
                        % unicode(t))
            else:
                rmparam(t, "date")  # in case of blank param
                param = t.get("monthday")
                param.name = "date"
                if re.search("^[0-9]+/[0-9]+$", monthday):
                    param.value = "%s/%s" % (monthday, year)
                else:
                    param.value = "%s %s" % (monthday, year)
                rmparam(t, "year")
                pagemsg("monthday/year -> date")
        move_param(t, "group", "newsgroup")
        move_param(t, "text", "passage")
        move_param(t, "6", "passage")
        move_param(t, "5", "url")
        move_param(t, "4", "newsgroup")
        move_param(t, "3", "title")
        move_param(t, "2", "author")
        move_param(t, "1", "date")
        return origt != unicode(t)

    def replace_in_reference(parsed, in_what):
        """Convert reference-* templates in `parsed` to their cite-* forms."""
        for t in parsed.filter_templates():
            rawname = unicode(t.name)
            origt = unicode(t)
            if rawname.strip() in ["reference-journal", "reference-news"]:
                set_template_name(t, "cite-journal", rawname)
                pagemsg("%s -> cite-journal" % rawname.strip())
                notes.append("%s -> cite-journal" % rawname.strip())
                fix_page_params(t)
                pagemsg("Replacing %s with %s in %s"
                        % (origt, unicode(t), in_what))
            if rawname.strip() == "reference-book":
                set_template_name(t, "cite-book", rawname)
                pagemsg("reference-book -> cite-book")
                fixed_params = fix_cite_book_params(t)
                notes.append("reference-book -> cite-book%s" % (
                    fixed_params and " and fix book cite params" or ""))
                pagemsg("Replacing %s with %s in %s"
                        % (origt, unicode(t), in_what))

    for j in xrange(0, len(subsections), 2):
        parsed = blib.parse_text(subsections[j])
        if j > 0 and re.search(r"^===*References===*\n", subsections[j - 1]):
            replace_in_reference(parsed, "==References== section")
            subsections[j] = unicode(parsed)
        else:
            for t in parsed.filter_tags():
                if unicode(t.tag) == "ref":
                    tagparsed = mw.wikicode.Wikicode([t])
                    replace_in_reference(tagparsed, "<ref>")
            subsections[j] = unicode(parsed)
            need_to_replace_double_quote_prefixes = False
            for t in parsed.filter_templates():
                rawname = unicode(t.name)
                origt = unicode(t)
                for fr, to in simple_replace:
                    if rawname.strip() == fr:
                        set_template_name(t, to, rawname)
                        pagemsg("%s -> %s" % (fr, to))
                        notes.append("%s -> %s" % (fr, to))
                        fix_page_params(t)
                        pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                if rawname.strip() in ["reference-journal", "reference-news"]:
                    set_template_name(t, "quote-journal", rawname)
                    pagemsg("%s -> quote-journal" % rawname.strip())
                    notes.append("%s -> quote-journal" % rawname.strip())
                    fix_page_params(t)
                    pagemsg("Replacing %s with %s outside of reference section"
                            % (origt, unicode(t)))
                if rawname.strip() == "reference-book":
                    set_template_name(t, "quote-book", rawname)
                    # The log line and note name quote-book, matching the
                    # rename actually performed on the line above.
                    pagemsg("reference-book -> quote-book")
                    fixed_params = fix_cite_book_params(t)
                    notes.append("reference-book -> quote-book%s" % (
                        fixed_params and " and fix book cite params" or ""))
                    pagemsg("Replacing %s with %s outside of reference section"
                            % (origt, unicode(t)))
                if rawname.strip() in ["cite-usenet", "quote-usenet"]:
                    if rawname.strip() == "cite-usenet":
                        fixed_params = fix_cite_usenet_params(t)
                    else:
                        fixed_params = fix_quote_usenet_params(t)
                    set_template_name(t, "quote-newsgroup", rawname)
                    pagemsg("%s -> quote-newsgroup" % rawname.strip())
                    prefix = getparam(t, "prefix").strip()
                    removed_prefix = False
                    if prefix:
                        if prefix in ["#", "#*"]:
                            parsed.insert_before(t, "#* ")
                            rmparam(t, "prefix")
                            pagemsg("remove prefix=%s, insert #* before template"
                                    % prefix)
                            need_to_replace_double_quote_prefixes = True
                            removed_prefix = True
                        else:
                            pagemsg("WARNING: Found prefix=%s, not # or #*: %s"
                                    % (prefix, unicode(t)))
                    notes.append("%s -> quote-newsgroup%s%s" % (
                        rawname.strip(),
                        removed_prefix and
                        ", remove prefix=%s, insert #* before template" % prefix
                        or "",
                        fixed_params and ", fix params" or ""))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
            subsections[j] = unicode(parsed)
            if need_to_replace_double_quote_prefixes:
                # Inserting "#* " before a template on a line that already
                # starts with "#* " doubles the prefix; collapse it.
                newval = re.sub(r"^#\* #\* ", "#* ", subsections[j], 0, re.M)
                if newval != subsections[j]:
                    notes.append("remove double #* prefix")
                    pagemsg("Removed double #* prefix")
                subsections[j] = newval

    return "".join(subsections), notes
def getrmparam(t, param):
    """Fetch `param` from template `t` with getparam(), remove it with
    rmparam(), and return the fetched value."""
    fetched = getparam(t, param)
    rmparam(t, param)
    return fetched
def process_page(page, index, parsed):
    """Modernize Bulgarian noun-form entries on one page.

    Three passes are made over the templates in ``parsed``:

    1. ``{{bg-noun-form}}`` is converted to ``{{head|bg|noun form}}``.
    2. ``{{bg-noun form of}}`` is converted to ``{{inflection of}}``. For
       every recognized Bulgarian inflection template, the lemma is replaced
       by its accented form and the matching accented form is written into
       ``head=`` of the preceding ``{{head|bg|noun form}}``.
    3. Templates listed in ``template_to_infl_codes`` are converted to
       ``{{inflection of}}`` with the corresponding inflection codes.

    Returns a ``(text, notes)`` tuple: the new page text and the list of
    change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
    for t in parsed.filter_templates():
        if tname(t) == "bg-noun-form":
            origt = unicode(t)
            must_continue = False
            for param in t.params:
                if pname(param) not in ["1", "2", "3", "head"]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s: %s"
                            % (pname(param), unicode(param.value), origt))
                    must_continue = True
                    break
            if must_continue:
                continue
            rmparam(t, "1")
            rmparam(t, "2")
            head = getparam(t, "head")
            rmparam(t, "head")
            g = getparam(t, "3")
            rmparam(t, "3")
            blib.set_template_name(t, "head")
            t.add("1", "bg")
            t.add("2", "noun form")
            if head:
                t.add("head", head)
            else:
                if bglib.needs_accents(pagetitle):
                    pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s"
                            % unicode(t))
                else:
                    t.add("head", pagetitle)
            if g:
                t.add("g", g)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

    # Pass 2: convert {{bg-noun form of}} and add accents.
    headt = None
    saw_infl_after_head = False
    saw_headt = False
    saw_inflt = False
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        # Slot name of the inflection in this template, or falsy if this is
        # not a usable inflection template.
        saw_infl = False
        already_fetched_forms = False
        if (tn == "head" and getparam(t, "1") == "bg"
                and getparam(t, "2") == "noun form"):
            saw_headt = True
            if headt and not saw_infl_after_head:
                pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection"
                        % (unicode(headt), origt))
            saw_infl_after_head = False
            headt = t
        if tn == "bg-noun form of":
            saw_inflt = True
            if not headt:
                pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s"
                        % origt)
                continue
            must_continue = False
            for param in t.params:
                if pname(param) not in ["1", "2", "3", "noun"]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s: %s"
                            % (pname(param), unicode(param.value), origt))
                    must_continue = True
                    break
            if must_continue:
                continue
            saw_infl_after_head = True
            noun = getparam(t, "noun")
            if not noun:
                pagemsg("WARNING: Didn't see noun=: %s" % origt)
                continue
            infls = []
            param2 = getparam(t, "2")
            if param2 == "indefinite":
                infls.append("indef")
            elif param2 == "definite":
                infls.append("def")
            elif param2 == "vocative":
                infls.append("voc")
            elif param2:
                pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
                continue
            param3 = getparam(t, "3")
            if param3 == "subject":
                infls.append("sbjv")
            elif param3 == "object":
                infls.append("objv")
            elif param3:
                pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
                continue
            param1 = getparam(t, "1")
            if param1 == "singular":
                infls.append("s")
            elif param1 == "plural":
                infls.append("p")
            elif param1 == "count":
                infls.extend(["count", "form"])
            elif param1 == "vocative":
                infls.extend(["voc", "s"])
            else:
                pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
                continue
            blib.set_template_name(t, "inflection of")
            del t.params[:]
            t.add("1", "bg")
            lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
            if not lemma:
                pagemsg("WARNING: Unable to find accented equivalent of %s: %s"
                        % (noun, origt))
                t.add("2", noun)
            else:
                t.add("2", lemma)
            t.add("3", "")
            for i, infl in enumerate(infls):
                t.add(str(i + 4), infl)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{bg-noun form of}} to {{inflection of}}")
            tn = tname(t)
            # Store the slot in `saw_infl`, the variable the accenting step
            # below tests. It was previously stored in a separate `saw_infls`
            # name, so the step was silently skipped.
            saw_infl = infls_to_slot(infls)
            already_fetched_forms = True
            if not saw_infl:
                pagemsg("WARNING: Unrecognized inflections %s: %s"
                        % ("|".join(infls), origt))
            elif not lemma:
                # No accented lemma was found (warned above), so there are no
                # forms to look the inflection up in.
                saw_infl = False
        elif tn == "inflection of" and getparam(t, "1") == "bg":
            saw_inflt = True
            infls = []
            i = 4
            while True:
                infl = getparam(t, str(i))
                if not infl:
                    break
                infls.append(infl)
                i += 1
            saw_infl = infls_to_slot(infls)
            if not saw_infl:
                if "vnoun" in infls:
                    pagemsg("Skipping verbal noun inflection %s: %s"
                            % ("|".join(infls), origt))
                elif "part" in infls:
                    pagemsg("Skipping participle inflection %s: %s"
                            % ("|".join(infls), origt))
                else:
                    pagemsg("WARNING: Unrecognized inflections %s: %s"
                            % ("|".join(infls), origt))
        elif tn == "definite singular of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "def_sg"
        elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "ind_pl"
        elif tn == "definite plural of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "def_pl"
        elif tn == "vocative singular of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "voc_sg"

        if saw_infl:
            if not already_fetched_forms:
                noun = getparam(t, "2")
                lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
                if not lemma:
                    pagemsg("WARNING: Unable to find accented equivalent of %s: %s"
                            % (noun, origt))
                    continue
                t.add("2", lemma)
                # Log and note only when the lemma really changed.
                if unicode(t) != origt:
                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                    notes.append("replace lemma with accented %s in {{%s}}"
                                 % (lemma, tn))
            if saw_infl == "def_sg":
                # "def_sg" has no form of its own; it is usable only when the
                # subject and object definite singular forms agree.
                def_sub_sg = forms.get("def_sub_sg", None)
                def_obj_sg = forms.get("def_obj_sg", None)
                if def_sub_sg != def_obj_sg:
                    pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s"
                            % (def_sub_sg, def_obj_sg))
                    continue
                form = def_sub_sg
            else:
                form = forms.get(saw_infl, None)
            if not form:
                pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s"
                        % (saw_infl, format_forms(forms)))
                continue
            form = form.split(",")
            filtered_form = [f for f in form
                             if bglib.remove_accents(f) == pagetitle]
            if not filtered_form:
                pagemsg("WARNING: No forms among %s=%s match page title"
                        % (saw_infl, ",".join(form)))
                continue
            form = filtered_form
            if not headt:
                # No head template has been seen yet, so there is nothing to
                # write the accented form into.
                pagemsg("WARNING: Saw inflection template without head template: %s"
                        % origt)
                continue
            existing_form = blib.fetch_param_chain(headt, "head", "head")
            if existing_form:
                must_continue = False
                for f in existing_form:
                    if bglib.remove_accents(f) != pagetitle:
                        pagemsg("WARNING: Existing head %s doesn't match page title: %s"
                                % (f, unicode(headt)))
                        must_continue = True
                        break
                if must_continue:
                    continue
                needs_accents = [bglib.needs_accents(f) for f in existing_form]
                if any(needs_accents) and not all(needs_accents):
                    pagemsg("WARNING: Some but not all existing heads missing accents: %s"
                            % unicode(headt))
                    continue
                if not any(needs_accents):
                    # Heads are already accented: only report a disagreement.
                    if existing_form != form:
                        pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s"
                                % (saw_infl, ",".join(existing_form),
                                   ",".join(form)))
                    continue
            origheadt = unicode(headt)
            blib.set_param_chain(headt, form, "head", "head")
            pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
            notes.append("add accented form %s=%s to {{head|bg|noun form}}"
                         % (saw_infl, ",".join(form)))

    if saw_headt and not saw_inflt:
        pagemsg("WARNING: Saw head template %s but no inflection template"
                % unicode(headt))

    # Pass 3: convert the specialized "... of" templates.
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn in template_to_infl_codes and getparam(t, "1") == "bg":
            must_continue = False
            for param in t.params:
                if pname(param) not in ["1", "2"]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s: %s"
                            % (pname(param), unicode(param.value), origt))
                    must_continue = True
                    break
            if must_continue:
                continue
            infl_codes = template_to_infl_codes[tn]
            blib.set_template_name(t, "inflection of")
            t.add("3", "")
            for i, infl in enumerate(infl_codes):
                t.add(str(i + 4), infl)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{%s}} to {{inflection of}}" % tn)

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Canonicalize Russian noun-form headword templates on one page.

    * ``{{head|ru|noun form}}`` is converted to ``{{ru-noun form}}``.
    * An existing ``{{ru-noun form}}`` has ``head=`` moved to ``1=`` and
      ``g=`` moved to ``2=``.

    In both cases the params are re-added in the order 1, head2, 2, g2, g3,
    tr, tr2. The page is abandoned if a template has unexpected params.

    The incoming ``parsed`` argument is ignored; the page text is re-parsed.

    Returns ``(new_text, notes)``, or ``None`` if the page is skipped.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []

    def readd_params(t, head, head2, g, g2, g3, tr, tr2):
        # Re-add the collected values in canonical order. 1= is added (even
        # if blank) whenever a gender follows it, so the gender lands in 2=.
        if head or g:
            t.add("1", head)
        if head2:
            t.add("head2", head2)
        if g:
            t.add("2", g)
        if g2:
            t.add("g2", g2)
        if g3:
            t.add("g3", g3)
        if tr:
            t.add("tr", tr)
        if tr2:
            t.add("tr2", tr2)

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        if (unicode(t.name) == "head" and getparam(t, "1") == "ru"
                and getparam(t, "2") == "noun form"):
            if getparam(t, "3"):
                pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s"
                        % unicode(t))
                return
            rmparam(t, "1")
            rmparam(t, "2")
            head = getrmparam(t, "head")
            head2 = getrmparam(t, "head2")
            tr = getrmparam(t, "tr")
            tr2 = getrmparam(t, "tr2")
            g = getrmparam(t, "g")
            g2 = getrmparam(t, "g2")
            g3 = getrmparam(t, "g3")
            if len(t.params) > 0:
                pagemsg("WARNING: Extra params in noun form template: %s"
                        % unicode(t))
                return
            t.name = "ru-noun form"
            readd_params(t, head, head2, g, g2, g3, tr, tr2)
            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))
                notes.append(
                    "convert {{head|ru|noun form}} to {{ru-noun form}}")
        elif unicode(t.name) == "ru-noun form":
            if getparam(t, "head") and getparam(t, "1"):
                pagemsg("WARNING: ru-noun form has both params 1= and head=: %s"
                        % unicode(t))
                return
            if getparam(t, "g") and getparam(t, "2"):
                pagemsg("WARNING: ru-noun form has both params 2= and g=: %s"
                        % unicode(t))
                return
            # Remove BOTH spellings of each param before choosing the value.
            # `getrmparam(a) or getrmparam(b)` short-circuits and leaves a
            # blank alternate param behind, which then trips the extra-params
            # check below and abandons the page.
            head_numbered = getrmparam(t, "1")
            head_named = getrmparam(t, "head")
            head = head_numbered or head_named
            head2 = getrmparam(t, "head2")
            tr = getrmparam(t, "tr")
            tr2 = getrmparam(t, "tr2")
            g_numbered = getrmparam(t, "2")
            g_named = getrmparam(t, "g")
            g = g_numbered or g_named
            g2 = getrmparam(t, "g2")
            g3 = getrmparam(t, "g3")
            if len(t.params) > 0:
                pagemsg("WARNING: Extra params in noun form template: %s"
                        % unicode(t))
                return
            readd_params(t, head, head2, g, g2, g3, tr, tr2)
            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))
                notes.append("canonicalize ru-noun form")

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Move lang= to 1= in {{prefixusex}}/{{suffixusex}} and drop inline=.

    When a template uses lang=, all its params are erased and re-added in a
    fixed order: the language as 1=, the two terms as 2= and 3= with their
    alt/pos/t annotations, then any other non-numbered params. A non-blank
    inline= is removed. The incoming ``parsed`` argument is ignored; the
    page is re-parsed.

    Returns ``(new_text, notes)``, or ``None`` when the title has a colon.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    # Names handled explicitly and so excluded from the leftover params.
    handled_names = ["lang", "t1", "gloss1", "t2", "gloss2", "alt1", "alt2",
                     "pos1", "pos2", "altpref", "altsuf"]

    for tmpl in parsed.filter_templates():
        before = unicode(tmpl)
        if tname(tmpl) == "prefixusex":
            has_prefix_term = (getparam(tmpl, "1").endswith("-") or
                               getparam(tmpl, "2").endswith("-"))
            if has_prefix_term:
                pagemsg("WARNING: Has prefix as term: %s" % before)
        if tname(tmpl) == "suffixusex":
            has_suffix_term = (getparam(tmpl, "1").startswith("-") or
                               getparam(tmpl, "2").startswith("-"))
            if has_suffix_term:
                pagemsg("WARNING: Has suffix as term: %s" % before)
        if tname(tmpl) in ["prefixusex", "suffixusex"]:
            if getparam(tmpl, "lang"):
                pagemsg("WARNING: Uses lang= param: %s" % before)
                lang = getparam(tmpl, "lang")
                # Optional params in output order; each is re-added only if
                # its value is non-empty.
                ordered = [
                    ("altpref", getparam(tmpl, "altpref")),
                    ("2", getparam(tmpl, "1")),
                    ("alt1", getparam(tmpl, "alt1")),
                    ("pos1", getparam(tmpl, "pos1")),
                    ("t1", getparam(tmpl, "t1") or getparam(tmpl, "gloss1")),
                    ("altsuf", getparam(tmpl, "altsuf")),
                    ("3", getparam(tmpl, "2")),
                    ("alt2", getparam(tmpl, "alt2")),
                    ("pos2", getparam(tmpl, "pos2")),
                    ("t2", getparam(tmpl, "t2") or getparam(tmpl, "gloss2")),
                ]
                # Fetch remaining non-numbered params.
                leftovers = []
                for param in tmpl.params:
                    key = unicode(param.name)
                    if re.search(r"^[0-9]+$", key):
                        continue
                    if key in handled_names:
                        continue
                    leftovers.append((key, param.value))
                # Erase all params.
                del tmpl.params[:]
                # Put back params in proper order, then the remaining
                # non-numbered params.
                tmpl.add("1", lang)
                for key, val in ordered:
                    if val:
                        tmpl.add(key, val)
                for key, val in leftovers:
                    tmpl.add(key, val)
                notes.append("Move lang= to 1= in prefixusex/suffixusex")
            if getparam(tmpl, "inline"):
                rmparam(tmpl, "inline")
                notes.append("Remove inline= in prefixusex/suffixusex")
        after = unicode(tmpl)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    return unicode(parsed), notes
def process_page_section(index, page, section, verbose):
    """Convert an old-style Russian noun headword in one section to the
    "+" headword form driven by the section's {{ru-noun-table}}.

    Args:
        index: Running page index, used only in log messages.
        page: Page object; used for its title and existence check.
        section: Wikitext of the section to process.
        verbose: If true, log extra detail (also passed to
            ``blib.expand_text``).

    Returns:
        None if the section must be skipped, otherwise a 5-tuple
        ``(new_section_text, ru_noun_changed, ru_proper_noun_changed,
        bian_replaced, frobbed_manual_translit)``.  The three middle
        values are 0/1 flags; the last is a list that is either empty or
        holds the headword manual translit that was copied into a
        declension template.  When the section has no {{ru-noun-table}}
        or no old-style headword, the text is returned with all flags 0.
    """
    pagetitle = unicode(page.title())
    # Page title with any leading "Namespace:" part removed.
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if len(noun_table_templates) < 1:
        if noun_old_templates:
            pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" % ", ".join(unicode(x) for x in noun_old_templates))
        # Nothing to convert: return the section unchanged.
        return unicode(parsed), 0, 0, 0, []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, []

    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
    headword_template = headword_templates[0]
    frobbed_manual_translit = []
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    # Retrieve headword translit and maybe transfer to decl
    headword_tr = getparam(headword_template, "tr")
    if headword_tr:
        if verbose:
            pagemsg("Found headword manual translit tr=%s" % headword_tr)
        if "," in headword_tr:
            pagemsg("WARNING: Comma in headword manual translit, skipping: %s" % headword_tr)
            return None
        # Punt if multi-arg-set, can't handle yet
        for decl_template in decl_templates:
            for param in decl_template.params:
                # Only positional (keyless) arguments are inspected.
                if not param.showkey:
                    val = unicode(param.value)
                    if val == "or":
                        pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
                        return None
                    if val == "-" or val == "_" or val.startswith("join:"):
                        pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
                        return None
            for i in xrange(2, 10):
                if getparam(headword_template, "tr%s" % i):
                    pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
                        i, unicode(headword_template)))
                    return None
            # The lemma is argument 2 when argument 1 is a stress pattern,
            # else argument 1.
            if runoun.arg1_is_stress(getparam(decl_template, "1")):
                lemma_arg = "2"
            else:
                lemma_arg = "1"
            lemmaval = getparam(decl_template, lemma_arg)
            if not lemmaval:
                lemmaval = subpagetitle
            # "LEMMA//TRANSLIT" is how the translit is attached to the lemma.
            if "//" in lemmaval:
                m = re.search("^(.*?)//(.*)$", lemmaval)
                if m.group(2) != headword_tr:
                    pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
                        lemmaval, headword_tr))
                    return None
                else:
                    pagemsg("Already found manual translit in decl template %s" % lemmaval)
            else:
                lemmaval += "//" + headword_tr
                orig_decl_template = unicode(decl_template)
                decl_template.add(lemma_arg, lemmaval)
                pagemsg("Replacing decl %s with %s" % (orig_decl_template, unicode(decl_template)))
                frobbed_manual_translit = [headword_tr]

    genders = blib.fetch_param_chain(headword_template, "2", "g")

    bian_replaced = 0

    # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
    # headword template
    for decl_template in decl_templates:
        if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
            # Index of the first gender spec containing "in" / "an"; -1 if
            # none seen.
            saw_in = -1
            saw_an = -1
            for i,g in enumerate(genders):
                if re.search(r"\bin\b", g) and saw_in < 0:
                    saw_in = i
                if re.search(r"\ban\b", g) and saw_an < 0:
                    saw_an = i
            if saw_in >= 0 and saw_an >= 0:
                orig_decl_template = unicode(decl_template)
                if saw_in < saw_an:
                    pagemsg("Replacing a=bi with a=ia in decl template")
                    decl_template.add("a", "ia")
                    bian_replaced = 1
                else:
                    pagemsg("Replacing a=bi with a=ai in decl template")
                    decl_template.add("a", "ai")
                    bian_replaced = 1
                pagemsg("Replacing decl %s with %s" % (orig_decl_template, unicode(decl_template)))

    # Expand the declension template as {{ru-generate-noun-args}} to obtain
    # the generated arguments.
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
    args = ru.split_generate_args(generate_result)

    genders = runoun.check_old_noun_headword_forms(headword_template, args,
        subpagetitle, pagemsg)
    if genders == None:
        return None

    new_params = []
    for param in noun_table_template.params:
        new_params.append((param.name, param.value))

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runoun.fix_old_headword_params(headword_template,
        new_params, genders, pagemsg)
    if params_to_preserve == None:
        return None

    if unicode(headword_template.name) == "ru-proper noun":
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(headword_template, "n"):
            pagemsg("Adding n=both to headword tempate")
            headword_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = ru.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
                else:
                    pagemsg("Removing n=sg from headword tempate")
                    rmparam(headword_template, "n")
            else:
                pagemsg("WARNING: Unable to remove n= from headword template because n=%s" % ndef_args["n"])

    headword_template.params.extend(params_to_preserve)
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    # Rename the headword template to its "+" counterpart and record which
    # kind was changed.
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        ru_noun_changed = 1
    else:
        headword_template.name = "ru-proper noun+"
        ru_proper_noun_changed = 1

    pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

    return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
def process_page(page, index, parsed):
    """Disabled converter from {{ru-conj-TYPE|...}} to {{ru-conj|TYPE|...}}.

    The function logs a warning and returns None immediately.  Everything
    after that early return is retained for reference but never executes.
    The retained code renames each ``ru-conj-*`` template (except
    ``ru-conj-verb-see``) to ``ru-conj``, folds the variant argument into
    the conjugation type, shifts the numbered arguments up by one and puts
    the conjugation type in 1=.

    Args:
        page: Page object; its title is used for logging.
        index: Running page index, used only in log messages.
        parsed: Unused before the early return.

    Returns:
        None (the ``(text, notes)`` return below is unreachable).
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable from here on; kept for whoever revives the script ----
    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    # Number of the argument holding the variant spec, per conjugation type.
    variant_arg_groups = (
        (3, ("3oa", "4a", "4b", "4c", "6a", "6c", "11a", "16a", "16b",
             u"irreg-дать", u"irreg-клясть", u"irreg-быть")),
        (4, ("5a", "5b", "5c", "6b", "9a", "9b", "11b", "14a", "14b",
             "14c")),
        (5, ("7b",)),
        (6, ("7a",)),
    )

    for template in parsed.filter_templates():
        before = unicode(template)
        template_name = unicode(template.name)
        if (template_name.startswith("ru-conj-")
                and template_name != "ru-conj-verb-see"):
            conjtype = re.search("^ru-conj-(.*)$", template_name).group(1)
            template.name = "ru-conj"

            varargno = None
            for argno, conjtypes in variant_arg_groups:
                if conjtype in conjtypes:
                    varargno = argno
                    break

            variant = None
            if varargno:
                variant = getparam(template, str(varargno))
                if re.search("^[abc]", variant):
                    variant = "/" + variant
                # Blank the variant slot if later arguments exist,
                # otherwise remove it outright.
                later_args_present = any(
                    getparam(template, str(varargno + offset))
                    for offset in (1, 2, 3))
                if later_args_present:
                    template.add(str(varargno), "")
                else:
                    rmparam(template, str(varargno))
                conjtype = conjtype + variant

            suffix = " (and move variant spec)" if variant else ""
            notes.append(
                "ru-conj-* -> ru-conj, moving params up by one%s" % suffix)

            # Walk from the highest argument down so each value is copied
            # before its slot is overwritten.
            seen_value = False
            for argno in xrange(20, 0, -1):
                value = getparam(template, str(argno))
                if value:
                    seen_value = True
                if seen_value:
                    template.add(str(argno + 1), value)
            template.add("1", conjtype)
            blib.sort_params(template)

        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    return unicode(parsed), notes
def process_page(index, page, save, verbose):
    """Replace {{ru-decl-noun-see}} with a {{ru-noun-table}} built from the
    page's {{ru-noun+}} / {{ru-proper noun+}} headword template.

    The page is skipped (with a warning) unless it contains exactly one
    headword template and exactly one {{ru-decl-noun-see}}.

    Args:
        index: Running page index, used only in log messages.
        page: Page object to read and, if ``save`` is true, write.
        save: If true, save the page; otherwise only log what would happen.
        verbose: Passed through to ``blib.expand_text``.

    Returns:
        None in all cases.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, title, pagemsg, verbose)

    pagemsg("Processing")

    parsed = blib.parse(page)

    headword = None
    see = None
    for tmpl in parsed.filter_templates():
        name = unicode(tmpl.name)
        if name in ("ru-noun+", "ru-proper noun+"):
            if headword:
                pagemsg("WARNING: Multiple headword templates, skipping")
                return
            headword = tmpl
        elif name == "ru-decl-noun-see":
            if see:
                pagemsg("WARNING: Multiple ru-decl-noun-see templates, skipping")
                return
            see = tmpl

    if not headword:
        pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
        return
    if not see:
        pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
        return

    # Turn the "see" template into a copy of the headword's arguments.
    del see.params[:]
    for param in headword.params:
        see.add(param.name, param.value)
    see.name = "ru-noun-table"

    if unicode(headword.name) == "ru-proper noun+":
        # Proper nouns default to n=sg whereas ru-noun-table defaults to
        # n=both, so both templates are expanded to compare the value of n.

        # Generate args for the proper-noun headword, adding |ndef=sg
        # because ru-generate-noun-args would otherwise default to both.
        headword_call = re.sub(
            r"\}\}$", "|ndef=sg}}",
            re.sub(r"^\{\{ru-proper noun\+", "{{ru-generate-noun-args",
                   unicode(headword)))
        headword_result = expand_text(headword_call)
        if not headword_result:
            pagemsg("WARNING: Error generating ru-proper noun+ args")
            return None
        headword_n = ru.split_generate_args(headword_result)["n"]

        if headword_n == "s":
            # Singular must always be explicit in ru-noun-table.
            see.add("n", "sg")
        elif headword_n != "p":
            # (Plural is left alone: both templates default to plural only
            # if the lemma is plural, else n=pl is set for both.)
            # n=both was explicit in the headword but is the default in
            # ru-noun-table unless the lemma is plural; drop it, expand,
            # and restore it only if the result is not "b".
            assert headword_n == "b"
            rmparam(see, "n")
            see_call = re.sub(r"^\{\{ru-noun-table",
                              "{{ru-generate-noun-args", unicode(see))
            see_result = expand_text(see_call)
            if not see_result:
                pagemsg("WARNING: Error generating ru-noun-table args")
                return None
            if ru.split_generate_args(see_result)["n"] != "b":
                see.add("n", "both")

    comment = "Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)" % unicode(headword.name)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
    else:
        pagemsg("Saving with comment = %s" % comment)
        page.text = unicode(parsed)
        page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Canonicalize Russian verb-form definitions to {{inflection of}}.

    Only the ==Russian== section of the page is touched.  The section is
    run through a series of passes, each re-parsing the output of the
    previous one: 'conjugation of' is renamed to 'inflection of';
    'present or future' definitions are split in two; long form codes are
    abbreviated; blank form codes are dropped and parameters reordered;
    imperative/past/present/future codes are put in a canonical order; and
    raw-text definitions ("present participle active of [[...]]" etc.) are
    converted into {{inflection of}} calls.

    Args:
        index: Running page index, used only in log messages.
        page: Page object to read and, if ``save`` is true, write.
        save: If true, save the page; otherwise only log what would happen.
        verbose: If true, log the full before/after page text.

    Returns:
        None in all cases.  Pages whose title contains a colon or that
        have more than one Russian section are skipped.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []

    foundrussian = False
    # Because of the capturing group, the split yields section headers at
    # odd indices and section bodies at even indices.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for j in xrange(2, len(sections), 2):
        if sections[j-1] == "==Russian==\n":
            if foundrussian:
                pagemsg("WARNING: Found multiple Russian sections, skipping page")
                return
            foundrussian = True

            # Try to canonicalize existing 'conjugation of'
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
                    origt = unicode(t)
                    t.name = "inflection of"
                    newt = unicode(t)
                    if origt != newt:
                        pagemsg("Replaced %s with %s" % (origt, newt))
                        notes.append("converted 'conjugation of' to 'inflection of'")
            sections[j] = unicode(parsed)

            # Try to split 'inflection of' containing 'present or future' into two
            # defns
            newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
                r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
                sections[j], 0, re.M)
            if newsec != sections[j]:
                notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
                sections[j] = newsec

            # Convert 'indc' to 'ind', 'futr' to 'fut', 'perfective' and
            # '(perfective)' to 'pfv', 'imperfective' and '(imperfective)' to 'impfv',
            # 'impr' to 'imp'
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    for frm, to in [
                        ("indc", "ind"), ("indicative", "ind"),
                        ("futr", "fut"), ("future", "fut"),
                        ("impr", "imp"), ("imperative", "imp"),
                        ("perfective", "pfv"), ("(perfective)", "pfv"),
                        ("imperfective", "impfv"), ("(imperfective)", "impfv"),
                        ("singular", "s"), ("(singular)", "s"),
                        ("plural", "p"), ("(plural)", "p"),
                        ("masculine", "m"), ("(masculine)", "m"),
                        ("feminine", "f"), ("(feminine)", "f"),
                        ("neuter", "n"), ("(neuter)", "n"),
                        ("neutral", "n"), ("(neutral)", "n"),
                    ]:
                        # Snapshot per replacement pair so that each pair
                        # gets its own log line and note.
                        origt = unicode(t)
                        for i in xrange(3,20):
                            val = getparam(t, str(i))
                            if val == frm:
                                t.add(str(i), to)
                        newt = unicode(t)
                        if origt != newt:
                            pagemsg("Replaced %s with %s" % (origt, newt))
                            notes.append("converted '%s' form code to '%s'" % (frm, to))
            sections[j] = unicode(parsed)

            # Remove blank form codes and canonicalize position of lang=, tr=
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    origt = unicode(t)
                    # Fetch the numbered params starting with 3, skipping blank ones
                    numbered_params = []
                    for i in xrange(3,20):
                        val = getparam(t, str(i))
                        if val:
                            numbered_params.append(val)
                    # Fetch param 1 and param 2, and non-numbered params except lang=
                    # and nocat=.
                    param1 = getparam(t, "1")
                    param2 = getparam(t, "2")
                    tr = getparam(t, "tr")
                    # nocat= is read only to report its removal; it is not
                    # put back below.
                    nocat = getparam(t, "nocat")
                    non_numbered_params = []
                    for param in t.params:
                        pname = unicode(param.name)
                        if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
                            non_numbered_params.append((pname, param.value))
                    # Erase all params.
                    del t.params[:]
                    # Put back lang, param 1, param 2, tr, then the replacements for the
                    # higher numbered params, then the non-numbered params.
                    t.add("lang", "ru")
                    t.add("1", param1)
                    t.add("2", param2)
                    if tr:
                        t.add("tr", tr)
                    for i, param in enumerate(numbered_params):
                        t.add(str(i+3), param)
                    for name, value in non_numbered_params:
                        t.add(name, value)
                    newt = unicode(t)
                    if origt != newt:
                        pagemsg("Replaced %s with %s" % (origt, newt))
                        notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
                        if nocat:
                            notes.append("removed nocat=")
            sections[j] = unicode(parsed)

            # Try to canonicalize 'inflection of' involving the imperative,
            # present, future
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    # Fetch the numbered params starting with 3
                    numbered_params = []
                    for i in xrange(3,20):
                        val = getparam(t, str(i))
                        if val:
                            numbered_params.append(val)
                    while len(numbered_params) > 0 and not numbered_params[-1]:
                        del numbered_params[-1]
                    # Now canonicalize
                    numparamstr = "/".join(numbered_params)
                    numparamset = set(numbered_params)
                    canon_params = []
                    # Single-pass "loop": the body always ends in break.
                    while True:
                        if numparamset == {'s', 'pfv', 'imp'}:
                            canon_params = ['2', 's', 'pfv', 'imp']
                        elif numparamset == {'s', 'impfv', 'imp'}:
                            canon_params = ['2', 's', 'impfv', 'imp']
                        elif numparamset == {'s', 'imp'}:
                            canon_params = ['2', 's', 'imp']
                        elif numparamset == {'p', 'pfv', 'imp'}:
                            canon_params = ['2', 'p', 'pfv', 'imp']
                        elif numparamset == {'p', 'impfv', 'imp'}:
                            canon_params = ['2', 'p', 'impfv', 'imp']
                        elif numparamset == {'p', 'imp'}:
                            canon_params = ['2', 'p', 'imp']
                        elif numparamset == {'m', 's', 'past'}:
                            canon_params = ['m', 's', 'past', 'ind']
                        elif numparamset == {'f', 's', 'past'}:
                            canon_params = ['f', 's', 'past', 'ind']
                        elif numparamset == {'n', 's', 'past'}:
                            canon_params = ['n', 's', 'past', 'ind']
                        elif numparamset == {'p', 'past'}:
                            canon_params = ['p', 'past', 'ind']
                        else:
                            # Person/number/tense given in exactly that
                            # order with no mood: append 'ind'.
                            m = re.search(r"^([123])/([sp])/(pres|fut)$", numparamstr)
                            if m:
                                canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
                        break
                    if canon_params:
                        origt = unicode(t)
                        # Fetch param 1 and param 2. Erase all numbered params.
                        # Put back param 1 and param 2 (this will put them after lang=ru),
                        # then the replacements for the higher params.
                        param1 = getparam(t, "1")
                        param2 = getparam(t, "2")
                        for i in xrange(19,0,-1):
                            rmparam(t, str(i))
                        t.add("1", param1)
                        t.add("2", param2)
                        for i, param in enumerate(canon_params):
                            t.add(str(i+3), param)
                        newt = unicode(t)
                        if origt != newt:
                            pagemsg("Replaced %s with %s" % (origt, newt))
                            notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
                        else:
                            pagemsg("Apparently already canonicalized: %s" % newt)
            sections[j] = unicode(parsed)

            # Try to add 'inflection of' to raw-specified participial inflection
            def add_participle_inflection_of(m):
                # Regex callback: groups are (prefix, tense, voice, lemma).
                prefix = m.group(1)
                tense = m.group(2).lower()
                if tense == "present":
                    tense = "pres"
                voice = m.group(3).lower()
                if voice == "active":
                    voice = "act"
                elif voice == "passive":
                    voice = "pass"
                elif voice == "adverbial":
                    voice = "adv"
                lemma = m.group(4)
                retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
                return retval
            # Two word orders are handled: "participle VOICE of" and
            # "VOICE participle of".
            newsec = re.sub(r"(# |\()'*(present|past) participle (active|passive|adverbial) of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*(present|past) (active|passive|adverbial) participle of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified past inflection
            def add_past_inflection_of(m):
                # Regex callback: groups are (prefix, gender-or-plural, lemma).
                prefix = m.group(1)
                gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
                    "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
                lemma = m.group(3)
                # Gendered forms get an extra "s" code; plural does not.
                retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender, gender != "p" and "|s" or "")
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender, gender != "p" and "/s" or ""))
                return retval
            newsec = re.sub(r"(# |\()'*(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified imperative inflection
            def add_imper_inflection_of(m):
                # Regex callback: groups are (prefix, number, lemma).
                prefix = m.group(1)
                number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
                lemma = m.group(3)
                retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
                return retval
            newsec = re.sub(r"(# |\()'*(singular|plural) imperative (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*imperative (singular|plural) (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified finite pres/fut inflection
            def add_pres_fut_inflection_of(m):
                # Regex callback: groups are (prefix, ordinal person,
                # number, tense, lemma).
                prefix = m.group(1)
                # First character of "1st"/"2nd"/"3rd".
                person = m.group(2)[0]
                number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
                tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
                lemma = m.group(5)
                retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
                return retval
            newsec = re.sub(r"(# |\()'*(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_pres_fut_inflection_of,
                sections[j], 0, re.I)
            sections[j] = newsec

    new_text = "".join(sections)

    if new_text != text:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (text, new_text))
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = new_text
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose, romaji_to_keep):
    """Clean up the parameters of Japanese headword templates on a page.

    For each {{ja-noun}}, {{ja-adj}}, {{ja-verb}} and {{ja-pos}}:
    an old script code in 1= is removed and the other numbered parameters
    shifted down; hira=/kata= are turned into numbered parameters placed
    before all named ones; rom= is removed unless the page title is in
    ``romaji_to_keep``; hidx= is removed.

    Args:
        index: Running page index, used only in log messages.
        page: Page object to read and, if ``save`` is true, write.
        save: If true, save the page; otherwise only log what would happen.
        verbose: If true, log the full before/after page text.
        romaji_to_keep: Container of page titles whose rom= must be kept.

    Returns:
        None in all cases.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def is_numbered(param_name):
        return re.search(r"^[0-9]+$", param_name)

    pagemsg("Processing")

    original_text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for template in parsed.filter_templates():
        tname = unicode(template.name)
        if tname not in ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]:
            continue
        before = unicode(template)

        # Remove old script code and renumber what followed it.
        script_code = getparam(template, "1")
        if script_code in ["r", "h", "ka", "k", "s", "ky", "kk"]:
            pagemsg("Removing 1=%s: %s" % (script_code, unicode(template)))
            notes.append("remove 1=%s from %s" % (script_code, tname))
            rmparam(template, "1")
            for param in template.params:
                key = unicode(param.name)
                if is_numbered(key):
                    param.name = str(int(key) - 1)
                    param.showkey = False

        # Convert hira= and/or kata= to numbered params.  The template is
        # rebuilt from scratch so numbered params always precede named ones.
        if template.has("hira") or template.has("kata"):
            positional = []
            named = []
            for param in template.params:
                key = unicode(param.name)
                if is_numbered(key):
                    value = unicode(param.value)
                    # Blank numbered params are dropped.
                    if value:
                        positional.append(value)
                elif key not in ["hira", "kata"]:
                    named.append((key, param.value))

            hira = getparam(template, "hira")
            if hira:
                positional.append(hira)
                pagemsg("Moving hira=%s to %s=: %s" % (hira, len(positional), unicode(template)))
                notes.append("move hira= to %s= in %s" % (len(positional), tname))

            kata = getparam(template, "kata")
            if kata:
                positional.append(kata)
                pagemsg("Moving kata=%s to %s=: %s" % (kata, len(positional), unicode(template)))
                notes.append("move kata= to %s= in %s" % (len(positional), tname))

            del template.params[:]
            for position, value in enumerate(positional):
                template.add(str(position + 1), value)
            for key, value in named:
                template.add(key, value)

        # Remove rom= if not in list of pages to keep rom=
        if template.has("rom"):
            if title in romaji_to_keep:
                pagemsg("Keeping rom=%s because in romaji_to_keep: %s" % (
                    getparam(template, "rom"), unicode(template)))
            else:
                pagemsg("Removing rom=%s: %s" % (getparam(template, "rom"), unicode(template)))
                rmparam(template, "rom")
                notes.append("remove rom= from %s" % tname)

        # Remove hidx=
        if template.has("hidx"):
            pagemsg("Removing hidx=%s: %s" % (getparam(template, "hidx"), unicode(template)))
            rmparam(template, "hidx")
            notes.append("remove hidx= from %s" % tname)

        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
    else:
        pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Remove redundant parameters from {{fr-adj}} templates on a page.

    For each {{fr-adj}}, the feminine and plural forms that would be
    derived from the page title are computed, and explicit f=, mp=, fp=,
    p= and 1=mf parameters that merely restate them are removed.
    Unexpected combinations are logged as warnings, and most of them stop
    further processing of that template.

    Args:
        index: Running page index, used only in log messages.
        page: Page object to read and, if ``save`` is true, write.
        save: If true, save the page; otherwise only log what would happen.
        verbose: Not used by this function.

    Returns:
        None in all cases.  Pages whose title contains a colon are skipped.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    text = unicode(page.text)

    notes = []
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        name = unicode(t.name)
        if unicode(t.name) == "fr-adj":
            # 1= is only meaningful as "mf"; anything else is removed.
            g = getparam(t, "1")
            if g and g != "mf":
                pagemsg("WARNING: Strange value 1=%s, removing: %s" % (g, unicode(t)))
                rmparam(t, "1")
                notes.append("remove bogus 1=%s" % g)
                g = None
            inv = getparam(t, "inv")
            if inv:
                if inv not in ["y", "yes", "1"]:
                    pagemsg("WARNING: Strange value inv=%s: %s" % (inv, unicode(t)))
                if (getparam(t, "1") or getparam(t, "f") or
                        getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
                    pagemsg("WARNING: Found extraneous params with inv=: %s" %
                        unicode(t))
                # NOTE(review): this continue is taken to apply to every
                # template with inv=, not only those with extraneous
                # params — confirm against the original layout.
                continue
            if (getparam(t, "f2") or getparam(t, "mp2") or getparam(t, "fp2")
                    or getparam(t, "p2")):
                pagemsg("Skipping multiple feminines or plurals: %s" % unicode(t))
                continue
            # Masculine plural derived from the title: unchanged after
            # -s/-x, -al -> -aux, otherwise add -s.
            expected_mp = (pagetitle if re.search("[sx]$", pagetitle)
                else re.sub("al$", "aux", pagetitle) if pagetitle.endswith("al")
                else pagetitle + "s")
            if getparam(t, "mp") == expected_mp:
                rmparam(t, "mp")
                notes.append("remove redundant mp=")
            # Feminine derived from the title; the first matching suffix
            # rule wins, so order matters (e.g. -ieur and -teur are tested
            # before -eur).
            expected_fem = (pagetitle if pagetitle.endswith("e")
                else pagetitle + "ne" if pagetitle.endswith("en")
                else re.sub("er$", u"ère", pagetitle) if pagetitle.endswith("er")
                else pagetitle + "le" if pagetitle.endswith("el")
                else pagetitle + "ne" if pagetitle.endswith("on")
                else pagetitle + "te" if pagetitle.endswith("et")
                else pagetitle + "e" if pagetitle.endswith("ieur")
                else re.sub("teur$", "trice", pagetitle) if pagetitle.endswith("teur")
                else re.sub("eur$", "euse", pagetitle) if pagetitle.endswith("eur")
                else re.sub("eux$", "euse", pagetitle) if pagetitle.endswith("eux")
                else re.sub("if$", "ive", pagetitle) if pagetitle.endswith("if")
                else re.sub("c$", "que", pagetitle) if pagetitle.endswith("c")
                else pagetitle + "e")
            if re.search("(el|on|et|[^i]eur|eux|if|c)$", pagetitle) and not getparam(t, "f") and g != "mf":
                pagemsg("WARNING: Found suffix -el/-on/-et/-[^i]eur/-eux/-if/-c and no f= or 1=mf: %s" %
                    unicode(t))
            if getparam(t, "f") == expected_fem:
                rmparam(t, "f")
                notes.append("remove redundant f=")
            fem = getparam(t, "f") or expected_fem
            if not fem.endswith("e"):
                if not getparam(t, "fp"):
                    pagemsg("WARNING: Found f=%s not ending with -e and no fp=: %s" %
                        (fem, unicode(t)))
                # NOTE(review): this continue is taken to apply whenever
                # the feminine does not end in -e — confirm against the
                # original layout.
                continue
            expected_fp = fem + "s"
            if getparam(t, "fp") == expected_fp:
                rmparam(t, "fp")
                notes.append("remove redundant fp=")
            if getparam(t, "fp") and not getparam(t, "f"):
                pagemsg("WARNING: Found fp=%s and no f=: %s" %
                    (getparam(t, "fp"), unicode(t)))
                continue
            if getparam(t, "fp") == fem:
                pagemsg("WARNING: Found fp=%s same as fem=%s: %s" %
                    (getparam(t, "fp"), fem, unicode(t)))
                continue
            if pagetitle.endswith("e") and not getparam(t, "f") and not getparam(t, "fp"):
                if g == "mf":
                    rmparam(t, "1")
                    notes.append("remove redundant 1=mf")
                # Treated as mf from here on, whether or not 1=mf was given.
                g = "mf"
            if g == "mf":
                f = getparam(t, "f")
                if f:
                    pagemsg("WARNING: Found f=%s and 1=mf: %s" % (f, unicode(t)))
                mp = getparam(t, "mp")
                if mp:
                    pagemsg("WARNING: Found mp=%s and 1=mf: %s" % (mp, unicode(t)))
                fp = getparam(t, "fp")
                if fp:
                    pagemsg("WARNING: Found fp=%s and 1=mf: %s" % (fp, unicode(t)))
                if f or mp or fp:
                    continue
                # Same rule as for expected_mp above.
                expected_p = (pagetitle if re.search("[sx]$", pagetitle)
                    else re.sub("al$", "aux", pagetitle) if pagetitle.endswith("al")
                    else pagetitle + "s")
                if getparam(t, "p") == expected_p:
                    rmparam(t, "p")
                    notes.append("remove redundant p=")
                elif getparam(t, "p"):
                    pagemsg("WARNING: Found unexpected p=%s: %s" % (getparam(t, "p"),
                        unicode(t)))

            if not re.search("[ -]", pagetitle) and (getparam(t, "f") or
                    getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
                pagemsg("Found remaining explicit feminine or plural in single-word base form: %s"
                    % unicode(t))
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replacing %s with %s" % (origt, newt))

    newtext = unicode(parsed)
    if newtext != text:
        assert notes
        comment = "; ".join(notes)
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = newtext
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Replace manually specified past passive participles in {{ru-conj}}
    with an equivalent auto-generating variant code.

    For each {{ru-conj}} carrying manual PPP parameters, candidate variant
    codes ("+p" and, depending on the first manual form, "+pё" / "+pжд")
    are appended to 2= on a scratch copy and expanded through
    {{ru-generate-verb-forms}}.  The first candidate whose generated PPPs
    equal the manual ones is applied to the real template and the manual
    parameters are removed.

    Args:
        index: Running page index, used only in log messages.
        page: Page object to read and, if ``save`` is true, write.
        save: If true, save the page; otherwise only log what would happen.
        verbose: If true, log the full before/after page text; also passed
            to ``blib.expand_text``.

    Returns:
        None in all cases.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, title, pagemsg, verbose)

    pagemsg("Processing")

    manual_ppp_forms = ["past_pasv_part", "past_pasv_part2", "past_pasv_part3",
                        "past_pasv_part4", "ppp", "ppp2", "ppp3", "ppp4"]

    original_text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for template in parsed.filter_templates():
        before = unicode(template)
        if unicode(template.name) != "ru-conj":
            continue

        manual_values = (getparam(template, form) for form in manual_ppp_forms)
        manual_ppps = [value for value in manual_values
                       if value and value != "-"]
        if not manual_ppps:
            continue
        if any(unicode(param.value) == "or" for param in template.params):
            pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(template))
            continue
        curvariant = getparam(template, "2")
        if any(marker in curvariant for marker in ("+p", "(7)", "(8)")):
            pagemsg("WARNING: Found both manual PPP and PPP variant, something wrong: %s" %
                    unicode(template))
            continue

        # Scratch copy without the manual PPP params, used for expansion.
        scratch = blib.parse_text(unicode(template)).filter_templates()[0]
        for form in manual_ppp_forms:
            rmparam(scratch, form)

        first_ppp = manual_ppps[0]
        candidates = ["+p"]
        if u"ё" in re.sub(u"ённый$", "", first_ppp):
            candidates.append(u"+pё")
        if u"жденный" in first_ppp or u"ждённый" in first_ppp:
            candidates.append(u"+pжд")

        mismatches = []
        switched = False
        for variant in candidates:
            scratch.add("2", curvariant + variant)
            tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                              unicode(scratch))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = rulib.split_generate_args(result)
            if "past_pasv_part" not in args:
                pagemsg("WARNING: Something wrong, no past passive participle generated: %s" % unicode(template))
                continue

            auto_ppps = []
            for form in manual_ppp_forms:
                if form not in args:
                    continue
                auto_ppps.extend(
                    generated for generated in re.split(",", args[form])
                    if generated and generated != "-")

            if manual_ppps != auto_ppps:
                mismatches.append("WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s" %
                    (",".join(manual_ppps), ",".join(auto_ppps), unicode(template)))
                continue

            pagemsg("Manual PPP's %s same as auto-generated PPP's, switching to auto" %
                    ",".join(manual_ppps))
            for form in manual_ppp_forms:
                rmparam(template, form)
            template.add("2", curvariant + variant)
            notes.append("replaced manual PPP's with variant %s" % variant)
            switched = True
            break

        # Mismatch warnings are only reported when no candidate worked.
        if not switched:
            for mismatch in mismatches:
                pagemsg(mismatch)

        after = unicode(template)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == original_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (original_text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
    else:
        pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, fix_missing_plurals):
    """Convert French {{head|fr|...}} templates to dedicated templates.

    Handles the head types "noun", "proper noun(s)", "adjective(s)" (only
    when marked invariable/invariant) and a fixed set of form/function-word
    types. A template carrying any parameter outside the recognised set for
    its head type is left untouched and a warning is logged.

    index -- page index, used only as a prefix in log messages.
    page -- page object; ``title()`` and ``text`` are read.
    fix_missing_plurals -- when true, a noun with no plural given is still
        converted (and flagged "NEEDS REVIEW" in the change note); when
        false, such a noun is skipped.

    Returns a ``(new_text, notes)`` tuple, or None when the page title
    contains a colon.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    text = unicode(page.text)
    notes = []
    parsed = blib.parse_text(text)

    # New template name for each head type converted without gender or
    # plural handling.
    simple_template_names = {
        "adjective form": "fr-adj-form",
        "verb form": "fr-verb-form",
        "verb forms": "fr-verb-form",
        "interjection": "fr-intj",
        "preposition": "fr-prep",
        "prefix": "fr-prefix",
        "prefixes": "fr-prefix",
        "suffix": "fr-suffix",
        "suffixes": "fr-suffix",
    }

    def warn_unrecognized(t):
        pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))

    def has_unknown_params(t, allowed):
        return any(unicode(param.name) not in allowed for param in t.params)

    def remove_params(t, names):
        # The order of removal is the caller's; positional parameters are
        # removed from the highest number down.
        for pname in names:
            rmparam(t, pname)

    def convert_noun(t):
        """Convert a noun headword; return True if a plural was assumed."""
        head = getparam(t, "head")
        gender = getparam(t, "g")
        gender2 = getparam(t, "g2")
        plural = getparam(t, "4") if getparam(t, "3") == "plural" else ""

        allowed = ["1", "2", "head", "g", "g2", "sort"]
        if plural:
            allowed += ["3", "4"]
        if has_unknown_params(t, allowed):
            warn_unrecognized(t)
            return False
        if not gender:
            pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
            return False

        assumed_plural = False
        has_feminine_noun_of = (
            gender == "f" and not gender2 and not plural and
            any(unicode(other.name) == "feminine noun of" and
                getparam(other, "lang") == "fr"
                for other in parsed.filter_templates()))
        if has_feminine_noun_of:
            pagemsg("Found 'feminine noun of', assuming countable")
        elif gender not in ["m-p", "f-p"] and not plural:
            if not fix_missing_plurals:
                pagemsg("WARNING: No plural given in %s, skipping" % unicode(t))
                return False
            pagemsg(
                "WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW"
                % unicode(t))
            assumed_plural = True

        remove_params(t, ("4", "3", "2", "1", "head", "g", "g2", "sort"))
        t.name = "fr-noun"
        if head:
            t.add("head", head)
        t.add("1", gender)
        if gender2:
            t.add("g2", gender2)
        if plural:
            t.add("2", plural)
        return assumed_plural

    def convert_proper_noun(t):
        head = getparam(t, "head")
        gender = getparam(t, "g")
        gender2 = getparam(t, "g2")

        # With no g=, a gender code in parameter 3 is used as the gender.
        gender_from_3 = False
        if not gender and getparam(t, "3") in ["m", "f", "m-p", "f-p"]:
            gender = getparam(t, "3")
            gender_from_3 = True

        allowed = ["1", "2", "head", "g", "g2", "sort"]
        if gender_from_3:
            allowed.append("3")
        if has_unknown_params(t, allowed):
            warn_unrecognized(t)
            return
        if not gender:
            pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
            return

        remove_params(t, ("3", "2", "1", "head", "g", "g2", "sort"))
        t.name = "fr-proper noun"
        if head:
            t.add("head", head)
        t.add("1", gender)
        if gender2:
            t.add("g2", gender2)

    def convert_adjective(t):
        if getparam(t, "3") not in ["invariable", "invariant"]:
            warn_unrecognized(t)
            return

        # Whatever remains after dropping the expected parameters is
        # unrecognised.
        leftover = dict((unicode(p.name), unicode(p.value)) for p in t.params)
        for pname in ("1", "2", "3"):
            del leftover[pname]
        if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
            del leftover["g"]
            del leftover["g2"]
        if leftover:
            warn_unrecognized(t)
            return

        remove_params(t, ("g2", "g", "3", "2", "1"))
        t.name = "fr-adj"
        t.add("inv", "y")

    def convert_simple(t, headtype):
        head = getparam(t, "head")
        allowed = ["1", "2", "head", "sort"]
        if headtype in ["adjective form", "suffix", "suffixes"]:
            # g= is accepted for these head types; it is not removed below.
            allowed.append("g")
        if has_unknown_params(t, allowed):
            warn_unrecognized(t)
            return

        remove_params(t, ("sort", "head", "2", "1"))
        t.name = simple_template_names[headtype]
        if head:
            t.add("head", head)

    for t in parsed.filter_templates():
        before = unicode(t)
        if unicode(t.name) != "head" or getparam(t, "1") != "fr":
            continue

        headtype = getparam(t, "2")
        needs_review = False
        if headtype == "noun":
            needs_review = convert_noun(t)
        elif headtype in ["proper noun", "proper nouns"]:
            convert_proper_noun(t)
        elif headtype in ["adjective", "adjectives"]:
            convert_adjective(t)
        elif headtype in simple_template_names:
            convert_simple(t, headtype)

        after = unicode(t)
        if before != after:
            pagemsg("Replacing %s with %s" % (before, after))
            notes.append(
                "replaced {{head|fr|%s}} with {{%s}}%s" %
                (headtype, unicode(t.name),
                 " (NEEDS REVIEW)" if needs_review else ""))

    return unicode(parsed), notes