def generate_adj_forms(template, errandpagemsg, expand_text, return_raw=False, include_linked=False):
  """Expand a Latin {{la-adecl}} template into its individual adjective forms.

  On an unrecognized template, logs through `errandpagemsg` and returns None.
  If `return_raw`, the raw expansion string is returned (None on failure).
  Otherwise the expansion is parsed into a slot->form dict; unless
  `include_linked`, "linked_*" slots are dropped, and every masculine slot
  lacking a feminine counterpart has its form copied to the feminine slot.
  """
  if not template.startswith("{{la-adecl|"):
    errandpagemsg("Template %s not a recognized adjective declension template" % template)
    return None
  generate_template = re.sub(r"^\{\{la-adecl\|", "{{la-generate-adj-forms|", template)
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  forms = blib.split_generate_args(result)
  if not include_linked:
    forms = {slot: val for slot, val in forms.iteritems() if not slot.startswith("linked_")}
  # Fill in missing feminine forms from the corresponding masculine slots.
  augmented = {}
  for slot, val in forms.iteritems():
    augmented[slot] = val
    if slot.endswith("_m"):
      fem_slot = slot[:-2] + "_f"
      if fem_slot not in forms:
        augmented[fem_slot] = val
  return augmented
def process_page(index, page, save, verbose):
  """Log perfective past passive participles found on a Russian verb page.

  For each {{ru-conj}}/{{ru-conj-old}} whose first argument starts with
  "pf" (perfective), expand the matching {{ru-generate-verb-forms}} call
  as a sanity check, then log every manual past-passive-participle
  parameter (past_pasv_part/ppp, plus numbered variants) on the template.
  Produces log output only; does not modify the page.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tn = unicode(t.name)
    if tn in ["ru-conj", "ru-conj-old"] and getparam(t, "1").startswith("pf"):
      # BUG FIX: the original tested an undefined name `tname` here
      # (`if tname == "ru-conj":`), which would raise NameError; compare
      # the template name computed above instead.
      if tn == "ru-conj":
        tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      else:
        tempcall = re.sub(r"\{\{ru-conj-old", "{{ru-generate-verb-forms|old=y", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      # Parsed but otherwise unused; kept as a sanity check that the
      # generated output splits cleanly.
      args = blib.split_generate_args(result)
      for base in ["past_pasv_part", "ppp"]:
        for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
          val = getparam(t, base + i)
          if val and val != "-":
            # Strip any translit after "//".
            val = re.sub("//.*", "", val)
            pagemsg("Found perfective past passive participle: %s" % val)
def generate_noun_forms(template, errandpagemsg, expand_text, return_raw=False, include_linked=False):
  """Expand a Latin {{la-ndecl}} template into its individual noun forms.

  On an unrecognized template, logs through `errandpagemsg` and returns None.
  If `return_raw`, the raw expansion string is returned (None on failure).
  Otherwise returns a slot->form dict; unless `include_linked`, "linked_*"
  slots are dropped.
  """
  if not template.startswith("{{la-ndecl|"):
    errandpagemsg("Template %s not a recognized noun declension template" % template)
    return None
  generate_template = re.sub(r"^\{\{la-ndecl\|", "{{la-generate-noun-forms|", template)
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  forms = blib.split_generate_args(result)
  if include_linked:
    return forms
  return {slot: val for slot, val in forms.iteritems() if not slot.startswith("linked_")}
def find_noun(pagename, pagemsg, errandpagemsg, expand_text):
  """Locate the single Russian noun lemma on PAGENAME.

  Returns the lemma string on success, -1 if the Russian section contains
  Etymology subsections (needs manual handling), or None if the section is
  missing, form generation fails, or a lemma has multiple comma-separated
  forms. Warns (but still returns the first) when several distinct lemmas
  are found.
  """
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  # Multiple-etymology pages are flagged for manual handling.
  if "==Etymology" in section:
    return -1
  lemmas = []
  for t in blib.parse_text(section).filter_templates():
    if unicode(t.name) != "ru-noun+":
      continue
    template = re.sub(r"^\{\{ru-noun\+", "{{ru-generate-noun-forms", unicode(t))
    expansion = expand_text(template)
    if not expansion:
      pagemsg("WARNING: Error generating noun forms")
      return None
    formargs = blib.split_generate_args(expansion)
    # Plural-only nouns have no nom_sg; fall back to nom_pl.
    lemma = formargs["nom_sg"] if "nom_sg" in formargs else formargs["nom_pl"]
    if "," in lemma:
      pagemsg("WARNING: Lemma has multiple forms: %s" % lemma)
      return None
    if lemma not in lemmas:
      lemmas.append(lemma)
  if len(lemmas) > 1:
    pagemsg("WARNING: Multiple lemmas for noun: %s" % ",".join(lemmas))
  return lemmas[0] if lemmas else None
def process_text_on_page(index, pagetitle, text):
  """Log the conjugation class of every {{de-conj}} template in `text`.

  Each {{de-conj}} is rewritten to the experimental
  {{User:Benwing2/de-generate-verb-props}} template, expanded, and the
  resulting "class" property logged. Returns (new_text, notes); the text
  is not currently modified (the replaced-template check is scaffolding
  that never fires because `t` is never edited).
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    # FIX: dropped the dead local `newarg1 = None`, which was assigned but
    # never read anywhere in this function.
    if tn == "de-conj":
      # (?=[|}]) keeps the rewrite from matching e.g. {{de-conj-foo}}.
      generate_template = re.sub(r"^\{\{de-conj(?=[|}])",
        "{{User:Benwing2/de-generate-verb-props", unicode(t))
      result = expand_text(generate_template)
      if not result:
        continue
      forms = blib.split_generate_args(result)
      pagemsg("For %s, class=%s" % (unicode(t), forms["class"]))
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def generate_old_adj_forms(template, errandpagemsg, expand_text, return_raw=False, include_linked=False):
  """Expand a new- or old-style Latin adjective declension template into forms.

  Accepts {{la-adecl|...}} directly; old-style {{la-decl-*}}/{{la-adecl-*}}
  templates are mapped onto {{la-generate-adj-forms}} with an explicit
  decltype=. On an unrecognized template, logs through `errandpagemsg` and
  returns None. If `return_raw`, the raw expansion is returned (None on
  failure). Otherwise returns a slot->form dict; unless `include_linked`,
  "linked_*" slots are dropped, and masculine slots without a feminine
  counterpart have their forms copied to the feminine slot.
  """
  old_suffix_to_decltype = {
    'decl-1&2': '1&2',
    'decl-3rd-1E': '3-1',
    'decl-3rd-2E': '3-2',
    'decl-3rd-3E': '3-3',
    'decl-3rd-comp': '3-C',
    'decl-3rd-part': '3-P',
    'adecl-1st': '1-1',
    'adecl-2nd': '2-2',
    'decl-irreg': 'irreg',
  }
  def old_prefix_to_generate(m):
    # Rewrite a recognized old-template prefix; leave anything else alone
    # so the startswith() check below catches it.
    decltype = old_suffix_to_decltype.get(m.group(1))
    if decltype is None:
      return m.group(0)
    return "{{la-generate-adj-forms|decltype=%s|" % decltype
  if template.startswith("{{la-adecl|"):
    generate_template = re.sub(r"^\{\{la-adecl\|", "{{la-generate-adj-forms|", template)
  else:
    generate_template = re.sub(r"^\{\{la-(.*?)\|", old_prefix_to_generate, template)
  if not generate_template.startswith("{{la-generate-adj-forms|"):
    errandpagemsg("Template %s not a recognized adjective declension template" % template)
    return None
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  forms = blib.split_generate_args(result)
  if not include_linked:
    forms = {slot: val for slot, val in forms.iteritems() if not slot.startswith("linked_")}
  # Fill in missing feminine forms from the corresponding masculine slots.
  augmented = {}
  for slot, val in forms.iteritems():
    augmented[slot] = val
    if slot.endswith("_m"):
      fem_slot = slot[:-2] + "_f"
      if fem_slot not in forms:
        augmented[fem_slot] = val
  return augmented
def process_decl(index, pagetitle, decl, forms, save, verbose):
  """Delete erroneously created Russian form pages derived from `decl`.

  `decl` is a {{ru-conj}} or {{ru-noun-table}} invocation; it is expanded
  through the matching generate template and, for each slot name in
  `forms`, the listed form pages are deleted (or the deletion is logged
  when `save` is false). Pages are skipped when they don't exist, are the
  dictionary (lemma) page itself, contain "Etymology 1", or contain an
  entry for a non-Russian language.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  # Rewrite the declension template into its form-generating counterpart.
  if decl.startswith("{{ru-conj|"):
    tempcall = re.sub(r"^\{\{ru-conj", "{{ru-generate-verb-forms", decl)
  elif decl.startswith("{{ru-noun-table"):
    tempcall = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args", decl)
  else:
    pagemsg("WARNING: Unrecognized decl template, skipping: %s" % decl)
    return
  result = expand_text(tempcall)
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  args = blib.split_generate_args(result)
  for form in forms:
    if form in args:
      # A slot may hold several comma-separated forms.
      for formpagename in re.split(",", args[form]):
        # Strip translit ("//...") and accents to get the page name.
        formpagename = re.sub("//.*$", "", formpagename)
        formpagename = rulib.remove_accents(formpagename)
        formpage = pywikibot.Page(site, formpagename)
        if not formpage.exists():
          pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
        elif formpagename == pagetitle:
          # Never delete the lemma page itself.
          pagemsg("WARNING: Attempt to delete dictionary form, skipping")
        else:
          text = unicode(formpage.text)
          if "Etymology 1" in text:
            pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
          else:
            # Only delete pages that are exclusively Russian entries.
            skip_form = False
            for m in re.finditer(r"^==([^=]*?)==$", text, re.M):
              if m.group(1) != "Russian":
                pagemsg("WARNING: Found entry for non-Russian language %s, skipping form %s" % (
                  m.group(1), formpagename))
                skip_form = True
            if not skip_form:
              comment = "Delete erroneously created form of %s" % pagetitle
              if save:
                formpage.delete(comment)
              else:
                pagemsg("Would delete page %s with comment=%s" % (formpagename, comment))
def generate_new_forms():
  """Expand the new-style Spanish verb conjugation into a slot->forms dict.

  Relies on enclosing-scope names: `newt` (the {{es-conj}} invocation),
  `expand_text`, and `sort_multiple`. Drops negative-imperative slots and
  the linked infinitive, sorts multi-form values for stable comparison,
  and returns None when expansion fails.
  """
  template = re.sub(r"^\{\{es-conj", "{{User:Benwing2/es-generate-verb-forms", newt)
  expansion = expand_text(template)
  if not expansion:
    return None
  forms = blib.split_generate_args(expansion)
  forms = {
    slot: value for slot, value in forms.iteritems()
    if not slot.startswith("neg_") and slot != "infinitive_linked"
  }
  return {slot: sort_multiple(value) for slot, value in forms.iteritems()}
def fetch_noun_args(t, expand_text, forms_only=False):
  """Expand a Russian noun headword template into its generated arguments.

  `t` is a {{ru-noun+}} or {{ru-proper noun+}} template object. With
  `forms_only`, uses {{ru-generate-noun-forms}} instead of
  {{ru-generate-noun-args}}; proper nouns additionally get |ndef=sg.
  Returns the parsed slot->value dict, or None on expansion failure.
  """
  target = "ru-generate-noun-forms" if forms_only else "ru-generate-noun-args"
  if unicode(t.name) == "ru-noun+":
    tempcall = re.sub(r"^\{\{ru-noun\+", "{{%s" % target, unicode(t))
  else:
    # Proper nouns default to singular-only definitions.
    tempcall = re.sub(r"^\{\{ru-proper noun\+", "{{%s|ndef=sg" % target, unicode(t))
  expansion = expand_text(tempcall)
  if not expansion:
    return None
  return blib.split_generate_args(expansion)
def new_generate_noun_forms(template, errandpagemsg, expand_text, return_raw=False, include_props=False):
  """Expand {{la-ndecl}} through the experimental new generator.

  With `include_props`, uses the props variant instead of the forms
  variant. If `return_raw`, returns the raw expansion (None on failure);
  otherwise returns the parsed slot->form dict, or None after logging an
  error through `errandpagemsg`.
  """
  assert template.startswith("{{la-ndecl|")
  if include_props:
    replacement = "{{User:Benwing2/la-new-generate-noun-props|"
  else:
    replacement = "{{User:Benwing2/la-new-generate-noun-forms|"
  generate_template = re.sub(r"^\{\{la-ndecl\|", replacement, template)
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  return blib.split_generate_args(result)
def snarf_noun_accents_and_forms(noun, orig_pagemsg):
  """Fetch the accented lemma and declension forms for a Bulgarian noun.

  Looks up the page for `noun` (accents removed), pairs each {{bg-noun}}/
  {{bg-proper noun}} headword with the following {{bg-ndecl}}, and expands
  the declension into forms. Results — (lemma, forms_dict) on success,
  (None, None) on any failure — are memoized in the module-level cache
  `nouns_to_accents_and_forms`, keyed by the accentless page title.
  """
  global args
  pagetitle = bglib.remove_accents(noun)
  # Serve from the cache when this page was already processed.
  if pagetitle in nouns_to_accents_and_forms:
    return nouns_to_accents_and_forms[pagetitle]
  def pagemsg(txt):
    orig_pagemsg("Noun %s: %s" % (noun, txt))
  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  # `lemma` acts as a tiny state machine: None = no headword seen yet,
  # False = headword seen but unusable (missing accent), string = usable.
  lemma = None
  for t in parsed.filter_templates():
    if tname(t) in ["bg-noun", "bg-proper noun"]:
      if lemma:
        pagemsg("WARNING: Saw two {{bg-noun}} invocations without intervening {{bg-ndecl}}: %s" % unicode(t))
      lemma = getparam(t, "1")
      if not lemma:
        pagemsg("WARNING: Missing headword in noun: %s" % unicode(t))
        continue
      if bglib.needs_accents(lemma):
        pagemsg("WARNING: Noun %s missing an accent: %s" % (lemma, unicode(t)))
        lemma = False
        continue
    if tname(t) == "bg-ndecl":
      if lemma is False:
        pagemsg("WARNING: Skipping %s because noun missing an accent" % unicode(t))
        continue
      if lemma is None:
        pagemsg("WARNING: Skipping %s because no preceding {{bg-noun}}" % unicode(t))
        continue
      if pagetitle in nouns_to_accents_and_forms:
        # A result was already cached by an earlier {{bg-ndecl}} on this
        # same page: ambiguous, so give up on the page entirely.
        pagemsg("WARNING: Saw two {{bg-ndecl}} on the same page: %s" % unicode(t))
        nouns_to_accents_and_forms[pagetitle] = (None, None)
        return (None, None)
      generate_template = re.sub(r"^\{\{bg-ndecl\|", "{{bg-generate-noun-forms|", unicode(t))
      def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
      generate_result = expand_text(generate_template)
      if not generate_result:
        nouns_to_accents_and_forms[pagetitle] = (None, None)
        return (None, None)
      nouns_to_accents_and_forms[pagetitle] = (lemma, blib.split_generate_args(generate_result))
  # After the scan, a cache entry means lemma + declension were both found.
  if pagetitle in nouns_to_accents_and_forms:
    return nouns_to_accents_and_forms[pagetitle]
  pagemsg("WARNING: Couldn't find both lemma and declension")
  nouns_to_accents_and_forms[pagetitle] = (None, None)
  return (None, None)
def generate_verb_forms(template, errandpagemsg, expand_text, return_raw=False,
    include_linked=False, include_props=False, add_sync_forms=False):
  """Expand a Latin {{la-conj}} template into its individual verb forms.

  With `include_props`, uses {{la-generate-verb-props}} instead of
  {{la-generate-verb-forms}}. On an unrecognized template, logs through
  `errandpagemsg` and returns None. If `return_raw`, returns the raw
  expansion (None on failure). Otherwise returns a slot->forms dict;
  unless `include_linked`, "linked_*" slots are dropped. With
  `add_sync_forms`, syncopated (contracted) perfect-system variants are
  appended alongside the full forms.
  """
  if not template.startswith("{{la-conj|"):
    errandpagemsg("Template %s not a recognized conjugation template" % template)
    return None
  if include_props:
    target = "{{la-generate-verb-props|"
  else:
    target = "{{la-generate-verb-forms|"
  generate_template = re.sub(r"^\{\{la-conj\|", target, template)
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  forms = blib.split_generate_args(result)
  if not include_linked:
    forms = {slot: value for slot, value in forms.iteritems() if not slot.startswith("linked_")}
  def add_syncopated(value):
    # For perfect-system endings that allow syncopation, append the
    # contracted variant (e.g. amāvistī -> amāstī) after the full form.
    expanded = []
    for one_form in value.split(","):
      expanded.append(one_form)
      if re.search(
          u"(vi(stī|stis)|vērunt|ver(am|ās|at|āmus|ātis|ant|ō|im|[iī]s|it|[iī]mus|[iī]tis|int)|viss(e|em|ēs|et|ēmus|ētis|ent))$",
          one_form):
        expanded.append(re.sub(u"^(.*)v[ieē]", r"\1", one_form))
    return ",".join(expanded)
  if add_sync_forms:
    forms = {slot: add_syncopated(value) for slot, value in forms.iteritems()}
  return forms
def generate_old_noun_forms(template, errandpagemsg, expand_text, return_raw=False, include_linked=False):
  """Expand a new- or old-style Latin noun declension template into forms.

  Accepts {{la-ndecl|...}} directly; old-style {{la-decl-*}} templates are
  mapped onto {{la-generate-noun-forms}} via the module-level table
  `la_noun_decl_suffix_to_decltype`. On an unrecognized template, logs
  through `errandpagemsg` and returns None. If `return_raw`, returns the
  raw expansion (None on failure). Otherwise returns a slot->form dict;
  unless `include_linked`, "linked_*" slots are dropped.
  """
  def old_decl_to_generate(m):
    # Rewrite a recognized old {{la-decl-*}} prefix into the generator
    # prefix; leave anything else alone so the check below catches it.
    if m.group(1) not in la_noun_decl_suffix_to_decltype:
      return m.group(0)
    declspec, stem_suffix, pl_suffix, to_auto = la_noun_decl_suffix_to_decltype[m.group(1)]
    # declspec is (decl[, decl_type[, num]]) or a bare decl.
    if type(declspec) is not tuple:
      declspec = (declspec,)
    decl = declspec[0]
    if len(declspec) == 1:
      decltype = ""
      num = ""
    else:
      decltype = "|decl_type=%s" % declspec[1]
      num = "" if len(declspec) == 2 else "|num=%s" % declspec[2]
    return "{{la-generate-noun-forms|decl=%s%s%s|" % (decl, decltype, num)
  if template.startswith("{{la-ndecl|"):
    generate_template = re.sub(r"^\{\{la-ndecl\|", "{{la-generate-noun-forms|", template)
  else:
    generate_template = re.sub(r"^\{\{la-decl-(.*?)\|", old_decl_to_generate, template)
  if not generate_template.startswith("{{la-generate-noun-forms|"):
    errandpagemsg("Template %s not a recognized noun declension template" % template)
    return None
  result = expand_text(generate_template)
  if return_raw:
    # Raw mode: normalize an expansion failure (False) to None.
    return None if result is False else result
  if not result:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None
  forms = blib.split_generate_args(result)
  if include_linked:
    return forms
  return {slot: val for slot, val in forms.iteritems() if not slot.startswith("linked_")}
# Driver loop: for each declension spec yielded by yield_decls(), work out
# the target page from the lemma, generate the predicted forms, and hand off
# to replace_decl() via blib.do_edit(). Relies on module globals: args,
# start, end, site, uk, be, yield_decls, replace_decl.
for index, decl in blib.iter_items(yield_decls(), start, end):
  # Pick the Ukrainian or Belarusian language module.
  module = uk if args.lang == "uk" else be
  # "((a,b,...))" wraps alternative declensions; use the first to derive
  # the page name.
  if decl.startswith("(("):
    m = re.search(r"^\(\((.*)\)\)$", decl)
    subdecls = m.group(1).split(",")
    decl_for_page = subdecls[0]
  else:
    decl_for_page = decl
  # The lemma is the part before the <...> declension spec.
  m = re.search(r"^(.+?)<.*>$", decl_for_page)
  if not m:
    msg("WARNING: Can't extract lemma from decl: %s" % decl)
    pagename = "UNKNOWN"
  else:
    pagename = module.remove_accents(blib.remove_links(m.group(1)))
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, args.verbose)
  tempcall = "{{%s-generate-noun-forms|%s}}" % (args.lang, decl)
  result = expand_text(tempcall)
  if not result:
    continue
  predforms = blib.split_generate_args(result)
  # Plural-only nouns have no nom_s; fall back to nom_p.
  lemma = predforms["nom_s"] if "nom_s" in predforms else predforms["nom_p"]
  # Use the first comma-separated form, accentless, as the real page name.
  real_pagename = re.sub(",.*", "", module.remove_accents(blib.remove_links(lemma)))
  page = pywikibot.Page(site, real_pagename)
  def do_replace_decl(page, index, parsed):
    return replace_decl(page, index, parsed, decl, predforms)
  blib.do_edit(page, index, do_replace_decl, save=args.save, verbose=args.verbose,
    diff=args.diff)
def compare_new_and_old_templates(origt, newt, pagetitle, pagemsg, errandpagemsg):
  """Verify the new {{de-conj}} implementation generates the same forms
  as the old template `origt`.

  Expands the old template with |generate_forms=1 and the new one through
  {{User:Benwing2/de-generate-verb-forms}}, then normalizes both sides for
  known, deliberate differences before comparing slot by slot. Returns
  True when all slots match, False on any mismatch (after logging), or
  None when either expansion fails.
  """
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  old_generate_template = re.sub(r"\}\}$", "|generate_forms=1}}", unicode(origt))
  old_result = expand_text(old_generate_template)
  if not old_result:
    return None
  new_generate_template = re.sub(r"^\{\{de-conj\|",
    "{{User:Benwing2/de-generate-verb-forms|", unicode(newt))
  new_result = expand_text(new_generate_template)
  if not new_result:
    return None
  def remove_forms_in(forms, regex):
    # Drop comma-separated variants matching `regex` from a slot value.
    forms = forms.split(",")
    forms = [form for form in forms if not re.search(regex, form)]
    return ",".join(forms)
  # The bare verb (arg 1 minus any <...> spec), falling back to the page title.
  newarg1 = re.sub("<.*>", "", getparam(newt, "1")) or pagetitle
  # NOTE(review): unreachable — `if not old_result` above already returned.
  if old_result is None:
    errandpagemsg("WARNING: Error generating old forms, can't compare")
    return False
  old_forms = blib.split_generate_args(old_result)
  if not re.search("[._]", newarg1):
    # Simple (non-separable, non-subordinate) verbs: the old code's
    # zu-infinitive and subc_* slots aren't comparable; drop them.
    old_forms = {k: v for k, v in old_forms.items()
      if k != "zu_infinitive" and not k.startswith("subc")}
  # Normalize whitespace in old forms.
  # NOTE(review): the two replace() calls below appear to collapse
  # whitespace, but their arguments may have been garbled in transit
  # (possibly NBSP / double-space originally) — verify against the
  # original file.
  old_forms = {k: v.replace(" ", " ").replace(" ", " ").strip().replace(" ,", ",")
    for k, v in old_forms.items()}
  if "_" in newarg1 and "zu_infinitive" in old_forms:
    # Fix bug in old form zu-infinitive
    old_forms["zu_infinitive"] = old_forms["zu_infinitive"].replace(" zu", " zu ")
  if "imp_2s" in old_forms and re.search("[dt]en$", newarg1):
    # Old code leaves out imperative without -e
    forms = old_forms["imp_2s"].split(",")
    if not [x for x in forms if not re.search("e($| )", x)]:
      nforms = []
      for form in forms:
        if re.search("e($| )", form):
          nforms.append(re.sub("e($| )", r"\1", form))
          nforms.append(form)
      old_forms["imp_2s"] = ",".join(nforms)
  # NOTE(review): unreachable — `if not new_result` above already returned.
  if new_result is None:
    errandpagemsg("WARNING: Error generating new forms, can't compare")
    return False
  new_forms = blib.split_generate_args(new_result)
  if "subii_2s" in new_forms:
    # New code generates subii 2s in both -est and -st; old only in -est
    new_forms["subii_2s"] = remove_forms_in(new_forms["subii_2s"],
      u"^[^ ]*([^e]|ie)[sxßz]t($| )")
  if "subii_2p" in new_forms:
    # New code generates subii 2p in both -et and -t; old only in -et
    new_forms["subii_2p"] = remove_forms_in(new_forms["subii_2p"], "^[^ ]*[^e]t($| )")
  if "subc_subii_2s" in new_forms:
    # New code generates subii 2s in both -est and -st; old only in -est
    new_forms["subc_subii_2s"] = remove_forms_in(new_forms["subc_subii_2s"],
      u"([^e]|ie)[sxßz]t$")
  if "subc_subii_2p" in new_forms:
    # New code generates subii 2p in both -et and -t; old only in -et
    new_forms["subc_subii_2p"] = remove_forms_in(new_forms["subc_subii_2p"], "[^e]t$")
  #if "perf_sub_2s" in new_forms and "seiest" in new_forms["perf_sub_2s"] and not re.search("e[rl]n$", newarg1):
  #  # New code generates perf sub 2s in both seist and seiest; old only in seist
  #  new_forms["perf_sub_2s"] = remove_forms_in(new_forms["perf_sub_2s"], "seiest")
  if re.search(u"[sxzß]en$", newarg1):
    if "pret_2s" in new_forms:
      # New code generates pret 2s for -sen verbs in both -sest and -st; old only in -st
      new_forms["pret_2s"] = remove_forms_in(new_forms["pret_2s"],
        u"^[^ ]*[sxzß]est($| )")
    if "subc_pret_2s" in new_forms:
      # New code generates pret 2s for -sen verbs in both -sest and -st; old only in -st
      new_forms["subc_pret_2s"] = remove_forms_in(new_forms["subc_pret_2s"],
        u"[sxzß]est$")
  if re.search(u"[td]en$", newarg1):
    if "pret_2s" in new_forms:
      # New code generates pret 2s for -ten verbs in both -test and -tst; old only in -test
      new_forms["pret_2s"] = remove_forms_in(new_forms["pret_2s"],
        u"^[^ ]*[td]st($| )")
    if "subc_pret_2s" in new_forms:
      # New code generates pret 2s for -sen verbs in both -test and -tst; old only in -test
      new_forms["subc_pret_2s"] = remove_forms_in(new_forms["subc_pret_2s"],
        u"[td]st$")
  # Compare slot by slot, treating each value as an unordered set of
  # comma-separated variants. (Python 2: keys() returns lists, hence `+`.)
  for form in set(old_forms.keys() + new_forms.keys()):
    if form not in new_forms:
      pagemsg("WARNING: for original %s and new %s, form %s=%s in old forms but missing in new forms" % (
        unicode(origt), unicode(newt), form, old_forms[form]))
      return False
    if form not in old_forms:
      pagemsg("WARNING: for original %s and new %s, form %s=%s in new forms but missing in old forms" % (
        unicode(origt), unicode(newt), form, new_forms[form]))
      return False
    if set(new_forms[form].split(",")) != set(old_forms[form].split(",")):
      pagemsg("WARNING: for original %s and new %s, form %s=%s in old forms but =%s in new forms" % (
        unicode(origt), unicode(newt), form, old_forms[form], new_forms[form]))
      return False
  pagemsg("%s and %s have same forms" % (unicode(origt), unicode(newt)))
  return True
def process_page(index, page, direc, delete_bad, verbose):
  """Retired script: convert {{ru-conj}} type "3olda" to `direc`, and with
  `delete_bad` delete form pages whose past-tense forms the conversion
  invalidated.

  NOTE(review): the function deliberately returns right after the first
  warning below, so everything past that point is dead code kept for
  reference. The dead code also references an undefined name `save`
  (not a parameter of this function) — it would need fixing before
  reactivation.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return
  # ---- everything below is unreachable ----
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  direc = direc.replace("3oa", u"3°a")
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj"]:
      conjtype = getparam(t, "1")
      if not conjtype.startswith("3olda"):
        continue
      # Only the bare "3olda" (no variant suffix) can be converted.
      if conjtype.startswith("3olda") and conjtype != "3olda":
        pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
        continue
      # Generate the forms before the conversion...
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      oldargs = blib.split_generate_args(result)
      # ...convert the template in place...
      rmparam(t, "6")
      rmparam(t, "5")
      rmparam(t, "4")
      t.add("1", direc)
      # ...and generate the forms after the conversion.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      if delete_bad:
        newargs = blib.split_generate_args(result)
        # Delete form pages for past-tense forms that existed under the
        # old conjugation but not under the new one.
        for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
            "past_f_short", "past_n_short", "past_pl_short"]:
          oldforms = re.split(",", oldargs[form]) if form in oldargs else []
          newforms = re.split(",", newargs[form]) if form in newargs else []
          for oldform in oldforms:
            if oldform not in newforms:
              formpagename = rulib.remove_accents(oldform)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                # Never delete the lemma page itself.
                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
              else:
                text = unicode(formpage.text)
                if "Etymology 1" in text:
                  pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                elif "----" in text:
                  pagemsg("WARNING: Multiple languages apparently in form, skippin form %s" % formpagename)
                else:
                  # Only delete single-purpose inflection pages.
                  numinfls = len(re.findall(r"\{\{inflection of\|", text))
                  if numinfls < 1:
                    pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                  elif numinfls > 1:
                    pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                  else:
                    comment = "Delete erroneously created long form of %s" % pagetitle
                    pagemsg("Existing text for form %s: [[%s]]" % (
                      formpagename, text))
                    # NOTE(review): `save` is undefined in this scope.
                    if save:
                      formpage.delete(comment)
                    else:
                      pagemsg("Would delete page %s with comment=%s" % (formpagename, comment))
      notes.append("fix 3olda -> %s" % direc)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
  return unicode(parsed), notes
def process_page(page, index, parsed):
  """Replace manual past-passive-participle params in {{ru-conj}} with the
  equivalent "+p" variant code when the auto-generated PPP's match.

  Tries variant codes "+p" (plus "+pё"/"+pжд" when the manual PPP's
  spelling warrants them) on a copy of the template; on an exact match of
  the manual and auto-generated PPP lists, the manual params are removed
  and the variant appended to arg 2. Returns (new_text, notes).
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  manual_ppp_forms = [
    "past_pasv_part", "past_pasv_part2", "past_pasv_part3", "past_pasv_part4",
    "ppp", "ppp2", "ppp3", "ppp4"
  ]
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = unicode(t.name)
    if tn == "ru-conj":
      manual_ppps = []
      for form in manual_ppp_forms:
        ppp = getparam(t, form)
        if ppp and ppp != "-":
          manual_ppps.append(ppp)
      if not manual_ppps:
        continue
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      curvariant = getparam(t, "2")
      if "+p" in curvariant or "(7)" in curvariant or "(8)" in curvariant:
        pagemsg("WARNING: Found both manual PPP and PPP variant, something wrong: %s" % unicode(t))
        continue
      # Work on a copy with the manual PPP params stripped.
      t2 = blib.parse_text(unicode(t)).filter_templates()[0]
      for form in manual_ppp_forms:
        rmparam(t2, form)
      variants_to_try = ["+p"]
      # A ё anywhere except in the -ённый ending needs the "+pё" variant.
      if u"ё" in re.sub(u"ённый$", "", manual_ppps[0]):
        variants_to_try.append(u"+pё")
      if u"жденный" in manual_ppps[0] or u"ждённый" in manual_ppps[0]:
        variants_to_try.append(u"+pжд")
      notsamemsgs = []
      for variant in variants_to_try:
        t2.add("2", curvariant + variant)
        tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t2))
        result = expand_text(tempcall)
        if not result:
          pagemsg("WARNING: Error generating forms, skipping")
          continue
        # BUG FIX: the original assigned this to `args`, which (because of
        # the `global args` above) clobbered the module-level options
        # object and broke args.verbose in expand_text on all later calls.
        generated = blib.split_generate_args(result)
        if "past_pasv_part" not in generated:
          pagemsg("WARNING: Something wrong, no past passive participle generated: %s" % unicode(t))
          continue
        auto_ppps = []
        for form in manual_ppp_forms:
          if form in generated:
            for ppp in re.split(",", generated[form]):
              if ppp and ppp != "-":
                auto_ppps.append(ppp)
        if manual_ppps == auto_ppps:
          pagemsg("Manual PPP's %s same as auto-generated PPP's, switching to auto" % ",".join(manual_ppps))
          for form in manual_ppp_forms:
            rmparam(t, form)
          t.add("2", curvariant + variant)
          notes.append("replaced manual PPP's with variant %s" % variant)
          break
        else:
          notsamemsgs.append(
            "WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s" % (
              ",".join(manual_ppps), ",".join(auto_ppps), unicode(t)))
      else: # no break in for loop: no variant matched; emit all mismatches
        for m in notsamemsgs:
          pagemsg(m)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
  return unicode(parsed), notes
def process_page(page, index, do_fix):
  """Detect (and with `do_fix`, remove) suspicious past-passive-participle
  params in Russian conjugation templates.

  For each {{ru-conj}}/{{ru-conj-old}}, the template is expanded through
  {{ru-generate-verb-forms}} and every generated PPP is validated: correct
  ending, first letter matching the page title (wrong-aspect heuristic),
  ending agreeing with the infinitive's -ать/-ять class, and agreement
  with the rule-based form from form_ppp(). With `do_fix`, flagged forms
  are removed from the template's past_pasv_part/ppp params.

  NOTE(review): this file uses Python 2 syntax here (`ur""` literal,
  `xrange`); `verbose` in expand_text is not a parameter of this function
  and is presumably a module global — verify.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname in ["ru-conj", "ru-conj-old"]:
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      if tname == "ru-conj":
        tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      else:
        tempcall = re.sub(r"\{\{ru-conj-old", "{{ru-generate-verb-forms|old=y", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      for base in ["past_pasv_part", "ppp"]:
        forms_to_remove = []
        # NOTE(review): this indexes args[base] without checking membership
        # (unlike other scripts in this file) — a KeyError is possible if
        # the slot is absent; verify against split_generate_args output.
        if args[base] == "-":
          continue
        for form in re.split(",", args[base]):
          origform = form
          # Strip any translit after "//".
          form = re.sub("//.*", "", form)
          fix_form = False
          # Valid PPP's end in -анный/-янный/-енный/-ённый (with optional
          # stress mark) or -тый.
          if not re.search(ur"([аяеё]́?нный|тый)$", form):
            pagemsg("WARNING: Past passive participle doesn't end correctly: %s" % form)
            fix_form = True
          unstressed_page = rulib.make_unstressed_ru(pagetitle)
          unstressed_form = rulib.make_unstressed_ru(form)
          warned = False
          # Wrong-aspect heuristic: PPP should share the verb's first letter.
          if unstressed_form[0] != unstressed_page[0]:
            pagemsg("WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s" % form)
            warned = True
            fix_form = True
          if form.endswith(u"нный"):
            # -ать -> -анный, -ять -> -янный, otherwise -енный.
            if pagetitle.endswith(u"ать"):
              good_ending = u"анный"
            elif pagetitle.endswith(u"ять"):
              good_ending = u"янный"
            else:
              good_ending = u"енный"
            if not unstressed_form.endswith(good_ending):
              pagemsg("WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s" % form)
              warned = True
              fix_form = True
          if not warned:
            # Cross-check against the rule-derived PPP.
            correct_form = form_ppp(conjtype, pagetitle, args)
            if correct_form and unstressed_form != correct_form:
              pagemsg("WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s" % (
                unstressed_form, correct_form))
              fix_form = True
          if fix_form:
            forms_to_remove.append(origform)
        if forms_to_remove and do_fix:
          # Collect current values of base, base2, ... base9.
          curvals = []
          for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
            val = getparam(t, base + i)
            if val:
              curvals.append(val)
          newvals = [x for x in curvals if x not in forms_to_remove]
          if len(curvals) - len(newvals) != len(forms_to_remove):
            pagemsg("WARNING: Something wrong, couldn't remove all PPP forms %s" % ",".join(forms_to_remove))
          # Re-pack the surviving values into base, base2, ... and clear
          # the remaining numbered params.
          curindex = 1
          origt = unicode(t)
          for newval in newvals:
            t.add(base + ("" if curindex == 1 else str(curindex)), newval)
            curindex += 1
          for i in xrange(curindex, 10):
            rmparam(t, base + ("" if i == 1 else str(i)))
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
          notes.append("removed bad past pasv part(s) %s" % ",".join(forms_to_remove))
def process_page(page, index):
  """Generate all declined forms of the single Latin (proper) noun on a
  page and dispatch each form page to process_form() via blib.do_edit().

  Bails out (with a warning) when the Latin section is missing, when
  multiple nouns / both a noun and a proper noun are present, or when the
  noun is indeclinable. Uses module globals: args, site, process_form.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  # Find exactly one {{la-noun}} or {{la-proper noun}} headword.
  saw_noun = None
  saw_proper_noun = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-noun":
      if saw_noun:
        pagemsg("WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping" % (
          unicode(saw_noun), unicode(t)))
        return
      saw_noun = t
    elif tn == "la-proper noun":
      if saw_proper_noun:
        pagemsg("WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping" % (
          unicode(saw_proper_noun), unicode(t)))
        return
      saw_proper_noun = t
  if saw_noun and saw_proper_noun:
    pagemsg("WARNING: Saw both noun and proper noun, can't correct header/headword")
    return
  if not saw_noun and not saw_proper_noun:
    pagemsg("WARNING: Saw neither noun nor proper noun, can't correct header/headword")
    return
  pos = "pn" if saw_proper_noun else "n"
  ht = saw_proper_noun or saw_noun
  if getparam(ht, "indecl"):
    pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
    return
  # Turn the headword into a {{la-generate-noun-forms}} call by renaming
  # it and stripping headword-only params.
  generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
  blib.set_template_name(generate_template, "la-generate-noun-forms")
  blib.remove_param_chain(generate_template, "lemma", "lemma")
  blib.remove_param_chain(generate_template, "m", "m")
  blib.remove_param_chain(generate_template, "f", "f")
  blib.remove_param_chain(generate_template, "g", "g")
  rmparam(generate_template, "type")
  rmparam(generate_template, "indecl")
  rmparam(generate_template, "id")
  rmparam(generate_template, "pos")
  result = expand_text(unicode(generate_template))
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  tempargs = blib.split_generate_args(result)
  # Collect unique (slot, form) pairs, skipping multiword/bracketed forms
  # and forms identical to the lemma page.
  forms_seen = set()
  slots_and_forms_to_process = []
  for slot, formarg in tempargs.iteritems():
    forms = formarg.split(",")
    for form in forms:
      if "[" in form or "|" in form:
        continue
      form_no_macrons = lalib.remove_macrons(form)
      if form_no_macrons == pagetitle:
        continue
      if form_no_macrons in forms_seen:
        continue
      forms_seen.add(form_no_macrons)
      slots_and_forms_to_process.append((slot, form))
  # Process the form pages in alphabetical (macron-less) order.
  for index, (slot, form) in blib.iter_items(
      sorted(slots_and_forms_to_process, key=lambda x: lalib.remove_macrons(x[1]))):
    def handler(page, index, parsed):
      return process_form(page, index, slot, form, pos)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)), index, handler,
      save=args.save, verbose=args.verbose, diff=args.diff)
def process_text_on_page(index, pagetitle, text):
  """Convert old-style German verb headword templates ({{de-verb-old}},
  {{de-verb-weak}}, {{de-verb-strong}} or {{head|de|verb}}) to new-style
  {{de-verb}}, validating the old headword's principal parts against the
  forms produced by the page's {{de-conj}} template.

  Returns (new_text, notes) on success; returns None (implicitly) to abort
  the page when an unexpected template configuration is seen.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  global args

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  pagemsg("Processing")

  parsed = blib.parse_text(text)

  # headt holds the most recent headword template not yet matched with its
  # {{de-conj}}; headtn its name.
  headt = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in ["de-verb-old", "de-verb-strong", "de-verb-weak"
        ] or tn == "head" and getparam(t, "1") == "de" and getparam(
        t, "2") == "verb":
      if headt:
        pagemsg("WARNING: Encountered headword twice without declension: old %s, current %s" % (
          unicode(headt), unicode(t)))
        return
      headt = t
      headtn = tn
    if tn == "de-conj":
      if not headt:
        pagemsg("WARNING: Encountered conj without headword: %s" % unicode(t))
        return
      # A bare auxiliary in param 4 (h/haben/s/sein) is tolerated even when
      # not in the allowed-param list for the headword type.
      param4_ignorable = False
      if getparam(headt, "4") in ["h", "haben", "s", "sein"]:
        param4_ignorable = True
      # Refuse to convert a headword carrying params we don't know how to
      # carry over.
      for param in headt.params:
        pn = pname(param)
        pv = unicode(param.value)
        if not pv:
          continue
        if headtn == "head":
          allowed_params = ["1", "2", "head"]
        elif headtn == "de-verb-weak":
          allowed_params = ["1", "2", "3", "auxiliary", "cat"]
        elif headtn == "de-verb-strong":
          allowed_params = [
            "1", "2", "3", "class", "class 2", "pres 2", "pres 2 qual",
            "past 2", "past 2 qual", "past participle 2",
            "past participle 2 qual", "past subjunctive",
            "past subjunctive 2", "past subjunctive 2 qual", "auxiliary",
            "cat"
          ]
        else:
          allowed_params = ["head"]
        if param4_ignorable:
          allowed_params.append("4")
        if pn not in allowed_params:
          pagemsg("WARNING: Encountered unknown param %s=%s in %s" % (
            pn, pv, unicode(headt)))
          return

      def canonicalize_existing(forms):
        # Split alternatives written as "X or Y" (or with italics markers)
        # into separate comma-delimited forms and strip wikilinks.
        forms = [re.sub(" '*or'* ", ",", form) for form in forms]
        forms = [
          splitform for form in forms for splitform in form.split(",")
        ]
        return [blib.remove_links(form) for form in forms if form]

      def compare(old, new, entities_compared):
        # Set-compare old headword values against generated ones; an empty
        # old list vacuously passes.
        if not old:
          return True
        if set(old) != set(new):
          pagemsg("WARNING: Old %s %s disagree with new %s %s: head=%s, decl=%s" % (
            entities_compared, ",".join(old), entities_compared,
            ",".join(new), unicode(headt), unicode(t)))
          return False
        return True

      def fetch_aux():
        # Normalize auxiliary= (or fallback param 4) into a list of
        # "haben"/"sein"; None signals an unrecognized value.
        aux = getparam(headt, "auxiliary")
        if aux in ["haben", "sein"]:
          aux = [aux]
        elif aux == "both":
          aux = ["haben", "sein"]
        elif not aux:
          aux = []
        else:
          pagemsg("WARNING: Unrecognized auxiliary=%s, skipping: %s" % (
            aux, unicode(headt)))
          return None
        if not aux:
          param4 = getparam(headt, "4")
          if param4 in ["h", "haben"]:
            aux = ["haben"]
          elif param4 in ["s", "sein"]:
            aux = ["sein"]
        return aux

      if headtn == "de-verb-weak":
        generate_template = re.sub(
          r"^\{\{de-conj(?=[|}])",
          "{{User:Benwing2/de-generate-verb-props", unicode(t))
        result = expand_text(generate_template)
        if not result:
          continue
        forms = blib.split_generate_args(result)
        pres_3s = canonicalize_existing([getparam(headt, "1")])
        past = canonicalize_existing([getparam(headt, "2")])
        pp = canonicalize_existing([getparam(headt, "3")])
        aux = fetch_aux()
        if aux is None:
          return
        if (not compare(pres_3s, forms.get("pres_3s", "-").split(","), "pres 3sgs")
            or not compare(past, forms.get("pret_3s", "-").split(","), "pasts")
            or not compare(pp, forms.get("perf_part", "-").split(","), "pp's")
            or not compare(aux, forms.get("aux", "-").split(","), "auxes")):
          # Mismatch: leave this headword alone and look for the next one.
          headt = None
          continue
      if headtn == "de-verb-strong":
        generate_template = re.sub(
          r"^\{\{de-conj(?=[|}])",
          "{{User:Benwing2/de-generate-verb-props", unicode(t))
        result = expand_text(generate_template)
        if not result:
          continue
        forms = blib.split_generate_args(result)
        # Strong verbs may carry second variants ("pres 2", "past 2", ...)
        # plus past subjunctive and class info.
        pres_3s = canonicalize_existing(
          [getparam(headt, "1"), getparam(headt, "pres 2")])
        past = canonicalize_existing(
          [getparam(headt, "2"), getparam(headt, "past 2")])
        pp = canonicalize_existing([
          getparam(headt, "3"), getparam(headt, "past participle 2")
        ])
        past_subj = canonicalize_existing([
          getparam(headt, "past subjunctive"),
          getparam(headt, "past subjunctive 2")
        ])
        clazz = canonicalize_existing(
          [getparam(headt, "class"), getparam(headt, "class 2")])
        aux = fetch_aux()
        if aux is None:
          return
        if (not compare(pres_3s, forms.get("pres_3s", "-").split(","), "pres 3sgs")
            or not compare(past, forms.get("pret_3s", "-").split(","), "pasts")
            or not compare(pp, forms.get("perf_part", "-").split(","), "pp's")
            or not compare(past_subj, forms.get("subii_3s", "-").split(","), "past subjs")
            or not compare(aux, forms.get("aux", "-").split(","), "auxes")
            or not compare(clazz, forms.get("class", "-").split(","), "classes")):
          headt = None
          continue
      # All checks passed (or the headword type needs none): rewrite the
      # headword in place as {{de-verb}} with at most the conj's arg 1.
      del headt.params[:]
      blib.set_template_name(headt, "de-verb")
      arg1 = getparam(t, "1")
      if arg1:
        headt.add("1", arg1)
      notes.append("replace {{%s|...}} with new-style {{de-verb%s}}" % (
        headtn == "head" and "head|de|verb" or headtn,
        (arg1 and "|" + arg1 or "")))
      headt = None
    # NOTE(review): this compares the *current* template against its
    # original text; the headword rewrite above mutates a template visited
    # in an earlier iteration, so this message may never fire for it —
    # confirm intent.
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

  return unicode(parsed), notes
def process_section(index, pagetitle, sectext):
  """Convert a manual Belarusian conjugation table ({{be-conj-manual}}) in
  `sectext` to the automatic {{be-conj}}, driven by a hidden
  "<!-- type ... -->" comment left in the section.

  The manual table's forms are checked against the forms produced by
  {{User:Benwing2/be-generate-verb-forms}} (via compare_forms()); only on a
  match is the template replaced and the type comment removed.

  Returns (possibly-modified sectext, notes).

  Fix: previously `re.sub(" PPP[=:].*", "", autoconj)` ran *before* the
  missing-comment check, so a section without any type comment crashed with
  a TypeError on None instead of emitting the WARNING.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  notes = []

  # Find exactly one {{be-conj-manual}}.
  conjt = None
  parsed = blib.parse_text(sectext)
  for t in parsed.filter_templates():
    tn = tname(t)
    forms = {}
    if tn == "be-conj-manual":
      if conjt:
        pagemsg("WARNING: Saw two conjugation templates %s and %s, skipping" % (
          unicode(conjt), unicode(t)))
        return sectext, notes
      conjt = t
  if not conjt:
    pagemsg("WARNING: Couldn't find conjugation template")
    return sectext, notes

  # Find exactly one "<!-- type ... -->" comment.
  autoconj = None
  for m in re.finditer("<!-- type (.*?) -->", sectext):
    if autoconj:
      pagemsg("WARNING: Saw two autoconj comments %s and %s, skipping" % (
        autoconj, m.group(1)))
      return sectext, notes
    autoconj = m.group(1)
  # Bail out *before* manipulating autoconj if no comment was found (the
  # re.sub below would raise TypeError on None).
  if autoconj is None:
    pagemsg("WARNING: Couldn't find autoconj comment")
    return sectext, notes
  # Strip any trailing " PPP=..."/" PPP:..." annotation from the type spec.
  autoconj = re.sub(" PPP[=:].*", "", autoconj)
  if " " in autoconj:
    pagemsg("WARNING: Space in autoconj, skipping: %s" % autoconj)
    return sectext, notes
  if not autoconj:
    pagemsg("WARNING: Couldn't find autoconj comment")
    return sectext, notes

  # A spec not already in "((...))" multi-word form needs the infinitive
  # prepended: "inf<type>".
  if not autoconj.startswith("(("):
    infinitive = getparam(conjt, "infinitive").strip()
    if not infinitive:
      pagemsg("WARNING: Couldn't find infinitive=: %s" % unicode(conjt))
      return sectext, notes
    autoconj = "%s<%s>" % (infinitive, autoconj)

  tempcall = "{{User:Benwing2/be-generate-verb-forms|%s}}" % autoconj
  result = expand_text(tempcall)
  if not result:
    return sectext, notes
  pagemsg(result)
  predforms = blib.split_generate_args(result)

  # Harvest the manual table's forms. The manual template has a combined
  # present/future row whose meaning depends on aspect= (perfective verbs
  # use it as a future).
  forms = {}
  aspect = getparam(conjt, "aspect").strip()
  for slot in be_conj_slots:
    form = getparam(conjt, slot).strip()
    if form and form != "-":
      if slot.startswith("pres_futr_"):
        if aspect == "pf":
          forms[slot.replace("pres_", "")] = form
        else:
          forms[slot.replace("futr_", "")] = form
      else:
        forms[slot] = form

  if compare_forms(autoconj, forms, predforms, pagemsg):
    origt = unicode(conjt)
    conjt.name = "be-conj"
    del conjt.params[:]
    conjt.add("1", autoconj)
    newt = unicode(conjt)
    pagemsg("Replaced %s with %s" % (origt, newt))
    notes.append("replace {{be-conj-manual|...}} with %s" % newt)

  sectext = unicode(parsed)
  if notes:
    # Drop the now-redundant type comment once the conversion happened.
    sectext = re.sub("<!-- type (.*?) -->", "", sectext)
  return sectext, notes
def process_page(index, page, save, verbose, nouns, adjectives):
  """For each perfective-less {{ru-conj}} verb on `page`, derive candidate
  verbal nouns (-ние/-тие/-ение), agent nouns (-тель) and verbal adjectives
  (-тельный) from the past passive participle, and emit msg() lines for
  those candidates that actually exist in the supplied `nouns`/`adjectives`
  sets and still lack an etymology.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  # Reflexive verbs (ending in -ся/-сь) are skipped outright.
  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")
    return

  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    # NOTE: this local deliberately (or accidentally) shadows any
    # module-level tname() helper within this function.
    tname = unicode(t.name)
    if tname == "ru-conj":
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      if "infinitive" not in args:
        # e.g. обнимать
        pagemsg("WARNING: No infinitive")
        continue
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
        continue
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
        continue
      # Past passive participle is the base for all derived candidates.
      ppp = form_ppp(conjtype, pagetitle, args)
      if not ppp:
        continue
      # Map the participle ending to the corresponding noun/adjective
      # suffixes: -тый → -тие/-тельный, -ённый/-енный → -ение/-ительный,
      # -анный/-янный → -ание/-яние and -ательный/-ятельный.
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      else:
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      # Agent noun = verbal adjective minus the final -ный.
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      # Stressed variants: add an acute (combining U+0301) after an initial
      # vowel of the suffix.
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́",
        verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́",
        verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "",
        stressed_verbal_adj_suffix)
      # Build the etymology "stem" spec; class-7/8 verbs need an explicit
      # alternative stem appended via +alt1=.
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
        else:
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(
          t, "3").replace(u"ё", u"е́") + "-"
      else:
        stem = rulib.remove_monosyllabic_accents(infinitive)
      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg,
          expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          # find_noun signals "already has etymology" with -1.
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
        else:
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
          else:
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))
      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg,
          expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
        else:
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))
      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg,
          expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
        else:
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))
def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
  """Extract the headword (lemma) string(s) from a Latin headword template
  `t`, falling back to `pagename` when nothing usable is found.

  For adjective-, noun- and verb-type headword templates whose lemma isn't
  given explicitly, the template is cloned into the corresponding
  {{la-generate-*-forms}} call and the linked nominative (or, for verbs,
  first/third-person indicative) slot of the expansion is used.

  Always returns a list of strings.
  """
  if not expand_text:
    # Default expander; rebinding the parameter name with a local def is
    # intentional here.
    def expand_text(tempcall):
      return blib.expand_text(tempcall, pagename, pagemsg, False)
  tn = tname(t)
  if tn in [
      "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det",
      "la-pronoun"
  ]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      retval = getparam(t, "1")
      # Declension specs (angle brackets, ((...)) alternants, spaces or
      # hyphens) can't be used directly; run the generator to recover the
      # linked nominative.
      if "<" in retval or "((" in retval or " " in retval or "-" in retval:
        generate_template = blib.parse_text(
          unicode(t)).filter_templates()[0]
        blib.set_template_name(generate_template, "la-generate-adj-forms")
        blib.remove_param_chain(generate_template, "comp", "comp")
        blib.remove_param_chain(generate_template, "sup", "sup")
        blib.remove_param_chain(generate_template, "adv", "adv")
        blib.remove_param_chain(generate_template, "lemma", "lemma")
        rmparam(generate_template, "type")
        # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
        rmparam(generate_template, "indecl")
        rmparam(generate_template, "id")
        rmparam(generate_template, "pos")
        result = expand_text(unicode(generate_template))
        if not result:
          pagemsg("WARNING: Error generating forms, skipping")
          retval = ""
        else:
          args = blib.split_generate_args(result)
          if "linked_nom_sg_m" in args:
            retval = args["linked_nom_sg_m"]
          elif "linked_nom_pl_m" in args:
            retval = args["linked_nom_pl_m"]
          else:
            pagemsg(
              "WARNING: Can't locate lemma in {{la-generate-adj-forms}} result: generate_template=%s, result=%s"
              % (unicode(generate_template), result))
            retval = ""
        # NOTE(review): on the error paths above this yields [""], which is
        # truthy and so skips the `retval or pagename` fallback below —
        # confirm whether that is intended.
        retval = retval.split(",")
      else:
        # Plain lemma possibly followed by "/decl-type"; keep the lemma part.
        retval = re.sub("/.*", "", retval)
  elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      generate_template = blib.parse_text(
        unicode(t)).filter_templates()[0]
      blib.set_template_name(generate_template, "la-generate-noun-forms")
      blib.remove_param_chain(generate_template, "lemma", "lemma")
      blib.remove_param_chain(generate_template, "m", "m")
      blib.remove_param_chain(generate_template, "f", "f")
      blib.remove_param_chain(generate_template, "g", "g")
      rmparam(generate_template, "type")
      # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
      rmparam(generate_template, "indecl")
      rmparam(generate_template, "id")
      rmparam(generate_template, "pos")
      result = expand_text(unicode(generate_template))
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        retval = ""
      else:
        args = blib.split_generate_args(result)
        if "linked_nom_sg" in args:
          retval = args["linked_nom_sg"]
        elif "linked_nom_pl" in args:
          retval = args["linked_nom_pl"]
        else:
          pagemsg(
            "WARNING: Can't locate lemma in {{la-generate-noun-forms}} result: generate_template=%s, result=%s"
            % (unicode(generate_template), result))
          retval = ""
      retval = retval.split(",")
  elif tn in ["la-verb", "la-suffix-verb"]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      generate_template = blib.parse_text(
        unicode(t)).filter_templates()[0]
      blib.set_template_name(generate_template, "la-generate-verb-forms")
      rmparam(generate_template, "id")
      result = expand_text(unicode(generate_template))
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        retval = ""
      else:
        args = blib.split_generate_args(result)
        # Prefer present active indicative slots, then perfect; first-person
        # before third-person.
        for slot in [
            "linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
            "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"
        ]:
          if slot in args:
            retval = args[slot]
            break
        else: # no break
          pagemsg(
            "WARNING: Can't locate lemma in {{la-generate-verb-forms}} result: generate_template=%s, result=%s"
            % (unicode(generate_template), result))
          retval = ""
      retval = retval.split(",")
  elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
      tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
    retval = getparam(t, "1")
  elif tn == "la-letter":
    retval = pagename
  elif tn in ["head", "la-prep"]:
    retval = blib.fetch_param_chain(t, "head", "head")
  elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
    retval = blib.fetch_param_chain(t, "1", "head")
  else:
    pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
    retval = ""
  # Fall back to the page name, and normalize to a list.
  retval = retval or pagename
  if type(retval) is not list:
    retval = [retval]
  return retval
def process_page(page, index, parsed):
  """Replace the page's single {{ru-decl-noun-see}} with a {{ru-noun-table}}
  built from the params of the single {{ru-noun+}}/{{ru-proper noun+}}
  headword, reconciling the differing defaults for n= (number) between
  proper-noun headwords (default sg) and ru-noun-table (default both).

  Returns (new_text, changelog) on success, or None/no value on skip.
  """
  global args
  verbose = args.verbose
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  parsed = blib.parse(page)

  # Require exactly one headword template and one ru-decl-noun-see.
  headword_template = None
  see_template = None
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      if headword_template:
        pagemsg("WARNING: Multiple headword templates, skipping")
        return
      headword_template = t
    if unicode(t.name) in ["ru-decl-noun-see"]:
      if see_template:
        pagemsg("WARNING: Multiple ru-decl-noun-see templates, skipping")
        return
      see_template = t
  if not headword_template:
    pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
    return
  if not see_template:
    pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
    return

  # Rebuild the see-template in place from the headword's params and rename
  # it to ru-noun-table.
  del see_template.params[:]
  for param in headword_template.params:
    see_template.add(param.name, param.value)
  see_template.name = "ru-noun-table"

  if unicode(headword_template.name) == "ru-proper noun+":
    # Things are trickier for proper nouns because they default to n=sg, whereas
    # ru-noun-table defaults to n=both. We have to expand both templates and
    # fetch the value of n, and set it in ru-noun-table if not the same.

    # 1. Generate args for headword proper-noun template, using |ndef=sg
    #    because ru-proper noun+ defaults to sg and ru-generate-noun-args
    #    would otherwise default to both.
    headword_generate_template = re.sub(r"^\{\{ru-proper noun\+",
      "{{ru-generate-noun-args", unicode(headword_template))
    headword_generate_template = re.sub(r"\}\}$", "|ndef=sg}}",
      headword_generate_template)
    headword_generate_result = expand_text(headword_generate_template)
    if not headword_generate_result:
      pagemsg("WARNING: Error generating ru-proper noun+ args")
      return None
    # 2. Fetch actual value of n.
    headword_args = blib.split_generate_args(headword_generate_result)
    headword_n = headword_args["n"]
    # 3. If sg, we always need to set n=sg explicitly in ru-noun-table.
    if headword_n == "s":
      see_template.add("n", "sg")
    # 4. If pl, leave alone, since both will default to plural only if the
    #    lemma is pl, else n=pl needs to be set for both.
    elif headword_n == "p":
      pass
    # 5. If both, n=both had to have been set explicitly in the headword,
    #    but it's the default in ru-noun-table unless the lemma is plural.
    #    So remove n=both, generate the arguments, and see if the actual
    #    value of args.n is b (for "both"); if not, set n=both.
    else:
      assert headword_n == "b"
      rmparam(see_template, "n")
      see_generate_template = re.sub(r"^\{\{ru-noun-table",
        "{{ru-generate-noun-args", unicode(see_template))
      see_generate_result = expand_text(see_generate_template)
      if not see_generate_result:
        pagemsg("WARNING: Error generating ru-noun-table args")
        return None
      see_args = blib.split_generate_args(see_generate_result)
      if see_args["n"] != "b":
        see_template.add("n", "both")

  return unicode(
    parsed
  ), "Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)" % unicode(
    headword_template.name)
def process_page_section(index, page, section, verbose):
  """Synchronize the single {{ru-noun-table}} in `section` with the single
  {{ru-noun+}}/{{ru-proper noun+}} headword: strip stray g=/m=/f= params
  from the table, normalize n= for proper nouns, and overwrite the headword
  from the cleaned table (preserving gender/m/f/notrcat), or copy linked
  forms from the headword back into the table.

  Returns None on any skip condition, otherwise a 5-tuple:
  (new_text, table_cleaned, table_link_copied, ru_noun_changed,
   ru_proper_noun_changed) where the last four are 0/1 flags.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  # Any ru-decl-noun-see means this section is handled elsewhere.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
        ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  # Old-style (non-plus) headwords are out of scope.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # NOTE(review): decl_templates is computed but not used below.
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Pull out the headword-only params (gender, m/f correlatives, notrcat)
  # that must survive an overwrite, and build a copy of the headword with
  # them removed for comparison purposes.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Strip any g=/m=/f= params that leaked into the declension table.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
        unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Working copy of the cleaned table used to synthesize the new headword.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table",
      "{{ru-generate-noun-args", unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = blib.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
        generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = blib.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
            existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
          ndef_args["n"])

  # Compare the synthesized headword against the existing (filtered) one.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
    unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      # Headword carries wikilinks that the table lacks; if they're
      # otherwise identical, copy the linked params into the table instead
      # of overwriting the headword (which would erase the links).
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually" % (
          existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s" % (
        existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword:
    # Overwrite the headword from the table, then restore the headword-only
    # params saved above.
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)

  #genders = runounlib.check_old_noun_headword_forms(headword_template, args,
  #  subpagetitle, pagemsg)
  #if genders == None:
  #  return None
  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))
  #params_to_preserve = runounlib.fix_old_headword_params(headword_template,
  #  new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (
      orig_noun_table_template, new_noun_table_template))
  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (
      orig_headword_template, new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def process_page(page, index, parsed):
  """Convert each {{ru-decl-noun-z}} on `page` to an equivalent
  {{ru-noun-table}} (via runounlib.convert_zdecl_to_ru_noun_table), checking
  number/animacy against the old-style headword template when one is
  present.

  Returns (new_text, changelog).

  Fix: the result of blib.split_generate_args() used to be assigned to
  `args`, which — because of the `global args` declaration needed by
  expand_text() for `args.verbose` — clobbered the global argparse namespace
  with a plain dict. Any later expand_text() call (a second z-decl on the
  same page, or any subsequent page in the run) then failed with
  AttributeError. The local is now named `generate_args`.
  """
  global args
  pagetitle = unicode(page.title())
  subpagetitle = re.sub(".*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  parsed = blib.parse(page)

  # Locate the old-style headword template, if exactly one exists; with
  # zero or several we proceed without headword cross-checking.
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)
  headword_template = None
  if len(headword_templates) > 1:
    pagemsg(
      "WARNING: Multiple old-style headword templates, not sure which one to use, using none"
    )
    for ht in headword_templates:
      pagemsg("Ignored headword template: %s" % unicode(ht))
  elif len(headword_templates) == 0:
    pagemsg("WARNING: No old-style headword templates")
  else:
    headword_template = headword_templates[0]
    pagemsg("Found headword template: %s" % unicode(headword_template))

  num_z_decl = 0
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-z":
      num_z_decl += 1
      pagemsg("Found z-decl template: %s" % unicode(t))
      ru_noun_table_template = runounlib.convert_zdecl_to_ru_noun_table(
        t, subpagetitle, pagemsg, headword_template=headword_template)
      if not ru_noun_table_template:
        pagemsg("WARNING: Unable to convert z-decl template: %s" %
          unicode(t))
        continue
      if headword_template:
        generate_template = re.sub(r"^\{\{ru-noun-table",
          "{{ru-generate-noun-args", unicode(ru_noun_table_template))
        # ru-proper noun defaults to singular, so tell the generator.
        if unicode(headword_template.name) == "ru-proper noun":
          generate_template = re.sub(r"\}\}$", "|ndef=sg}}",
            generate_template)
        def pagemsg_with_proposed(text):
          pagemsg("Proposed ru-noun-table template: %s" %
            unicode(ru_noun_table_template))
          pagemsg(text)
        generate_result = expand_text(unicode(generate_template))
        if not generate_result:
          pagemsg_with_proposed(
            "WARNING: Error generating noun args, skipping")
          continue
        # Deliberately NOT named `args`: that would overwrite the global
        # argparse namespace (see docstring).
        generate_args = blib.split_generate_args(generate_result)
        # This will check number mismatch and animacy mismatch
        new_genders = runounlib.check_old_noun_headword_forms(
          headword_template, generate_args, subpagetitle,
          pagemsg_with_proposed)
        if new_genders == None:
          continue
      # Replace the z-decl template in place with the converted table.
      origt = unicode(t)
      t.name = "ru-noun-table"
      del t.params[:]
      for param in ru_noun_table_template.params:
        t.add(param.name, param.value)
      pagemsg("Replacing z-decl %s with regular decl %s" % (origt,
        unicode(t)))
  if num_z_decl > 1:
    pagemsg("WARNING: Found multiple z-decl templates (%s)" % num_z_decl)

  return unicode(parsed), "Replace ru-decl-noun-z with ru-noun-table"
def process_page(page, index, parsed):
  """Convert old-style Ukrainian/Belarusian noun headwords
  ({{uk/be-noun}}, {{uk/be-proper noun}}) to the new declension-spec style,
  after verifying the old headword's gender, nominative and genitive forms
  (singular and, for common nouns, plural) against the forms generated from
  the accompanying {{uk/be-ndecl}} template.

  The language is taken from args.lang ("uk" or "be"). Returns
  (new_text, notes).

  Fix: `genitive_plurals` is now initialized to None together with the
  other per-headword accumulators; previously it was the only one left
  uninitialized, relying on control flow never reaching the declension
  branch without a prior headword.
  """
  pagetitle = unicode(page.title())
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  pagemsg("Processing")

  # Per-headword state, reset (heads = None) after each processed or
  # mismatched declension.
  heads = None
  headt = None
  headtn = None
  gender_and_animacy = None
  genitives = None
  plurals = None
  genitive_plurals = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in [args.lang + "-noun", args.lang + "-proper noun"]:
      if heads:
        pagemsg(
          "WARNING: Encountered headword twice without declension: %s" %
          unicode(t))
        return
      headt = t
      headtn = tn
      heads = blib.fetch_param_chain(t, "1", "head")
      gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
      genitives = blib.fetch_param_chain(t, "3", "gen")
      plurals = blib.fetch_param_chain(t, "4", "pl")
      genitive_plurals = blib.fetch_param_chain(t, "5", "genpl")
    if tn == args.lang + "-ndecl":
      if not heads:
        pagemsg("WARNING: Encountered decl without headword: %s" %
          unicode(t))
        return
      generate_template = re.sub(
        r"^\{\{%s-ndecl\|" % args.lang,
        "{{User:Benwing2/%s-generate-prod-noun-props|" % args.lang,
        unicode(t))
      result = expand_text(generate_template)
      if not result:
        return
      new_forms = blib.split_generate_args(result)
      new_g = new_forms["g"].split(",")

      def compare(old, new, stuff, nocanon=False):
        # Set-compare old vs. generated values; unless nocanon, first strip
        # wikilinks and monosyllabic accent marks (language-specific
        # helper). Empty old passes vacuously.
        if not old:
          return True
        if not nocanon:
          remove_monosyllabic_accents = (
            uk.remove_monosyllabic_stress if args.lang == "uk" else
            be.remove_monosyllabic_accents)
          old = [
            remove_monosyllabic_accents(blib.remove_links(x)) for x in old
          ]
          new = [remove_monosyllabic_accents(x) for x in new]
        if set(old) != set(new):
          pagemsg(
            "WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s"
            % (stuff, ",".join(old), stuff, ",".join(new), unicode(headt),
            unicode(t)))
          return False
        return True

      if not compare(gender_and_animacy, new_g, "gender", nocanon=True):
        heads = None
        continue
      # Genders ending in "-p" mark plural-only lemmas; mixing plural-only
      # and regular genders is not convertible.
      is_plural = [x.endswith("-p") for x in new_g]
      if any(is_plural) and not all(is_plural):
        pagemsg(
          "WARNING: Mixture of plural-only and non-plural-only genders, can't process: %s"
          % unicode(t))
        return
      is_plural = any(is_plural)
      if is_plural:
        if (not compare(heads, new_forms.get("nom_p", "-").split(","), "nom pl")
            or not compare(genitives, new_forms.get("gen_p", "-").split(","), "gen pl")):
          heads = None
          continue
      else:
        if (not compare(heads, new_forms.get("nom_s", "-").split(","), "nom sg")
            or not compare(genitives, new_forms.get("gen_s", "-").split(","), "gen sg") or
            # 'uk/be-proper noun' headwords don't have nominative plural set
            headtn == args.lang + "-noun" and not compare(
              plurals, new_forms.get("nom_p", "-").split(","), "nom pl") or
            headtn == args.lang + "-noun" and not compare(
              genitive_plurals, new_forms.get("gen_p", "-").split(","), "gen pl")):
          heads = None
          continue
      # All forms agree: collapse the headword to just the declension spec.
      decl = getparam(t, "1")
      blib.set_param_chain(headt, [decl], "1", "head")
      blib.remove_param_chain(headt, "2", "g")
      blib.remove_param_chain(headt, "3", "gen")
      blib.remove_param_chain(headt, "4", "pl")
      blib.remove_param_chain(headt, "5", "genpl")
      notes.append("convert {{%s}} to new style using decl %s" % (
        unicode(headt.name), decl))
      heads = None

  return unicode(parsed), notes
def process_page(page, index, parsed):
  """Normalize the arguments of Russian conjugation templates ({{ru-conj}},
  {{ru-conj-old}} and their User:Benwing2 sandbox variants, plus
  {{temp|ru-conj|...}} wrappers).

  Old-style calls give a bare stem whose infinitive ending is implied by the
  conjugation class; new-style calls give the full (accented) infinitive.
  This converts old-style argument sets to new style, then — as a safety
  check — expands {{ru-generate-verb-forms}} with both the original and the
  normalized arguments and only rewrites the template if every generated
  form is identical.

  Returns (new_page_text, notes).  NOTE(review): the `parsed` parameter is
  shadowed — the page is re-parsed below via blib.parse(page) — and `text`
  is assigned but never used; confirm against the caller before cleanup.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errpagemsg(txt):
    # Log to both the normal and the error stream.
    msg("Page %s %s: %s" % (index, pagetitle, txt))
    errmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if tname(t) in [
        "ru-conj", "ru-conj-old", "User:Benwing2/ru-conj",
        "User:Benwing2/ru-conj-old"
    ] or tname(t) == "temp" and getparam(t, "1") == "ru-conj":
      verb_type, arg_sets = split_ru_conj_args(t, tname(t) == "temp")
      refl = "refl" in verb_type
      # Keep a deep copy so we can (a) detect whether anything changed and
      # (b) regenerate forms from the *original* arguments for comparison.
      orig_arg_sets = copy.deepcopy(arg_sets)
      rm_pres_stem = False
      ##### First, modify arg_sets according to normalized params
      for arg_set in arg_sets:
        # This complex spec matches matches 3°a, 3oa, 4a1a, 6c1a,
        # 1a6a, 6a1as13, 6a1as14, etc.
        m = re.search(u"^([0-9]+[°o0-9abc]*[abc]s?1?[34]?)", arg_set[0])
        if not m:
          m = re.search(
              u"^(irreg-?[абцдеѣфгчийклмнопярстувшхызёюжэщьъ%-]*)",
              arg_set[0])
        if not m:
          errpagemsg("Unrecognized conjugation type: %s" % arg_set[0])
          continue
        # "o" is an ASCII stand-in for the degree sign in class names.
        conj_type = m.group(1).replace("o", u"°")
        inf, tr = rulib.split_russian_tr(arg_set[1])
        # Heuristic: if the argument already ends in an infinitive ending
        # (plus -ся/-сь when reflexive), it is already new-style.
        if refl:
          new_style = re.search(u"([тч]ься|ти́?сь)$", inf)
        else:
          new_style = re.search(
              u"([тч]ь|ти́?)$" if conj_type.startswith("7")
              or conj_type.startswith("irreg") else u"[тч]ь$", inf)
        if new_style:
          if arg_set[0].startswith("irreg-"):
            # New-style irregular verbs use bare "irreg" (keeping any
            # "/..." variant suffix).
            arg_set[0] = re.sub("^irreg-.*?(/.*|$)", r"irreg\1", arg_set[0])
          arg_set[1] = rulib.paste_russian_tr(
              rulib.remove_monosyllabic_accents(inf),
              rulib.remove_tr_monosyllabic_accents(tr))
        else:
          # Old-style: synthesize the full infinitive from the stem
          # according to the conjugation class.  Manual translit is only
          # expected for classes 1, 2 and 4.
          if not re.search("^[124]", conj_type):
            assert not tr
          if conj_type in ["1a", "2a", "2b"]:
            inf += u"ть"
            if tr:
              tr += u"tʹ"
          elif conj_type in ["3a", u"3°a"]:
            inf += u"нуть"
          elif conj_type in ["3b", u"3c"]:
            inf += u"у́ть"
          elif conj_type == "4a":
            inf += u"ить"
            if tr:
              tr += u"itʹ"
          elif conj_type in ["4b", "4c"]:
            inf, tr = rulib.make_unstressed(inf, rulib.decompose(tr))
            inf += u"ить"
            if tr:
              tr += u"ítʹ"
          elif conj_type == "4a1a":
            inf = re.sub(u"[ая]$", "", inf) + u"ить"
            if tr:
              tr = re.sub("j?a$", "", tr) + u"itʹ"
          elif conj_type == "5a":
            # Class 5 takes the infinitive stem from arg 2 when present;
            # keep arg 2 only when the present stem isn't derivable.
            inf = arg_set[2] + u"ть" if arg_set[2] else arg_set[1] + u"еть"
            normal_pres_stem = re.sub(u"[еая]ть$", "", inf)
            if normal_pres_stem == arg_set[1]:
              arg_set[2] = ""
            else:
              arg_set[2] = arg_set[1]
          elif conj_type == "5b":
            inf = arg_set[2] + u"ть"
            normal_pres_stem = re.sub(u"[еая]́ть$", "", inf)
            if normal_pres_stem == arg_set[1]:
              arg_set[2] = ""
            else:
              arg_set[2] = arg_set[1]
          elif conj_type == "5c":
            inf = arg_set[2] + u"ть"
            normal_pres_stem = rulib.make_ending_stressed_ru(
                re.sub(u"[еая]́ть$", "", inf))
            if normal_pres_stem == arg_set[1]:
              arg_set[2] = ""
            else:
              arg_set[2] = arg_set[1]
          elif re.search(u"^6°?a", conj_type) or conj_type == "1a6a":
            assert not arg_set[3]
            if arg_set[2]:
              inf = arg_set[2] + u"ть"
              arg_set[2] = ""
              normal_pres_stem = rulib.make_ending_stressed_ru(
                  re.sub(u"а́ть$", "", inf))
              assert arg_set[1] == normal_pres_stem
            elif is_vowel_stem(inf):
              inf += u"ять"
            else:
              inf += u"ать"
            if getparam(t, "pres_stem"):
              # Move an explicit pres_stem= into the arg set; remember to
              # drop the named param when rebuilding the template.
              arg_set[2] = getparam(t, "pres_stem")
              rm_pres_stem = True
          elif re.search(u"^6°?b", conj_type):
            if is_vowel_stem(inf):
              inf += u"я́ть"
            else:
              inf += u"а́ть"
            # arg_set[2] (present stem) remains
          elif re.search(u"^6°?c", conj_type):
            inf = rulib.make_unstressed_once_ru(inf) + u"а́ть"
          elif conj_type in ["7a", "7b"]:
            pass # nothing needed to do
          elif conj_type in ["8a", "8b"]:
            inf = arg_set[2]
            arg_set[2] = arg_set[1]
          elif conj_type == "9a":
            inf += u"еть"
            # arg_set[2] (present stem) remains
          elif conj_type == "9b":
            inf = rulib.make_unstressed_once_ru(inf) + u"е́ть"
            # arg_set[2] (present stem) remains
            # arg_set[3] (optional past participle stem) remains
          elif conj_type == "10a":
            inf += u"оть"
          elif conj_type == "10c":
            inf += u"ть"
            if rulib.make_unstressed_once_ru(arg_set[2]) == re.sub(
                u"о́$", "", arg_set[1]):
              arg_set[2] = ""
          elif conj_type == "11a":
            inf += u"ить"
          elif conj_type == "11b":
            inf += u"и́ть"
            if arg_set[2] == arg_set[1]:
              arg_set[2] = ""
          elif conj_type == "12a":
            inf += u"ть"
            if arg_set[2] == arg_set[1]:
              arg_set[2] = ""
          elif conj_type == "12b":
            inf += u"ть"
            if rulib.make_ending_stressed_ru(arg_set[2]) == arg_set[1]:
              arg_set[2] = ""
          elif conj_type == "13b":
            inf += u"ть"
            assert re.sub(u"ва́ть$", "", inf) == arg_set[2]
            arg_set[2] = ""
          elif conj_type in ["14a", "14b", "14c"]:
            inf += u"ть"
            # arg_set[2] (present stem) remains
          elif conj_type in ["15a", "16a", "16b"]:
            inf += u"ть"
          elif conj_type == u"irreg-минуть":
            inf = u"мину́ть"
          elif conj_type == u"irreg-живописать-миновать":
            inf += u"ть"
            arg_set[2] = ""
          elif conj_type == u"irreg-слыхать-видать":
            inf += u"ть"
          elif conj_type == u"irreg-стелить-стлать":
            inf = arg_set[2] + inf + u"ть"
            arg_set[2] = ""
            arg_set[3] = ""
          elif conj_type == u"irreg-ссать-сцать":
            assert arg_set[2] == re.sub(u"а́$", "", inf)
            inf = arg_set[3] + inf + u"ть"
            arg_set[2] = ""
            arg_set[3] = ""
          elif conj_type in [
              u"irreg-сыпать", u"irreg-ехать", u"irreg-ѣхать"
          ]:
            # These irregulars stress the verb stem itself unless the
            # prefix is the always-stressed вы́-.
            infstem = re.sub("^irreg-", "", conj_type)
            if arg_set[1] != u"вы́":
              infstem = rulib.make_beginning_stressed_ru(infstem)
            inf = arg_set[1] + infstem
          elif conj_type == u"irreg-обязывать":
            if arg_set[1] == u"вы́":
              inf = u"вы́обязывать"
            else:
              inf = arg_set[1] + u"обя́зывать"
          elif conj_type == u"irreg-зиждиться":
            if arg_set[1] == u"вы́":
              inf = u"вы́зиждить"
            else:
              inf = arg_set[1] + u"зи́ждить"
          elif conj_type == u"irreg-идти":
            if not arg_set[1]:
              inf = u"идти́"
            elif arg_set[1] == u"вы́":
              inf = u"вы́йти"
            else:
              inf = arg_set[1] + u"йти́"
          elif re.search("^irreg-", conj_type):
            infstem = re.sub("^irreg-", "", conj_type)
            if arg_set[1] != u"вы́":
              infstem = rulib.make_ending_stressed_ru(infstem)
            inf = arg_set[1] + infstem
          else:
            error("Unknown conjugation type " + conj_type)
          if inf:
            if refl:
              # Append the reflexive suffix: -ся after a consonant-final
              # infinitive in -ть/-чь, -сь after vowel-final -ти́.
              if re.search(u"[тч]ь$", inf):
                inf += u"ся"
                if tr:
                  tr += "sja"
              else:
                assert re.search(u"и́?$", inf)
                inf += u"сь"
                if tr:
                  tr += u"sʹ"
            arg_set[1] = rulib.paste_russian_tr(
                rulib.remove_monosyllabic_accents(inf),
                rulib.remove_tr_monosyllabic_accents(tr))
      ##### If something changed ...
      if orig_arg_sets != arg_sets or rm_pres_stem:
        ##### ... compare the forms generated by the original and new
        ##### arguments and make sure they're the same.
        # (Skipped for User:Benwing2/ test pages.)
        if not pagetitle.startswith("User:Benwing2/"):
          # 1. Generate and expand the appropriate call to
          # {{ru-generate-verb-forms}} for the original arguments.
          orig_args = paste_arg_sets(orig_arg_sets, t, verb_type,
                                     rm_pres_stem=False, as_string=True)
          orig_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
              "|".join(orig_args),
              "|old=1" if tname(t).endswith("ru-conj-old") else "")
          orig_result = expand_text(orig_tempcall)
          if not orig_result:
            errpagemsg("WARNING: Error expanding original template %s"
                       % orig_tempcall)
            continue
          orig_forms = blib.split_generate_args(orig_result)
          # 2. Generate and expand the appropriate call to
          # {{ru-generate-verb-forms}} for the new arguments.
          new_args = paste_arg_sets(arg_sets, t, verb_type, rm_pres_stem,
                                    as_string=True)
          new_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
              "|".join(new_args),
              "|old=1" if tname(t).endswith("ru-conj-old") else "")
          new_result = expand_text(new_tempcall)
          if not new_result:
            errpagemsg("WARNING: Error expanding new template %s"
                       % new_tempcall)
            continue
          new_forms = blib.split_generate_args(new_result)
          # 3. Compare each form and accumulate a list of mismatches.
          all_keys = set(orig_forms.keys()) | set(new_forms.keys())
          def sort_numbers_first(key):
            # Zero-pad purely numeric keys so they sort numerically and
            # ahead of named keys.
            if re.search("^[0-9]+$", key):
              return "%05d" % int(key)
            return key
          all_keys = sorted(list(all_keys), key=sort_numbers_first)
          mismatches = []
          for key in all_keys:
            origval = orig_forms.get(key, "<<missing>>")
            newval = new_forms.get(key, "<<missing>>")
            if origval != newval:
              mismatches.append("%s: old=%s new=%s" % (key, origval, newval))
          # 4. If mismatches, output them and don't change anything.
          if mismatches:
            errpagemsg(
                "WARNING: Mismatch comparing old %s to new %s: %s"
                % (orig_tempcall, new_tempcall, " || ".join(mismatches)))
            continue
        # 5. If no mismatches, modify the template to contain the new args.
        new_params = paste_arg_sets(arg_sets, t, verb_type, rm_pres_stem,
                                    as_string=False,
                                    is_temp=tname(t) == "temp")
        del t.params[:]
        if tname(t) == "temp":
          t.add("1", "ru-conj")
        for name, value in new_params:
          t.add(name, value)
        # 6. Build up the save comment.
        orig_changed_params = paste_arg_sets(orig_arg_sets, t, verb_type,
                                             rm_pres_stem=False,
                                             as_string=True,
                                             change_only=True)
        new_changed_params = paste_arg_sets(arg_sets, t, verb_type,
                                            rm_pres_stem, as_string=True,
                                            change_only=True)
        notes.append("ru-conj: normalized %s to %s"
                     % ("|".join(orig_changed_params),
                        "|".join(new_changed_params)))
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
  return unicode(parsed), notes
def lookup_heads_and_inflections(pagename, pagemsg):
  """Look up the headwords and inflection relationships on `pagename`.

  Returns a pair (flag, value).  `flag` is "manual-override" for ignored or
  manually-specified terms, True for a cache hit, False otherwise.  `value`
  is None (page missing / bad name), "redirect", "no-russian", or a tuple
  (heads, inflections_of, adj_forms) where:
    heads          - set of (accented_form, translit, is_lemma) triples
    inflections_of - set of (frozenset_of_heads, lemma_name) pairs
    adj_forms      - set of adjective forms generated from {{ru-decl-adj}}
  Results are memoized in the module-global `accented_cache` unless
  `global_disable_cache` is set.
  """
  if semi_verbose:
    pagemsg("lookup_heads_and_inflections: Finding heads on page %s"
            % pagename)
  # Use our own expand_text() rather than passing it from the caller,
  # which may have a different value for PAGENAME; the proper value is
  # important in expanding certain templates e.g. ru-generate-adj-forms.
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose)
  if pagename in terms_to_ignore:
    pagemsg(
        "lookup_heads_and_inflections: Ignoring term because in terms_to_ignore: %s"
        % pagename)
    return "manual-override", None
  if pagename in manually_specified_inflections:
    # Manually-specified entries bypass page parsing entirely; lemma is
    # True when the accented form itself is the lemma.
    accented, lemma = manually_specified_inflections[pagename]
    if lemma is True:
      return "manual-override", ({(accented, "", True)}, set(), set())
    else:
      return "manual-override", ({(accented, "", False)}, {
          (frozenset({(accented, "", False)}), lemma)
      }, set())
  global num_cache_lookups
  num_cache_lookups += 1
  if pagename in accented_cache:
    global num_cache_hits
    num_cache_hits += 1
    result = accented_cache[pagename]
    if result is None:
      if semi_verbose:
        pagemsg(
            "lookup_heads_and_inflections: Page %s doesn't exist (cached)"
            % pagename)
    elif result == "redirect":
      if semi_verbose:
        pagemsg("lookup_heads_and_inflections: Page %s is redirect (cached)"
                % pagename)
    elif result == "no-russian":
      if semi_verbose:
        pagemsg(
            "lookup_heads_and_inflections: Page %s has no Russian section (cached)"
            % pagename)
    return True, result
  elif "\n" in pagename:
    # A newline in a title can never be a real page; cache the failure.
    pagemsg(
        "WARNING: lookup_heads_and_inflections: Bad pagename (has newline in it): %s"
        % pagename)
    if not global_disable_cache:
      accented_cache[pagename] = None
    return False, None
  else:
    cached = False  # NOTE(review): assigned but never used below
    page = pywikibot.Page(site, pagename)
    try:
      if not page.exists():
        if semi_verbose:
          pagemsg("lookup_heads_and_inflections: Page %s doesn't exist"
                  % pagename)
        if not global_disable_cache:
          accented_cache[pagename] = None
        return False, None
    except Exception as e:
      # page.exists() can throw (e.g. on invalid titles); treat as missing.
      pagemsg(
          "WARNING: lookup_heads_and_inflections: Error checking page existence: %s"
          % unicode(e))
      if not global_disable_cache:
        accented_cache[pagename] = None
      return False, None
    # Page exists, is it a redirect?
    if re.match("#redirect", page.text, re.I):
      if not global_disable_cache:
        accented_cache[pagename] = "redirect"
      pagemsg("lookup_heads_and_inflections: Page %s is redirect" % pagename)
      return False, "redirect"
    # Page exists and is not a redirect, find the info
    heads = set()
    inflections_of = set()
    adj_forms = set()
    foundrussian = False
    # Split on L2 section headers; odd indices hold the "==...==" header,
    # even indices the section body that follows it.
    sections = re.split("(^==[^=]*==\n)", unicode(page.text), 0, re.M)
    for j in xrange(2, len(sections), 2):
      if sections[j - 1] == "==Russian==\n":
        if foundrussian:
          pagemsg(
              "WARNING: lookup_heads_and_inflections: Found multiple Russian sections"
          )
          break
        foundrussian = True
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[j], 0, re.M)
        for k in xrange(2, len(subsections), 2):
          parsed = blib.parse_text(subsections[k])
          # Heads found in this subsection only; inflection-of entries are
          # associated with the heads of their own subsection.
          this_heads = set()
          def add(val, tr, is_lemma):
            val_to_add = blib.remove_links(val)
            # Remove monosyllabic accents to correctly handle the case of
            # рад, which has some heads with an accent and some without.
            val_to_add, tr = remove_monosyllabic_accents(val_to_add, tr)
            this_heads.add((val_to_add, tr, is_lemma))
          for t in parsed.filter_templates():
            tname = unicode(t.name)  # shadows any module-level tname helper
            check_addl_heads = False
            if tname in ru_head_templates:
              is_lemma = tname in ru_lemma_templates
              check_addl_heads = True
              if getparam(t, "1"):
                add(getparam(t, "1"), getparam(t, "tr"), is_lemma)
              elif getparam(t, "head"):
                add(getparam(t, "head"), getparam(t, "tr"), is_lemma)
              else:
                add(pagename, "", is_lemma)
            elif tname == "head" and getparam(t, "1") == "ru":
              is_lemma = getparam(t, "2") in ru_lemma_poses
              check_addl_heads = True
              if getparam(t, "head"):
                add(getparam(t, "head"), getparam(t, "tr"), is_lemma)
              else:
                add(pagename, "", is_lemma)
            elif tname in ["ru-noun+", "ru-proper noun+"]:
              is_lemma = True
              lemma = rulib.fetch_noun_lemma(t, expand_text)
              lemmas = re.split(",", lemma)
              lemmas = [
                  split_ru_tr(lemma, pagemsg) for lemma in lemmas
              ]
              # Group lemmas by Russian, to group multiple translits
              lemmas = rulib.group_translits(lemmas, pagemsg, semi_verbose)
              for val, tr in lemmas:
                add(val, tr, is_lemma)
            elif (tname == "ru-participle of" or
                  tname in inflection_templates and
                  getparam(t, "lang") == "ru"):
              inflections_of.add(
                  (frozenset(this_heads),
                   normalize_text(getparam(t, "1"))))
            # head2=/tr2= ... head9=/tr9= carry extra heads; this if/elif
            # chain also picks up {{ru-decl-adj}} (which never sets
            # check_addl_heads above).
            if check_addl_heads:
              for i in xrange(2, 10):
                headn = getparam(t, "head" + str(i))
                if headn:
                  add(headn, getparam(t, "tr" + str(i)), is_lemma)
            elif tname == "ru-decl-adj":
              result = expand_text(
                  re.sub(r"^\{\{ru-decl-adj", "{{ru-generate-adj-forms",
                         unicode(t)))
              if not result:
                pagemsg(
                    "WARNING: lookup_heads_and_inflections: Error expanding template %s, page %s"
                    % (unicode(t), pagename))
              else:
                args = blib.split_generate_args(result)
                for value in args.itervalues():
                  adj_forms.add(value)
          heads.update(this_heads)
    # No Russian L2 section at all: cache and report that.
    if not foundrussian:
      if not global_disable_cache:
        accented_cache[pagename] = "no-russian"
      pagemsg(
          "lookup_heads_and_inflections: Page %s has no Russian section"
          % pagename)
      return False, "no-russian"
    saw_lemma = any(is_lemma for ru, tr, is_lemma in heads)
    if not saw_lemma and not inflections_of:
      # If no lemmas or inflections found, check for alt-ё templates.
      # If the term is a non-ё variant of a single term with ё, look up
      # and return the heads and inflections on that page.
      parsed = blib.parse_text(unicode(page.text))
      yo_pages = set()
      for t in parsed.filter_templates():
        if unicode(t.name) in alt_yo_templates:
          yo_pages.add(getparam(t, "1"))
      if len(yo_pages) > 1:
        pagemsg(
            u"WARNING: lookup_heads_and_inflections: Found multiple alt-ё templates for different lemmas: %s"
            % ",".join(yo_pages))
      elif len(yo_pages) == 0:
        pagemsg(
            "WARNING: lookup_heads_and_inflections: Found no lemmas or inflections of lemmas for %s"
            % pagename)
      else:
        yoful_page = list(yo_pages)[0]
        pagemsg(
            "lookup_heads_and_inflections: Redirecting from %s to %s"
            % (pagename, yoful_page))
        # Recurse into the ё-ful page; its result (not cached under our
        # own pagename) is returned directly.
        return lookup_heads_and_inflections(yoful_page, pagemsg)
    cacheval = (heads, inflections_of, adj_forms)
    if not global_disable_cache:
      accented_cache[pagename] = cacheval
    return False, cacheval
for param in generate_template.params: proposed_decl.add(param.name, param.value) def pagemsg_with_proposed(text): pagemsg( "Proposed new template (WARNING, omits explicit gender and params to preserve from old template): %s" % proposed_template_text) pagemsg(text) if headword_is_proper: generate_template.add("ndef", "sg") generate_result = expand_text(unicode(generate_template)) if not generate_result: pagemsg_with_proposed("WARNING: Error generating noun args, skipping") return genargs = blib.split_generate_args(generate_result) if headword_is_proper and genargs["n"] == "s" and not getparam( proposed_decl, "n"): proposed_decl.add("n", "sg") # This will check number mismatch (and animacy mismatch, but that shouldn't # occur as we've taken the animacy directly from the headword) new_genders = runounlib.check_old_noun_headword_forms( headword_template, genargs, subpagetitle, pagemsg_with_proposed, laxer_comparison=True) if new_genders == None: return None