def process_page(page, index, parsed):
  """Rewrite direct {{#invoke:form of...}} calls to newer entry points.

  Three rewrites, applied per template:
  1. |template_tags -> |tagged_form_of_t in {{#invoke:form of/templates}}.
  2. {{#invoke:form of|form_of_t/alt_form_of_t}}: fold |ignorelist= params
     into |ignore= (with a ":list" suffix) and rename the module to
     "form of/templates".
  3. alt_form_of_t calls: move |text= into |2=, invert the nocap/nodot
     defaults into explicit withcap/withdot flags, and switch to form_of_t.

  Returns (new_page_text, notes) in the usual blib per-page convention.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    # tn is captured before any renaming below, so the later checks in this
    # loop body still match against the template's *original* name even after
    # blib.set_template_name() has run.
    tn = tname(t)
    if tn == "#invoke:form of/templates" and getparam(t, "1") == "template_tags":
      t.add("1", "tagged_form_of_t")
      notes.append("Rewrite {{#invoke:form of/templates|template_tags}} with {{#invoke:form of/templates|tagged_form_of_t}}")
    if tn == "#invoke:form of" and getparam(t, "1") in ["form_of_t", "alt_form_of_t"]:
      ignorelist = blib.fetch_param_chain(t, "ignorelist", "ignorelist")
      if ignorelist:
        # Append each ignorelist value to the ignore chain, tagged ":list",
        # then drop the now-redundant ignorelist params.
        ignore = blib.fetch_param_chain(t, "ignore", "ignore")
        for il in ignorelist:
          ignore.append(il + ":list")
        blib.set_param_chain(t, ignore, "ignore", "ignore", before="ignorelist")
        blib.remove_param_chain(t, "ignorelist", "ignorelist")
      blib.set_template_name(t, "#invoke:form of/templates")
      notes.append("Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}" % getparam(t, "1"))
    if tn == "#invoke:form of" and getparam(t, "1") == "alt_form_of_t":
      # alt_form_of_t carried the display text in |text=; form_of_t takes it
      # in |2=. nocap/nodot were opt-outs, withcap/withdot are opt-ins, so
      # absence of the old flag becomes presence of the new one.
      t.add("2", getparam(t, "text"), before="text")
      rmparam(t, "text")
      if t.has("nocap"):
        rmparam(t, "nocap")
      else:
        t.add("withcap", "1")
      if t.has("nodot"):
        rmparam(t, "nodot")
      else:
        t.add("withdot", "1")
      t.add("1", "form_of_t")
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_page(page, index, parsed):
  """Add an alternative genitive singular in -и to Ukrainian nouns whose
  genitive singular ends in -і (i.e. nouns in -сть).

  Handles both the headword template {{uk-noun}} (adds |gen2=) and the
  declension-table templates {{uk-decl-noun}}/{{uk-decl-noun-unc}}/
  {{uk-decl-noun-pl}} (appends ", ...и" to the genitive cell).

  Returns (new_page_text, notes) in the usual blib per-page convention.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  # (removed dead local `head = None`; it was never read or written again)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "uk-noun":
      gen = blib.fetch_param_chain(t, "3", "gen")
      # Only add gen2 when there is exactly one genitive and it ends in -і.
      if len(gen) == 1 and gen[0].endswith(u"і"):
        gen2 = gen[0][0:-1] + u"и"
        t.add("gen2", gen2, before="4")
    elif tn in ["uk-decl-noun", "uk-decl-noun-unc", "uk-decl-noun-pl"]:
      # The genitive lives in |3= for the full paradigm, |2= otherwise.
      gensparam = 3 if tn == "uk-decl-noun" else 2
      gens = getparam(t, str(gensparam))
      # Skip cells that already list alternatives (contain a comma).
      if "," not in gens and gens.endswith(u"і"):
        gens += ", " + gens[0:-1] + u"и"
        t.add(str(gensparam), gens)
    if origt != unicode(t):
      notes.append(u"add alternative genitive singular to Ukrainian nouns ending in -сть")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def get_head_param(t, pagetitle):
  """Return the list of headword(s) given by an Old English headword template.

  Falls back to [pagetitle] when the template supplies no explicit heads.
  Returns None when T is not a recognized Old English headword template.
  """
  numeric_head_templates = [
    "ang-adj", "ang-adj-comp", "ang-adj-sup", "ang-adv", "ang-adv-comp",
    "ang-adv-sup", "ang-verb",
  ]
  named_head_templates = [
    "ang-noun", "ang-noun-form", "ang-verb-form", "ang-adj-form", "ang-con",
    "ang-prep", "ang-prefix", "ang-proper noun", "ang-suffix",
  ]
  tn = tname(t)
  if tn in numeric_head_templates:
    # These templates put the head(s) in |1=, |head2=, |head3=, ...
    heads = blib.fetch_param_chain(t, "1", "head")
  elif tn in named_head_templates or (tn == "head" and getparam(t, "1") == "ang"):
    # These (and generic {{head|ang|...}}) use |head=, |head2=, ...
    heads = blib.fetch_param_chain(t, "head", "head")
  else:
    return None
  return heads or [pagetitle]
def process_verb_headt(t):
  """Canonicalize a {{be-verb}} headword template to the new param convention:
  |1=head, |tr=, |2=aspect, plus pf/impf chains.

  NOTE(review): this is written as a nested helper — it reads `pagemsg`,
  `pagetitle`, `notes`, `pname` and `belib` from an enclosing scope not
  visible here.

  Returns True if the template was processed, False if it was skipped
  because of an unrecognized parameter (the template is left untouched
  in that case).
  """
  origt = unicode(t)
  def getp(param):
    return getparam(t, param)
  tr = getp("tr")
  # Old convention: head in |1=, aspect in |2=. Older still: head in |head=,
  # aspect in |1= or |a=. Presence of |2= disambiguates.
  if getp("2"):
    head = getp("1")
    g = getp("2")
  else:
    head = getp("head")
    g = getp("1") or getp("a")
  pf = blib.fetch_param_chain(t, "pf", "pf")
  impf = blib.fetch_param_chain(t, "impf", "impf")
  # Bail out (without modifying) if any parameter is not one we know how
  # to carry over, since everything else below erases all params.
  must_continue = False
  for param in t.params:
    pn = pname(param)
    if pn not in ["head", "tr", "1", "a", "2", "pf", "pf2", "pf3", "impf", "impf2", "impf3"]:
      pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
      must_continue = True
      break
  if must_continue:
    return False
  # Wipe all params and re-add them in canonical order.
  del t.params[:]
  if not head:
    head = pagetitle
  if belib.needs_accents(head):
    pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
  if not g:
    pagemsg("WARNING: No aspect in verb headword: %s" % origt)
    g = "?"
  t.add("1", head)
  if tr:
    t.add("tr", tr)
  t.add("2", g)
  blib.set_param_chain(t, pf, "pf", "pf")
  blib.set_param_chain(t, impf, "impf", "impf")
  if origt != unicode(t):
    notes.append("fix up {{be-verb}} to use new param convention")
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return True
def process_page(index, page, save, verbose):
  """Scan a page's Russian headword and declension templates and report every
  form that carries a secondary stress mark. Read-only: nothing is saved."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  parsed = blib.parse(page)
  found_page_head = False
  def report(form, tmpl):
    pagemsg("Found secondarily stressed head %s in %s" % (form, unicode(tmpl)))
  for tmpl in parsed.filter_templates():
    found_this_head = False
    tn = tname(tmpl)
    if tn in ru_normal_head_templates:
      # Language-specific headword templates: heads in |1=, |head2=, ...
      for hw in blib.fetch_param_chain(tmpl, "1", "head"):
        if has_secondary_stress(hw):
          report(hw, tmpl)
    elif tn == "head" and getparam(tmpl, "1") == "ru":
      # Generic {{head|ru|...}}: heads in |head=, |head2=, ...
      for hw in blib.fetch_param_chain(tmpl, "head", "head"):
        if has_secondary_stress(hw):
          report(hw, tmpl)
    elif tn in ["ru-noun+", "ru-proper noun+", "ru-noun-table", "ru-noun-old"]:
      # Declension templates: check the stem (second slot) of every arg set
      # of every word.
      for word_specs in runounlib.split_noun_decl_arg_sets(tmpl, pagemsg):
        for arg_set in word_specs:
          if has_secondary_stress(arg_set[1]):
            report(arg_set[1], tmpl)
    elif tn == "ru-decl-adj":
      lemma = getparam(tmpl, "1")
      if has_secondary_stress(lemma):
        report(lemma, tmpl)
def process_page(index, page, save, verbose, nouns):
  """For an adjective page in -ый/-ий whose corresponding -ость noun exists
  (in NOUNS), emit a directive line describing the noun's etymology, unless
  the noun's Russian section already has one.

  Output goes through msg() in the form "NOUN ADJ[+tr1=TR]+-ость no-etym";
  nothing on the page itself is modified.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  # Only adjectives ending in -ый or -ий can form an -ость noun this way.
  if not re.search(u"[иы]й$", pagetitle):
    pagemsg(u"Skipping adjective not in -ый or -ий")
    return
  noun = re.sub(u"[иы]й$", u"ость", pagetitle)
  if noun not in nouns:
    return
  text = unicode(page.text)
  parsed = blib.parse(page)
  # First pass: bail out entirely on alt-ё adjectives.
  # NOTE(review): the local name `tname` shadows the module-level tname()
  # helper for the rest of this function.
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == u"ru-adj-alt-ё":
      pagemsg(u"Skipping alt-ё adjective")
      return
  # Second pass: process each {{ru-adj}} headword.
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-adj":
      heads = blib.fetch_param_chain(t, "1", "head", pagetitle)
      if len(heads) > 1:
        pagemsg("Skipping adjective with multiple heads: %s" % ",".join(heads))
        return
      tr = getparam(t, "tr")
      nounsection = blib.find_lang_section(noun, "Russian", pagemsg, errandpagemsg)
      if not nounsection:
        pagemsg("Couldn't find Russian section for %s" % noun)
        continue
      if "==Etymology" in nounsection:
        pagemsg("Noun %s already has etymology" % noun)
        continue
      if tr:
        msg(u"%s %s+tr1=%s+-ость no-etym" % (noun, heads[0], tr))
      else:
        msg(u"%s %s+-ость no-etym" % (noun, heads[0]))
def process_page(index, page, save, verbose):
  """Verify that every page referenced by an {{R:vep:UVVV}} template on this
  page actually exists, logging each missing target."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  for template in blib.parse(page).filter_templates():
    if unicode(template.name) != "R:vep:UVVV":
      continue
    for target in blib.fetch_param_chain(template, "1", ""):
      if not pywikibot.Page(site, target).exists():
        pagemsg("Page [[%s]] does not exist" % target)
def process_page(page, index):
  """Check the existence of all pages referenced via {{R:vep:UVVV}} templates
  on PAGE, logging a message for each missing reference target."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  parsed = blib.parse(page)
  ref_templates = [
    t for t in parsed.filter_templates() if unicode(t.name) == "R:vep:UVVV"
  ]
  for ref_template in ref_templates:
    for linked_page in blib.fetch_param_chain(ref_template, "1", ""):
      if not pywikibot.Page(site, linked_page).exists():
        pagemsg("Page [[%s]] does not exist" % linked_page)
def find_noun_lemmas(parsed, pagetitle, errandpagemsg, expand_text):
  """Collect the distinct noun lemmas from all Russian noun headword templates
  in PARSED.

  For new-style templates ({{ru-noun+}}/{{ru-proper noun+}}) the lemma is
  generated via rulib and may be comma-separated; for old-style templates
  it comes straight from the head params. Returns the list of lemmas, or
  None (implicitly) if lemma generation failed.
  """
  lemmas = []
  for tmpl in parsed.filter_templates():
    tn = tname(tmpl)
    if tn in ["ru-noun+", "ru-proper noun+"]:
      generated = rulib.fetch_noun_lemma(tmpl, expand_text)
      if generated is None:
        errandpagemsg("WARNING: Error generating noun forms: %s" % unicode(tmpl))
        return
      for lemma in generated.split(","):
        add_if_not(lemmas, lemma)
    elif tn in ["ru-noun", "ru-proper noun"]:
      for lemma in blib.fetch_param_chain(tmpl, "1", "head", pagetitle):
        add_if_not(lemmas, lemma)
  return lemmas
def process_page(index, page, save, verbose): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) == "ru-adj": comps = blib.fetch_param_chain(t, "2", "comp") newcomps = [] for comp in comps: if re.search(u"е́?й$", comp): regcomp = re.sub(u"(е́?)й$", ur"\1е", comp) if regcomp in newcomps: pagemsg("Skipping informal form %s" % comp) notes.append("remove informal comparative %s" % comp) else: pagemsg("WARNING: Found informal form %s without corresponding regular form") newcomps.append(comp) else: newcomps.append(comp) if comps != newcomps: blib.set_param_chain(t, newcomps, "2", "comp") newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) new_text = unicode(parsed) if new_text != text: if verbose: pagemsg("Replacing <%s> with <%s>" % (text, new_text)) assert notes comment = "; ".join(notes) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment)
def extract_headword_anim_spec(headword_template):
  """Derive the animacy spec from a headword template's gender params.

  Returns "in" (inanimate only), "an" (animate only), "ia"/"ai" (both,
  ordered by which appears first in the gender list), or None when no
  animacy is marked.
  """
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  first_inan = -1
  first_anim = -1
  for idx, gender in enumerate(genders):
    # Record only the first position at which each animacy marker appears.
    if first_inan < 0 and re.search(r"\bin\b", gender):
      first_inan = idx
    if first_anim < 0 and re.search(r"\ban\b", gender):
      first_anim = idx
  if first_inan >= 0:
    if first_anim >= 0:
      return "ia" if first_inan < first_anim else "ai"
    return "in"
  if first_anim >= 0:
    return "an"
  return None
def extract_headword_anim_spec(headword_template):
  """Derive the combined animacy spec ("in", "an", "ia", "ai", or None) from
  the gender parameter chain of a headword template."""
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  def first_index(marker_re):
    # Index of the first gender containing the marker, or -1 if none does.
    for pos, gender in enumerate(genders):
      if re.search(marker_re, gender):
        return pos
    return -1
  saw_in = first_index(r"\bin\b")
  saw_an = first_index(r"\ban\b")
  both = saw_in >= 0 and saw_an >= 0
  if both and saw_in < saw_an:
    return "ia"
  if both:
    return "ai"
  if saw_an >= 0:
    return "an"
  if saw_in >= 0:
    return "in"
  return None
def find_adj(pagename, pagemsg, errandpagemsg, expand_text):
  """Locate the unique {{ru-adj}} lemma in PAGENAME's Russian section.

  Returns the lemma string; None when the section is missing, a template
  has multiple heads, or no lemma is found; -1 when the section already
  contains an Etymology header. When several templates yield distinct
  lemmas, warns but still returns the first one.
  """
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  if "==Etymology" in section:
    return -1
  lemmas = []
  for tmpl in blib.parse_text(section).filter_templates():
    if unicode(tmpl.name) != "ru-adj":
      continue
    heads = blib.fetch_param_chain(tmpl, "1", "head", pagename)
    if len(heads) > 1:
      pagemsg("WARNING: Multiple lemmas for adjective: %s" % ",".join(heads))
      return None
    if heads[0] not in lemmas:
      lemmas.append(heads[0])
  if len(lemmas) > 1:
    pagemsg("WARNING: Multiple lemmas for adjective: %s" % ",".join(lemmas))
  if not lemmas:
    return None
  return lemmas[0]
def process_page(page, index, parsed): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) == "ru-adj": comps = blib.fetch_param_chain(t, "2", "comp") newcomps = [] for comp in comps: if re.search(u"е́?й$", comp): regcomp = re.sub(u"(е́?)й$", ur"\1е", comp) if regcomp in newcomps: pagemsg("Skipping informal form %s" % comp) notes.append("remove informal comparative %s" % comp) else: pagemsg( "WARNING: Found informal form %s without corresponding regular form" ) newcomps.append(comp) else: newcomps.append(comp) if comps != newcomps: blib.set_param_chain(t, newcomps, "2", "comp") newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) return unicode(parsed), notes
def check_old_noun_headword_forms(headword_template, args, subpagetitle,
    pagemsg, laxer_comparison=False):
  """Check that the forms in an old-style {{ru-noun}}/{{ru-proper noun}}
  headword template agree with the forms generated by a declension template.

  HEADWORD_TEMPLATE: the parsed headword template.
  ARGS: dict of generated declension forms keyed by slot ("nom_sg",
    "gen_pl", ..., plus "*_raw" variants), with "n" holding the number
    spec ("s"/"p"/"b") and "g" the comma-separated proposed genders.
  SUBPAGETITLE: default headword when the template has none.
  LAXER_COMPARISON: when set, manual translit is always stripped from the
    generated forms before comparing.

  Returns the list of existing genders to keep ([] when the proposed
  genders match or can safely replace them), or None to signal a mismatch
  and that the page should be skipped.
  """
  # FORM1 is the forms from ru-noun (or ru-proper noun); FORM2 is the combined
  # set of forms from ru-noun-table, and needs to be split on commas.
  # FORM1_LEMMA is true if the FORM1 values come from the ru-noun lemma.
  def compare_forms(case, form1, form2, form1_lemma=False):
    # Split on individual words and allow monosyllabic accent differences.
    # FIXME: Will still have problems with [[X|Y]].
    def compare_single_form(f1, f2):
      words1 = re.split("[ -]", f1)
      words2 = re.split("[ -]", f2)
      if len(words1) != len(words2):
        return None
      for i in xrange(len(words1)):
        if words1[i] != words2[i]:
          w1 = fixup_link(words1[i])
          w2 = words2[i]
          # Allow case where existing is monosyllabic and missing a stress
          # compared with proposed
          w1 = {w1, try_to_stress(w1)}
          # Allow case where existing is missing a link as compared to
          # proposed (but not other way around; we don't want a link
          # disappearing)
          w2 = {w2, blib.remove_links(w2)}
          # The word matches if the two alternative sets intersect.
          if not (w1 & w2):
            return None
      return True
    # Normalize stressed ё and link syntax in the existing forms.
    form1 = [fixup_link(re.sub(u"ё́", u"ё", x)) for x in form1]
    form2 = re.split(",", form2)
    if laxer_comparison or not form1_lemma:
      # Ignore manual translit in decl forms when comparing non-lemma forms;
      # not available from ru-noun (and not displayed anyway); also when
      # laxer_comparison is set, which happens in add_noun_decl
      form2 = [re.sub("//.*$", "", x) for x in form2]
    # If existing value missing, OK; also allow for unstressed monosyllabic
    # existing form matching stressed monosyllabic new form
    if form1:
      if (set(form1) == set(form2) or
          set(try_to_stress(x) for x in form1) == set(form2) or
          len(form1) == 1 and len(form2) == 1 and
          compare_single_form(form1[0], form2[0])):
        pass
      else:
        pagemsg("WARNING: case %s, existing forms %s not same as proposed %s" % (
          case, ",".join(form1), ",".join(form2)))
        return None
    return True

  def compare_genders(g1, g2):
    # Genders compare equal as unordered sets.
    if set(g1) == set(g2):
      return True
    if len(g1) == 1 and len(g2) == 1:
      # If genders don't match exactly, check if existing gender is missing
      # animacy and allow that, so it gets overwritten with new gender
      if g1[0] == re.sub("-(an|in)", "", g2[0]):
        pagemsg("Existing gender %s missing animacy spec compared with proposed %s, allowed" % (
          ",".join(g1), ",".join(g2)))
        return True
    return None

  headwords = blib.fetch_param_chain(headword_template, "1", "head", subpagetitle)
  translits = blib.fetch_param_chain(headword_template, "tr", "tr")
  # Attach each manual translit to its headword as "HEAD//TR"; every
  # translit must have a corresponding headword.
  for i in xrange(len(translits)):
    if len(headwords) <= i:
      pagemsg("WARNING: Not enough headwords for translit tr%s=%s, skipping" % (
        "" if i == 0 else str(i+1), translits[i]))
      return None
    else:
      headwords[i] += "//" + translits[i]
  genitives = blib.fetch_param_chain(headword_template, "3", "gen")
  plurals = blib.fetch_param_chain(headword_template, "4", "pl")
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  cases_to_check = None
  # Which slots to compare depends on the number spec: singular-only,
  # plural-only, or both.
  if args["n"] == "s":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg"]
  elif args["n"] == "p":
    if (not compare_forms("nom_pl", headwords, args["nom_pl_linked"], True) or
        not compare_forms("gen_pl", genitives, args["gen_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_pl", "gen_pl"]
  elif args["n"] == "b":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"]) or
        not compare_forms("nom_pl", plurals, args["nom_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg", "nom_pl"]
  else:
    pagemsg("WARNING: Unrecognized number spec %s, skipping" % args["n"])
    return None
  # Warn (but don't fail) when the raw form carries a footnote symbol.
  for case in cases_to_check:
    raw_case = re.sub(u"△", "", blib.remove_links(args[case + "_raw"]))
    if args[case] != raw_case:
      pagemsg("WARNING: Raw case %s=%s contains footnote symbol" % (
        case, args[case + "_raw"]))
  proposed_genders = re.split(",", args["g"])
  if compare_genders(genders, proposed_genders):
    # Compatible: the existing genders can simply be replaced.
    genders = []
  else:
    # Check for animacy mismatch, punt if so
    cur_in = [x for x in genders if re.search(r"\bin\b", x)]
    cur_an = [x for x in genders if re.search(r"\ban\b", x)]
    proposed_in = [x for x in proposed_genders if re.search(r"\bin\b", x)]
    proposed_an = [x for x in proposed_genders if re.search(r"\ban\b", x)]
    if (cur_in or not cur_an) and proposed_an or (cur_an or not cur_in) and proposed_in:
      pagemsg("WARNING: Animacy mismatch, skipping: cur=%s proposed=%s" % (
        ",".join(genders), ",".join(proposed_genders)))
      return None
    # Check for number mismatch, punt if so
    cur_pl = [x for x in genders if re.search(r"\bp\b", x)]
    if cur_pl and args["n"] != "p" or not cur_pl and args["n"] == "p":
      pagemsg("WARNING: Number mismatch, skipping: cur=%s, proposed=%s, n=%s" % (
        ",".join(genders), ",".join(proposed_genders), args["n"]))
      return None
    pagemsg("WARNING: Gender mismatch, existing=%s, new=%s" % (
      ",".join(genders), ",".join(proposed_genders)))
  return genders
def process_page(page, index, parsed):
  """Convert old-style {{la-verb}} headword templates to the new style by
  copying the parameters of the matching {{la-conj}} template in the same
  subsection, after extensively verifying that the two templates agree
  (lemma, perfect, supine, conjugation class, and subtype pattern).

  Returns (new_page_text, notes), or (None, None) if no Latin section was
  found.

  Bug fix: the "unrecognized conj_subtype" warning used one %s placeholder
  with two arguments, which raised TypeError whenever that path was hit;
  the format string now matches the parallel conj_type message.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  notes = []
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  # Split into L3 subsections; odd indices are the headers, even the bodies.
  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)
  saw_a_template = False
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    la_verb_template = None
    la_conj_template = None
    must_continue = False
    # Find exactly one headword template and one conjugation template.
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-conj":
        if la_conj_template:
          pagemsg(
            "WARNING: Saw multiple verb conjugation templates in subsection, %s and %s, skipping"
            % (unicode(la_conj_template), unicode(t)))
          must_continue = True
          break
        la_conj_template = t
        saw_a_template = True
      if tn == "la-verb":
        if la_verb_template:
          pagemsg(
            "WARNING: Saw multiple verb headword templates in subsection, %s and %s, skipping"
            % (unicode(la_verb_template), unicode(t)))
          must_continue = True
          break
        la_verb_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_verb_template and not la_conj_template:
      continue
    if la_verb_template and not la_conj_template:
      pagemsg(
        "WARNING: Saw verb headword template but no conjugation template: %s"
        % unicode(la_verb_template))
      continue
    if la_conj_template and not la_verb_template:
      pagemsg(
        "WARNING: Saw verb conjugation template but no headword template: %s"
        % unicode(la_conj_template))
      continue
    orig_la_verb_template = unicode(la_verb_template)
    # New-style headwords have |1= like "1", "3+", "irreg", possibly with
    # dotted subtypes; skip those.
    if re.search(r"^(irreg|[0-9]\+*)(\..*)?$", getparam(la_verb_template, "1")):
      pagemsg("Found new-style verb headword template, skipping: %s"
        % orig_la_verb_template)
      continue
    def render_headword_and_conj():
      # Diagnostic string for warnings; shows both templates.
      return "headword template <from> %s <to> %s <end>, conjugation template <from> %s <to> %s <end>" % (
        orig_la_verb_template, orig_la_verb_template,
        unicode(la_conj_template), unicode(la_conj_template))
    verb_props = new_generate_verb_forms(unicode(la_conj_template),
      errandpagemsg, expand_text, include_props=True)
    if verb_props is None:
      continue
    subtypes = [
      x.replace("-", "") for x in safe_split(verb_props["subtypes"], ".")
    ]
    conj_type = verb_props["conj_type"]
    conj_subtype = verb_props.get("conj_subtype", None)
    def compare_headword_conj_forms(id_slot, headword_forms, conj_slots,
        adjust_for_missing_perf_forms=False, remove_conj_links=False):
      # Compare one headword slot against the first available generated slot,
      # after lengthening -ns/-nf vowels; returns False (and warns) on
      # mismatch.
      conj_forms = ""
      for slot in conj_slots:
        if slot in verb_props:
          conj_forms = verb_props[slot]
          break
      conj_forms = safe_split(conj_forms, ",")
      if remove_conj_links:
        conj_forms = [blib.remove_links(x) for x in conj_forms]
      corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
      corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
      if adjust_for_missing_perf_forms:
        # There are several instances of 4++ verbs where only the -īvī variant,
        # not the -iī variant, is listed in the headword. Don't get tripped up
        # by that.
        ivi_conj_forms = [
          x for x in corrected_conj_forms if x.endswith(u"īvī")
        ]
        for ivi_conj_form in ivi_conj_forms:
          ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
          if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
            corrected_headword_forms.append(ii_conj_form)
      if set(corrected_headword_forms) != set(corrected_conj_forms):
        macronless_headword_forms = set(
          lalib.remove_macrons(x) for x in corrected_headword_forms)
        macronless_conj_forms = set(
          lalib.remove_macrons(x) for x in corrected_conj_forms)
        if macronless_headword_forms == macronless_conj_forms:
          pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
        else:
          pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
        return False
      return True
    verb_conj = getparam(la_verb_template, "conj") or getparam(
      la_verb_template, "c")
    pattern = getparam(la_verb_template, "pattern")
    lemma = blib.fetch_param_chain(la_verb_template, ["1", "head", "head1"], "head")
    # NOTE(review): `inf` is fetched but never used below; preserved as-is.
    inf = blib.fetch_param_chain(la_verb_template, ["2", "inf", "inf1"], "inf")
    perf = blib.fetch_param_chain(la_verb_template, ["3", "perf", "perf1"], "perf")
    sup = blib.fetch_param_chain(la_verb_template, ["4", "sup", "sup1"], "sup")
    # Hack to handle cases like abeō where the headword normally lists perfect
    # abiī but the conj lists abiī, abīvī.
    if verb_conj == "irreg" and len(lemma) > 0 and lemma[0].endswith(u"eō"):
      ivi = re.sub(u"eō$", u"īvī", lemma[0])
      if ivi not in perf:
        perf.append(ivi)
    if not compare_headword_conj_forms("lemma", lemma, [
        "1s_pres_actv_indc", "3s_pres_actv_indc", "1s_perf_actv_indc",
        "3s_perf_actv_indc"]):
      continue
    if "depon" in subtypes or "semidepon" in subtypes:
      if sup:
        pagemsg(
          "WARNING: Saw supine in conjunction with deponent verb, skipping: %s"
          % render_headword_and_conj())
        continue
      # For deponents the headword "perfect" slot holds the participle;
      # derive the supine stem from it.
      sup = [re.sub("[sm]( (sum|est))?$", "m", x) for x in perf]
    else:
      if not compare_headword_conj_forms(
          "perfect", perf, ["1s_perf_actv_indc", "3s_perf_actv_indc"],
          adjust_for_missing_perf_forms=True,
          # Remove links from perfect to handle cases like adsoleō where the
          # perfect is adsoluī,[[adsolitus]] [[sum]] and the headword says
          # adsoluī,adsolitus sum.
          remove_conj_links=True):
        continue
    if len(sup) > 0 and sup[0].endswith(u"ūrus"):
      # A "supine" in -ūrus is really a future active participle.
      if not compare_headword_conj_forms("future participle", sup,
          ["futr_actv_ptc"]):
        continue
      if "supfutractvonly" not in subtypes:
        if len(lemma) > 0 and lemma[0].endswith("sum"):
          pass
        else:
          pagemsg(
            "WARNING: Expected supfutractvonly in subtypes=%s, skipping: %s"
            % (".".join(sorted(subtypes)), render_headword_and_conj()))
          continue
    else:
      if not compare_headword_conj_forms("supine", sup, ["sup_acc"]):
        continue
    if not verb_conj:
      pagemsg("WARNING: No conj in headword template: %s"
        % render_headword_and_conj())
    else:
      conj_type_to_verb_conj = {
        "1st": "1",
        "2nd": "2",
        "3rd": "3",
        "3rd-io": "io",
        "4th": "4",
        "irreg": "irreg",
      }
      if conj_type not in conj_type_to_verb_conj:
        pagemsg(
          "WARNING: Something wrong, saw unrecognized conj_type=%s: %s"
          % (conj_type, render_headword_and_conj()))
        continue
      conj_type = conj_type_to_verb_conj[conj_type]
      if conj_subtype:
        if conj_subtype not in conj_type_to_verb_conj:
          # FIX: format string previously had one %s for two arguments,
          # raising TypeError; now mirrors the conj_type message above.
          pagemsg(
            "WARNING: Something wrong, saw unrecognized conj_subtype=%s: %s"
            % (conj_subtype, render_headword_and_conj()))
          continue
        conj_subtype = conj_type_to_verb_conj[conj_subtype]
      if verb_conj != conj_type and verb_conj != conj_subtype:
        pagemsg(
          "WARNING: Conjugation template has conj=%s, subconj=%s but headword template has conj=%s, skipping: %s"
          % (conj_type, conj_subtype, verb_conj, render_headword_and_conj()))
        continue
    # Normalize the hyphenated headword pattern names to the dotless subtype
    # names used by the conjugation module.
    pattern = pattern.replace("opt-semi-depon", "optsemidepon")
    pattern = pattern.replace("semi-depon", "semidepon")
    pattern = pattern.replace("pass-3only", "pass3only")
    pattern = pattern.replace("pass-impers", "passimpers")
    pattern = pattern.replace("no-actv-perf", "noactvperf")
    pattern = pattern.replace("no-pasv-perf", "nopasvperf")
    pattern = pattern.replace("perf-as-pres", "perfaspres")
    pattern = pattern.replace("short-imp", "shortimp")
    pattern = pattern.replace("sup-futr-actv-only", "supfutractvonly")
    pattern = safe_split(pattern, "-")
    # Drop subtypes that are implicit/derivable on both sides before comparing.
    pattern = [
      x for x in pattern
      if x not in ["noperf", "nosup", "irreg", "def", "facio", "shortimp", "depon"]
    ]
    subtypes = [
      x for x in subtypes
      if x not in ["I", "noperf", "nosup", "irreg", "depon"]
    ]
    if len(lemma) > 0 and lemma[0].endswith("sum"):
      # This is added automatically by [[sum]]
      subtypes = [x for x in subtypes if x != "supfutractvonly"]
    if set(pattern) != set(subtypes):
      if set(subtypes) >= set(pattern) and (
          set(subtypes) - set(pattern) <= {
            "nopass", "p3inf", "poetsyncperf", "optsyncperf", "alwayssyncperf"
          }):
        pagemsg(
          "Subtypes=%s of conjugation template have extra, ignorable subtypes %s compared with pattern=%s of headword template: %s"
          % (".".join(sorted(subtypes)),
             ".".join(sorted(list(set(subtypes) - set(pattern)))),
             ".".join(sorted(pattern)), render_headword_and_conj()))
      else:
        pagemsg(
          "WARNING: Conjugation template has subtypes=%s but headword template has pattern=%s, skipping: %s"
          % (".".join(sorted(subtypes)), ".".join(sorted(pattern)),
             render_headword_and_conj()))
        continue
    # Fetch remaining params from headword template
    headword_params = []
    for param in la_verb_template.params:
      pname = unicode(param.name)
      if pname.strip() in [
          "1", "2", "3", "4", "44", "conj", "c", "pattern"
      ] or re.search("^(head|inf|perf|sup)[0-9]*$", pname.strip()):
        continue
      headword_params.append((pname, param.value, param.showkey))
    # Erase all params
    del la_verb_template.params[:]
    # Copy params from conj template
    for param in la_conj_template.params:
      pname = unicode(param.name)
      la_verb_template.add(pname, param.value, showkey=param.showkey,
        preserve_spacing=False)
    # Copy remaining params from headword template
    for name, value, showkey in headword_params:
      la_verb_template.add(name, value, showkey=showkey,
        preserve_spacing=False)
    pagemsg("Replaced %s with %s" % (orig_la_verb_template,
      unicode(la_verb_template)))
    notes.append("convert {{la-verb}} params to new style")
    subsections[k] = unicode(parsed)
  if not saw_a_template:
    pagemsg("WARNING: Saw no verb headword or conjugation templates")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
  """Convert raw {{IPA|it}} templates in Italian ==Pronunciation== sections into
  {{it-pr}} respelling templates, and log (or, when manually assisted, insert)
  respellings derived from {{rhymes|it}} templates.

  index: page index, used only in log messages.
  pagetitle: page name; generated respellings are validated against it.
  text: full page wikitext.

  Returns (newtext, notes) where notes is a list of changelog messages, or
  None (bare return) when there is no modifiable Italian section.

  Relies on module-level helpers/tables: ipa_to_respelling, hack_respelling,
  rhyme_to_spelling, vowel_respelling_to_spelling, ipa_directives,
  rhyme_directives and the global `args`.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  # Locate the Italian section (or treat the whole text as Italian when
  # --partial-page is given).
  retval = blib.find_modifiable_lang_section(text,
    None if args.partial_page else "Italian", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  # Split into alternating chunks; odd indices are subsection headers, even
  # indices >= 2 the corresponding subsection bodies.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  need_ref_section = False
  for k in xrange(2, len(subsections), 2):
    if "==Pronunciation==" in subsections[k - 1]:
      parsed = blib.parse_text(subsections[k])
      # Gather every pronunciation template up front so warnings can show the
      # other templates on the page for context.
      all_pronun_templates = []
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
          all_pronun_templates.append(t)
      saw_it_pr = False
      pronun_based_respellings = []
      for t in parsed.filter_templates():
        origt = unicode(t)
        def tmsg(txt):
          # Like pagemsg() but appends the other pronunciation templates; the
          # loop variable `t` is deliberately shadowed here.
          other_templates = []
          for t in all_pronun_templates:
            thist = unicode(t)
            if thist != origt:
              other_templates.append(thist)
          pagemsg("%s: %s%s" % (txt, origt,
            ", other templates %s" % ", ".join(other_templates)
            if len(other_templates) > 0 else ""))
        tn = tname(t)
        if tn == "it-pr":
          # Page already converted; record its respellings and stop scanning.
          saw_it_pr = True
          respellings = blib.fetch_param_chain(t, "1") # FIXME, need to split on comma
          pronun_based_respellings.extend(respellings)
          break
        if tn == "IPA" and getparam(t, "1") == "it":
          saw_it_pr = True
          pronuns = blib.fetch_param_chain(t, "2")
          # Pronunciations are expected as phonemic (/…/) followed by the
          # matching phonetic ([…]); track the current pair here.
          this_phonemic_pronun = None
          this_phonemic_respelling = None
          this_phonetic_pronun = None
          this_phonetic_respelling = None
          respellings = []
          all_warnings = []
          hack_respelling_warnings = []
          main_warnings = []
          # One-element list so inner defs can set it (Python 2 has no
          # `nonlocal`).
          unable = [False]
          for pronun in pronuns:
            respelling = ipa_to_respelling(pronun)
            respelling, this_hack_respelling_warnings = hack_respelling(
              pagetitle, respelling)
            hack_respelling_warnings.extend(this_hack_respelling_warnings)
            def set_unable(msg):
              # NOTE: the parameter shadows the global msg() function.
              main_warnings.append(msg)
              unable[0] = True
            tmsg("For pronun %s, generated respelling %s" % (pronun, respelling))
            # Every non-prefix word of the respelling must carry a stress mark
            # if it has more than one vowel.
            respelling_words = respelling.split(" ")
            for rw in respelling_words:
              if rw.endswith("-"): # prefix
                continue
              hacked_rw = re.sub(u".[\u0323\u0331]", "e", rw) # pretend vowels with secondary or no stress are 'e'
              if not re.search(u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw) and len(
                  re.sub("[^aeiouAEIOU]", "", hacked_rw)) > 1:
                set_unable(
                  "WARNING: For respelling %s for pronun %s, word %s is missing stress"
                  % (respelling, pronun, rw))
            if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$", respelling):
              set_unable("WARNING: Strange char in respelling %s for pronun %s"
                % (respelling, pronun))
            else:
              # Strip stress accents (except word-finally) to recover the
              # plain spelling the respelling implies, and compare it to the
              # actual page title.
              putative_pagetitle = re.sub(u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])",
                lambda m: vowel_respelling_to_spelling[m.group(1)] + m.group(2),
                respelling)
              pagetitle_words = pagetitle.split(" ")
              putative_pagetitle_words = putative_pagetitle.split(" ")
              if len(pagetitle_words) != len(putative_pagetitle_words):
                set_unable(
                  "WARNING: Page title has %s words but putative page title %s has %s words"
                  % (len(pagetitle_words), putative_pagetitle,
                    len(putative_pagetitle_words)))
              else:
                # Respellings write z as (t)ts/(d)dz; splice the original z's
                # back in so the comparison with the page title can succeed.
                hacked_putative_pagetitle_words = []
                for ptw, puptw in zip(pagetitle_words, putative_pagetitle_words):
                  split_ptw = re.split("([Zz]+)", ptw)
                  split_puptw = re.split("([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                  if len(split_ptw) != len(split_puptw):
                    set_unable(
                      "WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s"
                      % (ptw, puptw))
                    hacked_putative_pagetitle_words.append(puptw)
                  else:
                    parts = []
                    for i in xrange(len(split_puptw)):
                      if i % 2 == 0:
                        parts.append(split_puptw[i])
                      else:
                        parts.append(split_ptw[i])
                    hacked_putative_pagetitle_words.append("".join(parts))
                putative_pagetitle = " ".join(hacked_putative_pagetitle_words)
                if putative_pagetitle != pagetitle:
                  # If respelling already seen, we already warned about it.
                  if respelling in respellings:
                    assert unable[0]
                  else:
                    set_unable(
                      "WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)"
                      % (respelling, putative_pagetitle, pronun))
            def append_respelling(respelling):
              if respelling not in respellings:
                respellings.append(respelling)
            def append_warnings(warning):
              # Append `warning` (if any) plus all pending hack/main warnings,
              # then clear the pending lists; the loops intentionally reuse
              # the name `warning`.
              if warning:
                all_warnings.append(warning)
              for warning in hack_respelling_warnings:
                all_warnings.append(warning)
              del hack_respelling_warnings[:]
              for warning in main_warnings:
                all_warnings.append(warning)
              del main_warnings[:]
            append_respelling(respelling)
            if pronun.startswith("/"):
              # Phonemic pronunciation: should start a new phonemic/phonetic
              # pair.
              if this_phonemic_pronun is not None:
                append_warnings(
                  "WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun"
                  % (this_phonemic_pronun, this_phonemic_respelling, pronun,
                    respelling))
              this_phonemic_pronun = pronun
              this_phonemic_respelling = respelling
              this_phonetic_pronun = None
              this_phonetic_respelling = None
            elif pronun.startswith("["):
              # Phonetic pronunciation: must pair with the preceding phonemic
              # one and yield the same respelling.
              if this_phonemic_pronun is None:
                if this_phonetic_pronun is not None:
                  unable[0] = True
                  append_warnings(
                    "WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun"
                    % (this_phonetic_pronun, this_phonetic_respelling, pronun,
                      respelling))
                else:
                  append_warnings(
                    "WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun"
                    % (pronun, respelling))
                this_phonetic_pronun = pronun
                this_phonetic_respelling = respelling
              elif this_phonemic_respelling != respelling:
                unable[0] = True
                append_warnings(
                  "WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)"
                  % (this_phonemic_respelling, this_phonemic_pronun, respelling,
                    pronun))
              else:
                if unable[0] and len(main_warnings) > 0:
                  # `unable` could be set from a previous pronunciation but no main warnings this time around
                  # because the previously generated warnings have already been appended to all_warnings.
                  mesg = main_warnings[0]
                  del main_warnings[0]
                  append_warnings(mesg)
                else:
                  append_warnings(None)
                # Pair completed successfully; reset the phonemic tracker.
                this_phonemic_pronun = None
                this_phonemic_respelling = None
            else:
              unable[0] = True
              append_warnings(
                "WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic"
                % (pronun, respelling))
          if this_phonemic_pronun is not None:
            append_warnings(
              "WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun"
              % (this_phonemic_pronun, this_phonemic_respelling))
          if not unable[0]:
            # Only numbered params and nocount= are convertible; anything else
            # blocks the conversion.
            for param in t.params:
              pn = pname(param)
              if not re.search("^[0-9]+$", pn) and pn != "nocount":
                unable[0] = True
                append_warnings("WARNING: Saw unrecognized param %s=%s"
                  % (pn, unicode(param.value)))
          manual_assist = ""
          if unable[0]:
            # Fall back to a manually-specified respelling if one was supplied
            # for this page; otherwise just log for later manual handling.
            if pagetitle in ipa_directives:
              respellings = ipa_directives[pagetitle]
              unable[0] = False
              manual_assist = " (manually assisted)"
              tmsg(
                "%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s"
                % ("[MULTIPLE PRONUN TEMPLATES] "
                  if len(all_pronun_templates) > 1 else "",
                  "s" if len(respellings) > 1 else "", ",".join(respellings),
                  " ||| ".join(all_warnings)))
            else:
              tmsg("%s<respelling> %s <end> %s"
                % ("[MULTIPLE PRONUN TEMPLATES] "
                  if len(all_pronun_templates) > 1 else "",
                  " ".join(respellings), " ||| ".join(all_warnings)))
          if not unable[0]:
            # Rewrite {{IPA|it|...}} in place as {{it-pr|...}}.
            del t.params[:]
            nextparam = 0
            for param in respellings:
              if "=" in param:
                paramname, paramval = param.split("=", 1)
              else:
                nextparam += 1
                paramname = str(nextparam)
                paramval = param
              if re.search("^n[0-9]*$", paramname):
                # n=/n1=... reference footnotes, which need a References
                # section with <references />.
                need_ref_section = True
              t.add(paramname, paramval)
            blib.set_template_name(t, "it-pr")
            notes.append("replace raw {{IPA|it}} with {{it-pr|%s}}%s"
              % ("|".join(respellings), manual_assist))
            pronun_based_respellings.extend(respellings)
        if unicode(t) != origt:
          pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      subsections[k] = unicode(parsed)
      # Now look at {{rhymes|it}} templates and derive respellings from the
      # rhymes as a cross-check (or as the sole source when no pronunciation
      # template was usable).
      rhymes_template = None
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
          if rhymes_template:
            pagemsg("WARNING: Saw two {{rhymes|it}} templates: %s and %s"
              % (unicode(rhymes_template), unicode(t)))
          rhymes_template = t
      if rhymes_template:
        rhyme_based_respellings = []
        all_warnings = []
        def append_respelling(respelling):
          if respelling not in rhyme_based_respellings:
            rhyme_based_respellings.append(respelling)
        def append_warnings(warning):
          all_warnings.append(warning)
        rhymes = blib.fetch_param_chain(rhymes_template, "2")
        unable = False
        for rhy in rhymes:
          spellings = rhyme_to_spelling(rhy)
          matched = False
          bad_rhyme_msgs = []
          for ending, ending_respelling in spellings:
            if pagetitle.endswith(ending):
              prevpart = pagetitle[:-len(ending)]
              respelling = prevpart + ending_respelling
              saw_oso_ese = False
              # -oso/-ese have both /s/ and /z/ variants; emit both, the
              # second one '#'-commented out.
              if ending_respelling == u"óso":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"ó[s]o")
              elif ending_respelling == u"ése":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"é[s]e")
              else:
                if respelling.endswith(u"zióne"):
                  new_respelling = re.sub(u"zióne$", u"tsióne", respelling)
                  pagemsg("Replaced respelling '%s' with '%s'"
                    % (respelling, new_respelling))
                  respelling = new_respelling
                  prevpart = respelling[:-len(ending)] + ending_respelling
                append_respelling(respelling)
              # Bail out on ambiguities the rhyme can't resolve: intervocalic
              # s/z, z anywhere before the rhyme, possible hiatus, or a
              # falling diphthong in -i.
              if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)",
                  prevpart.lower()) or not saw_oso_ese and re.search(
                  u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]",
                  ending_respelling.lower())):
                append_warnings(
                  "WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s"
                  % rhy)
                unable = True
                break
              if "z" in prevpart:
                append_warnings(
                  "WARNING: Unable to add pronunciation due to z in part before rhyme: %s"
                  % rhy)
                unable = True
                break
              hacked_prevpart = re.sub("([gq])u", r"\1w", prevpart)
              hacked_prevpart = hacked_prevpart.replace("gli", "gl")
              hacked_prevpart = re.sub("([cg])i", r"\1", hacked_prevpart)
              if re.search("[^aeiou][iu]([aeiou]|$)", hacked_prevpart.lower()):
                append_warnings(
                  "WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s"
                  % rhy)
                unable = True
                break
              if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)",
                  respelling.lower()):
                append_warnings(
                  "WARNING: Unable to add pronunciation due to falling diphthong in -i: %s"
                  % rhy)
                unable = True
                break
              matched = True
              break
            else:
              bad_rhyme_msgs.append(
                "WARNING: Unable to match rhyme %s, spelling %s, respelling %s"
                % (rhy, ending, ending_respelling))
          if not matched and not unable and bad_rhyme_msgs:
            for bad_rhyme_msg in bad_rhyme_msgs:
              pagemsg(bad_rhyme_msg)
        if rhyme_based_respellings:
          if not saw_it_pr:
            # No usable pronunciation template: insert an {{it-pr}} only when
            # a manual directive exists; otherwise just log the candidates.
            manual_assist = ""
            if pagetitle in rhyme_directives:
              rhyme_based_respellings = rhyme_directives[pagetitle]
              manual_assist = " (manually assisted)"
              pagemsg(
                "Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s"
                % ("s" if len(rhyme_based_respellings) > 1 else "",
                  ",".join(rhyme_based_respellings),
                  " ||| ".join(all_warnings), unicode(rhymes_template)))
              subsections[k] = "* {{it-pr|%s}}\n" % ",".join(
                rhyme_based_respellings) + subsections[k]
              notes.append("add Italian rhyme-based respelling%s %s%s"
                % ("s" if len(rhyme_based_respellings) > 1 else "",
                  ",".join(rhyme_based_respellings), manual_assist))
            else:
              different_headers = []
              for pos in ["Noun", "Verb", "Adjective", "Adverb", "Participle"]:
                if "==%s==" % pos in secbody:
                  different_headers.append(pos)
              if len(different_headers) > 1:
                all_warnings[0:0] = ["WARNING: Multiple headers %s seen"
                  % ",".join(different_headers)]
              if "Etymology 1" in secbody:
                all_warnings[0:0] = ["WARNING: Multiple etymologies seen"]
              pagemsg("<respelling> all: %s <end>%s: <from> %s <to> %s <end>"
                % (" ".join(rhyme_based_respellings),
                  " " + " ||| ".join(all_warnings) if all_warnings else "",
                  unicode(rhymes_template), unicode(rhymes_template)))
          else:
            # Cross-check the rhyme-derived respellings against the ones from
            # the pronunciation template(s).
            for respelling in rhyme_based_respellings:
              if (not re.search("^qual[0-9]*=", respelling)
                  and pronun_based_respellings
                  and respelling not in pronun_based_respellings):
                pagemsg(
                  "WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s"
                  % (" (with problems)" if len(all_warnings) > 0 else "",
                    respelling, ",".join(pronun_based_respellings),
                    ": %s" % " ||| ".join(all_warnings)
                    if len(all_warnings) > 0 else ""))
  if need_ref_section:
    # Footnote refs were added; make sure a References section with
    # <references /> exists, searching bottom-up.
    for k in xrange(len(subsections) - 1, 2, -2):
      if re.search(r"^===\s*References\s*===$", subsections[k - 1].strip()):
        if not re.search(r"<references\s*/?\s*>", subsections[k]):
          subsections[k] = subsections[k].rstrip("\n") + "\n<references />\n\n"
          notes.append(
            "add <references /> to existing ===References=== section for pronunciation refs"
          )
        break
    else: # no break
      for k in xrange(len(subsections) - 1, 2, -2):
        if not re.search(r"==\s*(Anagrams|Further reading)\s*==",
            subsections[k - 1]):
          subsections[k + 1:k + 1] = ["===References===\n",
            "<references />\n\n"]
          notes.append(
            "add new ===References=== section for pronunciation refs")
          break
      else: # no break
        pagemsg(
          "WARNING: Something wrong, couldn't find location to insert ===References=== section"
        )
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head, subsection_with_declts, pagemsg):
  """Convert an old-style German noun headword template `headt` (and its
  accompanying declension template(s) `declts`) to the new-style
  {{de-noun}}/{{de-proper noun}} + {{de-ndecl}} format.

  headt: the parsed headword template ({{de-noun}} or {{de-proper noun}}).
  declts: list of parsed declension templates accompanying it (may be empty).
  pagetitle: the page name (the lemma).
  subsections: list of subsection texts; mutated in place when replacements
    are made.
  subsection_with_head / subsection_with_declts: indices into `subsections`
    where the headword resp. declension templates live.
  pagemsg: logging callback.

  Returns a list of changelog notes on success (possibly empty when the
  headword is already new-style); returns None (bare return) on any condition
  that prevents conversion.
  """
  notes = []
  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
    # Analyze (non-adjectival) declension templates and build the new-style
    # declension spec.  Returns (declspec, all_decl_genders, all_decl_gens,
    # all_decl_pls) or None on failure.  Reads `ss` from the enclosing scope
    # (set before any call to this function).
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      # Gender is the last hyphen-separated component of the template name
      # (e.g. de-decl-noun-m -> "m"); "pl" means plural-only ("p").
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        # Case-form override params can't be expressed in the new format yet.
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s"
              % (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          # 1= is the genitive ending; n-type endings mark weak nouns.
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          # Plural is either pl= in full, or pagetitle + plural-suffix param
          # (1= for feminines, 2= for masc/neut); "-" means singular-only.
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        # All declension templates on the page must agree on weak/sg.
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s"
            % (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s"
            % (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))
    # Merge genders/genitives/plurals across templates, preserving order and
    # deduplicating.
    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
        len(all_decl_gens) != len(first_decl_gens) or
        len(all_decl_pls) != len(first_decl_pls)):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s"
        % declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s"
        % declts_to_unicode(declts))
      return None
    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)
    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
      # Append the genitive or plural (per `desc`) component to declspec:
      # a bare "," when the forms match the defaults, otherwise an explicit
      # analyzed form, preferring the headword's ordering when the sets agree.
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s"
              % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts),
                headword_part_forms, all_decl_part_forms, declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing"
            % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec
    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    # Trim trailing default-markers, then tack on property suffixes.
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls
  # Only convert old-style headwords (identified by old= or any of the
  # old-style positional/gen/pl params).
  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3", "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" % (unicode(headt),
      declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes
  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    # ss=1 marks pre-1996 -ß spellings whose modern spelling has -ss.
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s"
        % declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
    if ss:
      if not pagetitle.endswith(u"ß"):
        pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s"
          % declts_to_unicode(declts))
        return
      # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we add .ss to the
      # headword and declension specs.
      pagetitle = re.sub(u"ß$", "ss", pagetitle)
  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    # NOTE(review): 'pn not in "head"' is a substring test (e.g. pn "ea"
    # would pass); this looks like it was meant to be pn != "head" — confirm.
    if pn not in ["1", "2", "3", "4", "m", "f", "old"] and not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and (
        not adjectival or pn not in "head"):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    # Feminines: a genitive identical to the lemma is redundant; any other
    # explicit genitive is unexpected.
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return
  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s"
        % declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      # is_sg is the string "-sg" (truthy) or None.
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        # Default opposite-gender equivalent for f=/m= handling below.
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        # Pure adjectival noun (substantivized adjective).
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        # Adjective-noun combination: analyze the noun part through a
        # synthesized plain declension template.
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (gender,
          "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full,
          "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]
        def analyze_headword_parts_for_noun(parts, desc):
          # Extract the noun word from each two-word "adj noun" headword form.
          noun_headword_parts = []
          for part in parts:
            m = re.search("^([^ ]+) ([^ ]+)$", part.strip())
            if not m:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s"
                % (desc, part, unicode(headt), unicode(declt)))
              return []
            part_adj, part_noun = m.groups()
            noun_headword_parts.append(part_noun)
          return noun_headword_parts
        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")
        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen) for expected_gen in expected_gens
          for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      # Sanity-check the headword against what the declension implies.
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s"
          % (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        # Build the "[[adj]]<+> [[noun]]<...>" spec, linking the adjective to
        # its lemma form.
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent", "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than the above. Often head= is wrong.
        # Try to update adjective and noun links from head= if given.
        #head = getparam(headt, "head")
        #if head:
        #  m = re.search("^([^ ]*) ([^ ]*)$", head)
        #  if not m:
        #    pagemsg("WARNING: Can't parse head=%s for adjective-noun combination, continuing: head=%s, decl=%s"
        #      % (head, unicode(headt), unicode(declt)))
        #  else:
        #    head_adj_link, head_noun_link = m.groups()
        #    m = re.search(r"\[\[([^][]*)\|([^][]*)\]\]$", head_adj_link)
        #    if m:
        #      adj_link_lemma, adj_link_form = m.groups()
        #      if adj_link_form.startswith(adj_link_lemma):
        #        head_adj_link = "[[%s]]%s" % (adj_link_lemma, adj_link_form[len(adj_link_lemma):])
        #    if head_adj_link != adj_link:
        #      pagemsg("NOTE: Head-derived adjective link %s not same as decl-template-derived adjective link %s, using the former: head=%s, decl=%s"
        #        % (head_adj_link, adj_link, unicode(headt), unicode(declt)))
        #      adj_link = head_adj_link
        #    if head_noun_link != noun_link:
        #      pagemsg("NOTE: Head-derived noun link %s not same as decl-template-derived noun link %s, using the former: head=%s, decl=%s"
        #        % (head_noun_link, noun_link, unicode(headt), unicode(declt)))
        #      noun_link = head_noun_link
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return
  else: # not adjectival
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    # NOTE(review): if fems/mascs are present but genders is not a single
    # m/f, default_equiv is never assigned and the analyze_forms() calls
    # below would raise NameError — confirm this can't happen in practice.
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        # Weak nouns: -(e)n(s) genitive plus -(e)n (or missing) plural.
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"]) and
            len(pls) == 1 and (pls[0] == "-" or any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"
  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")
  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    if headspec != declspec:
      # Tolerate a mismatch when the declension's forms are a subset of the
      # headword's and the genders agree; prefer the headword spec then.
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
              ",".join(all_decl_pls), ",".join(headword_pls),
              ",".join(all_decl_genders), unicode(headt), unicode(declt)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
              ",".join(all_decl_pls), ",".join(headword_pls),
              ",".join(all_decl_genders), ",".join(headword_genders),
              unicode(headt), unicode(declt)))
          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s"
          % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
            ",".join(all_decl_pls), ",".join(headword_pls),
            ",".join(all_decl_genders), ",".join(headword_genders),
            unicode(headt), unicode(declt)))
        return
  if is_proper:
    # Proper nouns default to singular; .sg is redundant and .both marks
    # singular+plural usage.
    headspec = headspec.replace(".sg", "")
  if is_both:
    if ".ss" in headspec:
      headspec = headspec.replace(".ss", ".both.ss")
    else:
      headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)
  # Perform the replacements in the subsection texts.
  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head],
      unicode(headt), newheadt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts],
      declts_existing, newdeclt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext
  return notes
def process_line(index, line, online):
  """Handle one logged WARNING line mentioning a Latin noun headword template.

  The line (produced by an earlier run) is parsed to recover the page title
  and the {{la-noun}}/{{la-proper noun}} template; from that template's
  lemma, genitive and declension params an equivalent {{la-ndecl}} call is
  derived and logged.  When `online` is true, the declension's forms are also
  generated and compared against the headword's genitives.  All output goes
  through msg()/errandmsg(); nothing is returned.
  """
  global args
  line = line.strip()
  m = re.search(
    r"^Page [0-9]+ (.*?): WARNING: Saw noun headword template.*: (\{\{la-(?:proper )?noun\|.*?\}\})$",
    line)
  if m is None:
    msg("Unrecognized line, skipping: %s" % line)
    return
  pagetitle, noun_headword_template = m.groups()

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  t = blib.parse_text(noun_headword_template).filter_templates()[0]
  if getparam(t, "indecl"):
    pagemsg("Skipping indeclinable noun: %s" % unicode(t))
    return

  # Pull the headword params; the lemma defaults to the page title.
  lemma = blib.fetch_param_chain(t, ["1", "head", "head1"], "head") or [pagetitle]
  genitive = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
  noun_gender = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
  noun_decl = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")

  if " " in lemma[0]:
    pagemsg("WARNING: Space in lemma %s, skipping: %s" % (lemma[0], unicode(t)))
    return
  if len(lemma) > 1:
    pagemsg("WARNING: Multiple lemmas %s, skipping: %s" % (",".join(lemma), unicode(t)))
    return
  lemma = lemma[0]

  # Map the spelled-out declension name to the numeric code used by
  # {{la-ndecl}}.
  noun_decl_to_decl_type = {
    "first": "1",
    "second": "2",
    "third": "3",
    "fourth": "4",
    "fifth": "5",
    "irregular": "irreg",
  }
  if not noun_decl:
    pagemsg("WARNING: No declension, skipping: %s" % unicode(t))
    return
  if len(noun_decl) > 1:
    pagemsg("WARNING: Multiple decls %s, skipping: %s" % (",".join(noun_decl), unicode(t)))
    return
  noun_decl = noun_decl[0]
  if noun_decl not in noun_decl_to_decl_type:
    pagemsg("WARNING: Unrecognized declension %s, skipping: %s" % (noun_decl, unicode(t)))
    return
  decl_type = noun_decl_to_decl_type[noun_decl]

  if decl_type in ["1", "2", "4", "5"]:
    # Regular declensions need no stem information beyond the lemma.
    la_ndecl = "{{la-ndecl|%s<%s>}}" % (lemma, decl_type)
  elif decl_type == "irreg":
    pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" % unicode(t))
    return
  elif decl_type != "3":
    # Defensive: every mapped value is handled above, so this is unreachable
    # unless the mapping table changes.
    pagemsg("WARNING: Something wrong, unrecognized decl_type %s, skipping: %s" % (decl_type, unicode(t)))
    return
  else:
    # Third declension: the stem (or plural-only subtype) must be inferred
    # from a single explicit genitive.
    if len(genitive) == 0:
      pagemsg(
        "WARNING: No genitives with decl 3 lemma %s, skipping: %s"
        % (lemma, unicode(t)))
      return
    if len(genitive) > 1:
      pagemsg(
        "WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s"
        % (",".join(genitive), lemma, unicode(t)))
      return
    gen1 = genitive[0]
    if gen1.endswith("is"):
      # Singular genitive in -is; spell out the stem only when it can't be
      # inferred automatically from the lemma.
      stem = gen1[:-2]
      if lalib.infer_3rd_decl_stem(lemma) == stem:
        la_ndecl = "{{la-ndecl|%s<3>}}" % lemma
      else:
        la_ndecl = "{{la-ndecl|%s/%s<3>}}" % (lemma, stem)
    elif gen1.endswith("ium"):
      # Plural genitive in -ium: plural-only noun, i-stem when the lemma
      # ends in -ēs.
      if lemma.endswith("ia"):
        la_ndecl = "{{la-ndecl|%s<3.pl>}}" % lemma
      elif lemma.endswith(u"ēs"):
        la_ndecl = "{{la-ndecl|%s<3.I.pl>}}" % lemma
      else:
        pagemsg(
          "WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s"
          % (lemma, unicode(t)))
        return
    elif gen1.endswith("um"):
      # Plural genitive in -um: plural-only consonant stem.
      if lemma.endswith("a") or lemma.endswith(u"ēs"):
        la_ndecl = "{{la-ndecl|%s<3.pl>}}" % lemma
      else:
        pagemsg(
          "WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s"
          % (lemma, unicode(t)))
        return
    else:
      pagemsg(
        "WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s"
        % (gen1, lemma, unicode(t)))
      return

  pagemsg("For noun %s, declension %s" % (unicode(t), la_ndecl))
  if online:
    # Expand the declension on the live wiki and cross-check its genitive
    # forms against the headword's.
    noun_props = convert_la_headword_noun.new_generate_noun_forms(
      la_ndecl, errandpagemsg, expand_text)
    if noun_props is None:
      return
    convert_la_headword_noun.compare_headword_decl_forms(
      "genitive", genitive, ["gen_sg", "gen_pl"], noun_props,
      "headword=%s, decl=%s" % (unicode(t), la_ndecl), pagemsg,
      adjust_for_missing_gen_forms=True, remove_headword_links=True)
def process_page_section(index, page, section, verbose):
  """Synchronize the {{ru-noun+}}/{{ru-proper noun+}} headword template of one
  page section with its {{ru-noun-table}} declension template.

  Strips gender params from the noun table, copies links from the headword
  into the table when they only differ by links, and otherwise overwrites the
  headword params from the (possibly n=-adjusted) table.

  Returns None to signal "skip this section", otherwise a 5-tuple:
    (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
     ru_noun_changed, ru_proper_noun_changed) with 0/1 flags.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None
  parsed = blib.parse_text(section)
  noun_table_templates = []
  noun_old_templates = []
  # A ru-decl-noun-see pointer means the declension lives elsewhere: bail out.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)
  # Only handle sections with exactly one (new-style) declension table.
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
        ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0
  # Old-style (manual) headwords are not handled by this converter.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0
  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # NOTE(review): decl_templates is built but not used below.
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]
  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))
  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)
  # Preserve gender/m/f/notrcat from the headword; they are re-added at the end.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  # Headword minus gender-ish params, for comparison against the table.
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)
  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  # g=/m=/f= don't belong in the declension table; drop them in place.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
        unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1
  # Working copy of the table that will become the new headword params.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)
  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = blib.split_generate_args(generate_result)
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
        generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = blib.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
          ndef_args["n"])
  # What the headword would look like if rebuilt from the table.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
    unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      # Headword carries wiki-links the table lacks; if that's the only
      # difference, copy the linked params into the table instead of
      # overwriting the headword (which would lose the links).
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually" % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s" % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True
  if change_existing_headword:
    # Rebuild the headword from the adjusted table, restoring the preserved
    # gender/m/f/notrcat params.
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
  #genders = runounlib.check_old_noun_headword_forms(headword_template, args,
  #  subpagetitle, pagemsg)
  #if genders == None:
  #  return None
  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))
  #params_to_preserve = runounlib.fix_old_headword_params(headword_template,
  #  new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None
  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (
      orig_noun_table_template, new_noun_table_template))
  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (
      orig_headword_template, new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1
  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
  """Extract the headword lemma(s) from a Latin headword template.

  t: parsed headword template object.
  pagename: page title, used as the fallback headword.
  pagemsg: logging callback.
  expand_text: optional template expander; defaults to a non-verbose
    blib.expand_text bound to pagename.

  Returns a list of headword strings (always a list; falls back to
  [pagename] when nothing usable is found).
  """
  if not expand_text:
    def expand_text(tempcall):
      return blib.expand_text(tempcall, pagename, pagemsg, False)
  tn = tname(t)
  if tn in [
    "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det", "la-pronoun"
  ]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      retval = getparam(t, "1")
      # Param 1 containing angle brackets, ((...)) alternants, spaces or
      # hyphens is a declension spec, not a plain lemma: expand the
      # generate-forms module to recover the linked nominative.
      if "<" in retval or "((" in retval or " " in retval or "-" in retval:
        generate_template = blib.parse_text(unicode(t)).filter_templates()[0]
        blib.set_template_name(generate_template, "la-generate-adj-forms")
        # Drop params the generator doesn't accept.
        blib.remove_param_chain(generate_template, "comp", "comp")
        blib.remove_param_chain(generate_template, "sup", "sup")
        blib.remove_param_chain(generate_template, "adv", "adv")
        blib.remove_param_chain(generate_template, "lemma", "lemma")
        rmparam(generate_template, "type")
        # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
        rmparam(generate_template, "indecl")
        rmparam(generate_template, "id")
        rmparam(generate_template, "pos")
        result = expand_text(unicode(generate_template))
        if not result:
          pagemsg("WARNING: Error generating forms, skipping")
          retval = ""
        else:
          args = blib.split_generate_args(result)
          # Prefer the singular nominative; fall back to plural (plural-only
          # lemmas).
          if "linked_nom_sg_m" in args:
            retval = args["linked_nom_sg_m"]
          elif "linked_nom_pl_m" in args:
            retval = args["linked_nom_pl_m"]
          else:
            pagemsg(
              "WARNING: Can't locate lemma in {{la-generate-adj-forms}} result: generate_template=%s, result=%s"
              % (unicode(generate_template), result))
            retval = ""
        retval = retval.split(",")
      else:
        # Plain lemma, possibly with a /stem suffix to strip.
        retval = re.sub("/.*", "", retval)
  elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      # Nouns always carry a declension spec; run the noun generator.
      generate_template = blib.parse_text(unicode(t)).filter_templates()[0]
      blib.set_template_name(generate_template, "la-generate-noun-forms")
      blib.remove_param_chain(generate_template, "lemma", "lemma")
      blib.remove_param_chain(generate_template, "m", "m")
      blib.remove_param_chain(generate_template, "f", "f")
      blib.remove_param_chain(generate_template, "g", "g")
      rmparam(generate_template, "type")
      # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
      rmparam(generate_template, "indecl")
      rmparam(generate_template, "id")
      rmparam(generate_template, "pos")
      result = expand_text(unicode(generate_template))
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        retval = ""
      else:
        args = blib.split_generate_args(result)
        if "linked_nom_sg" in args:
          retval = args["linked_nom_sg"]
        elif "linked_nom_pl" in args:
          retval = args["linked_nom_pl"]
        else:
          pagemsg(
            "WARNING: Can't locate lemma in {{la-generate-noun-forms}} result: generate_template=%s, result=%s"
            % (unicode(generate_template), result))
          retval = ""
      retval = retval.split(",")
  elif tn in ["la-verb", "la-suffix-verb"]:
    retval = blib.fetch_param_chain(t, "lemma", "lemma")
    if not retval:
      generate_template = blib.parse_text(unicode(t)).filter_templates()[0]
      blib.set_template_name(generate_template, "la-generate-verb-forms")
      rmparam(generate_template, "id")
      result = expand_text(unicode(generate_template))
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        retval = ""
      else:
        args = blib.split_generate_args(result)
        # Take the first available principal part, in preference order
        # (present before perfect, 1st person before 3rd).
        for slot in [
          "linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
          "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"
        ]:
          if slot in args:
            retval = args[slot]
            break
        else: # no break
          pagemsg(
            "WARNING: Can't locate lemma in {{la-generate-verb-forms}} result: generate_template=%s, result=%s"
            % (unicode(generate_template), result))
          retval = ""
      retval = retval.split(",")
  elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
      tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
    retval = getparam(t, "1")
  elif tn == "la-letter":
    retval = pagename
  elif tn in ["head", "la-prep"]:
    retval = blib.fetch_param_chain(t, "head", "head")
  elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
    retval = blib.fetch_param_chain(t, "1", "head")
  else:
    pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
    retval = ""
  # Fall back to the page title and normalize to a list.
  retval = retval or pagename
  if type(retval) is not list:
    retval = [retval]
  return retval
def handle_mf(mf, mf_full, make_mf): mfs = blib.fetch_param_chain(t, mf, mf) mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl") if mfs and not any(x.startswith("+") for x in mfs): defmf = make_mf(lemma) if set(mfs) == {defmf}: defpls = make_plural(defmf) ok = False if not mfpls or set(mfpls) == set(defpls): ok = True elif set(mfpls) < set(defpls): pagemsg( "WARNING: %pl=%s subset of default=%s, allowing" % (mf, ",".join(mfpls), ",".join(defpls))) ok = True if ok: notes.append( "replace %s=%s with '+' in {{es-noun}}" % (mf, ",".join(mfs))) blib.set_param_chain(t, ["+"], mf, mf) blib.remove_param_chain(t, mf + "pl", mf + "pl") return actual_special = None for special in all_specials: special_mf = make_mf(lemma, special) if special_mf is None: continue if mfs == [special_mf]: pagemsg("Found special=%s with special_mf=%s" % (special, special_mf)) actual_special = special break if actual_special: if not mfpls: pagemsg( "WARNING: Explicit %s=%s matches special=%s but no %s plural" % (mf, ",".join(mfs), actual_special, mf_full)) else: special_mfpl = make_plural(special_mf, actual_special) if special_mfpl: if len(special_mfpl) > 1 and set(mfpls) < set( special_mfpl): pagemsg( "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing" % (mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf, ",".join(special_mfpl))) elif set(mfpls) == set(special_mfpl): pagemsg( "Found %s=%s and special=%s, %spls=%s matches special_%spl" % (mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf)) else: pagemsg( "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s" % (mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf, ",".join(special_mfpl))) actual_special = None if actual_special: notes.append( "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural" % (mf_full, ",".join(mfs), actual_special, mf_full)) blib.set_param_chain(t, ["+%s" % actual_special], mf, mf) 
blib.remove_param_chain(t, mf + "pl", mf + "pl") if not actual_special: defmf = make_mf(lemma) mfs_with_def = ["+" if x == defmf else x for x in mfs] if mfs_with_def != mfs: notes.append( "replace default %s %s with '+' in {{es-noun}}" % (mf_full, defmf)) blib.set_param_chain(t, mfs_with_def, mf, mf) if mfpls: defpl = [ x for y in mfs for x in (make_plural(y) or []) ] ok = False if set(defpl) == set(mfpls): ok = True elif len(defpl) > 1 and set(mfpls) < set(defpl): pagemsg( "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing" % (mf, ",".join(mfs), mf, ",".join(mfpls), ",".join(defpl))) ok = True if ok: pagemsg( "Found %s=%s, %spl=%s matches default pl" % (mf, ",".join(mfs), mf, ",".join(mfpls))) notes.append( "remove redundant explicit %s plural %s in {{es-noun}}" % (mf_full, ",".join(mfpls))) blib.remove_param_chain( t, mf + "pl", mf + "pl") else: for special in all_specials: defpl = [ x for y in mfs for x in ( make_plural(y, special) or []) ] if set(defpl) == set(mfpls): pagemsg( "Found %s=%s, %spl=%s matches special=%s" % (mf, ",".join(mfs), mf, ",".join(mfpls), special)) notes.append( "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}" % (mf_full, ",".join(mfpls), special)) blib.set_param_chain( t, ["+%s" % special], mf + "pl", mf + "pl")
return True def compare_genders(g1, g2): if set(g1) == set(g2): return True if len(g1) == 1 and len(g2) == 1: # If genders don't match exactly, check if existing gender is missing # animacy and allow that, so it gets overwritten with new gender if g1[0] == re.sub("-(an|in)", "", g2[0]): pagemsg( "Existing gender %s missing animacy spec compared with proposed %s, allowed" % (",".join(g1), ",".join(g2))) return True return None headwords = blib.fetch_param_chain(headword_template, "1", "head", subpagetitle) translits = blib.fetch_param_chain(headword_template, "tr", "tr") for i in xrange(len(translits)): if len(headwords) <= i: pagemsg( "WARNING: Not enough headwords for translit tr%s=%s, skipping" % ("" if i == 0 else str(i + 1), translits[i])) return None else: headwords[i] += "//" + translits[i] genitives = blib.fetch_param_chain(headword_template, "3", "gen") plurals = blib.fetch_param_chain(headword_template, "4", "pl") genders = blib.fetch_param_chain(headword_template, "2", "g") cases_to_check = None if args["n"] == "s": if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True)
def process_page(page, index, fixdirecs):
  """Find unpaired transitive imperfective Russian verbs whose conjugation
  lacks a past passive participle, and apply the per-page fix direction
  given in `fixdirecs` (pagetitle -> one of "fixed", "paired", "intrans",
  "+p", "|ppp=-").

  Returns (new page text, notes) where notes lists the changes made.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    # NOTE(review): 'verbose' is a free variable here (presumably a module
    # global); this closure is never invoked below — confirm before use.
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  # NOTE(review): 'text' is fetched but unused below.
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  saw_paired_verb = False
  for temp in parsed.filter_templates():
    temp_name = unicode(temp.name)
    if temp_name == "ru-verb":
      # A new headword resets pairing state for the conjugations that follow.
      saw_paired_verb = False
      if getparam(temp, "2") in ["impf", "both"]:
        verb = getparam(temp, "1") or pagetitle
        counterparts = (blib.fetch_param_chain(temp, "pf", "pf") +
          blib.fetch_param_chain(temp, "impf", "impf"))
        for counterpart in counterparts:
          # Aspect pair sharing the first two letters counts as "paired".
          if verb[0:2] == counterpart[0:2]:
            saw_paired_verb = True
    if (temp_name in ["ru-conj", "ru-conj-old"] and
        getparam(temp, "1") == "impf" and not saw_paired_verb):
      if getparam(temp, "ppp") or getparam(temp, "past_pasv_part"):
        # Already has a PPP: nothing to do.
        continue
      if [x for x in temp.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(temp))
        continue
      if re.search(r"\+p|\[?\([78]\)\]?", getparam(temp, "2")):
        # +p or variant codes (7)/(8) already imply PPP handling.
        continue
      pagemsg("Apparent unpaired transitive imperfective without PPP")
      if pagetitle not in fixdirecs:
        continue
      direc = fixdirecs[pagetitle]
      assert direc in ["fixed", "paired", "intrans", "+p", "|ppp=-"]
      origt = unicode(temp)
      if direc == "+p":
        temp.add("2", getparam(temp, "2") + "+p")
        notes.append(
          "add missing past passive participle to transitive unpaired imperfective verb"
        )
        pagemsg("Add missing PPP, replace %s with %s" % (origt, unicode(temp)))
      elif direc == "|ppp=-":
        temp.add("ppp", "-")
        notes.append(
          "note transitive unpaired imperfective verb as lacking past passive participle"
        )
        pagemsg("Note no PPP, replace %s with %s" % (origt, unicode(temp)))
      elif direc == "paired":
        pagemsg("Verb actually is paired")
      elif direc == "fixed":
        pagemsg("WARNING: Unfixed verb marked as fixed")
      elif direc == "intrans":
        pagemsg("WARNING: Transitive verb marked as intrans")
  return unicode(parsed), notes
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
  """On a Russian noun page, add related-adjective or diminutive links to the
  headword template's `param` chain, then remove those terms from any
  Derived/Related terms subsections (they are now carried by the headword).

  nounpage: pywikibot page of the noun.
  new_adj_or_dims: list of terms to add.
  param: headword param chain name to add them to.
  desc: human-readable description used in log messages/notes.

  Returns (new page text, notes), or None (implicitly) on early skip.
  """
  notes = []
  pagetitle = unicode(nounpage.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(nounpage.text)
  retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  # Locate the single noun headword template; bail on ambiguity.
  head = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
      if head:
        pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" % (
          unicode(head), unicode(t), desc, ",".join(new_adj_or_dims)))
        return
      head = t
  if not head:
    pagemsg("WARNING: Couldn't find head for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
  adjs_or_dims = blib.fetch_param_chain(head, param, param)
  added_adjs_or_dims = []
  # Append only terms not already present, tracking what was actually added.
  for adj_or_dim in new_adj_or_dims:
    if adj_or_dim in adjs_or_dims:
      pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head)))
    else:
      adjs_or_dims.append(adj_or_dim)
      added_adjs_or_dims.append(adj_or_dim)
  if adjs_or_dims != orig_adjs_or_dims:
    orighead = unicode(head)
    blib.set_param_chain(head, adjs_or_dims, param, param)
    pagemsg("Replaced %s with %s" % (orighead, unicode(head)))
    notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
  secbody = unicode(parsed)
  # Split into (header, body) pairs: odd indexes are ==...== headers.
  subsecs = re.split("(^==.*==\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsecs), 2):
    if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
      header = re.sub("=", "", subsecs[k - 1]).strip()
      for adj_or_dim in adjs_or_dims:
        def note_removed_text(m):
          # Warn when the removed {{l}}/{{m}} link carried extra params
          # (gloss etc.) that are being discarded.
          if m.group(1):
            pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" % (
              header, desc, adj_or_dim, m.group(0)))
          return ""
        newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % adj_or_dim,
          note_removed_text, subsecs[k])
        if newsubsecsk != subsecs[k]:
          notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
          subsecs[k] = newsubsecsk
        # Clean up list punctuation left behind by the removals.
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        # Repeat in case adjacent terms removed (unlikely though).
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M)
      # Drop the whole subsection (and its header) if now empty.
      if re.search(r"^\s*$", subsecs[k]):
        subsecs[k] = ""
        subsecs[k - 1] = ""
  secbody = "".join(subsecs)
  secj = secbody + sectail
  # Collapse runs of 3+ newlines left by removed subsections.
  newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
  if newsecj != secj and not notes:
    notes.append("eliminate sequences of 3 or more newlines")
  secj = newsecj
  sections[j] = secj
  return "".join(sections), notes
def process_page(index, page, save, verbose, adverbs, all_derived_lemmas):
  """For a Russian base lemma page, generate candidate suffixed derivatives
  (adjectives, nouns and, if `adverbs`, adverbs), and for each candidate that
  exists in `all_derived_lemmas` but has no etymology section, emit a
  machine-readable line pairing the derived lemma(s) with the base lemma(s),
  the suffix (in its possible stress variants) and both sets of definitions.

  save: unused here (kept for the common process_page signature).
  adverbs: also try the adverbial suffixes -о/-е/-и.
  all_derived_lemmas: set/collection of lemmas known to exist.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  # ending and whether final consonant is palatal
  endings = [
    (u"ывать", False), (u"ивать", False), (u"ать", False), (u"ять", True),
    (u"еть", True), (u"ить", True), (u"нуть", False), (u"ия", True),
    (u"ие", True), (u"я", True), (u"е", True), (u"ь", True), (u"и", True),
    (u"а", False), (u"о", False), (u"ы", False), (u"ый", False),
    (u"ий", True), (u"ой", False),
  ]
  # Strip every matching ending to get candidate stems; fall back to the
  # full page title if nothing matches.
  stems = []
  for ending, is_palatal in endings:
    if pagetitle.endswith(ending):
      stem = re.sub(ending + "$", "", pagetitle)
      stems.append((stem, is_palatal))
  if not stems:
    stems.append((pagetitle, False))
  possible = []
  def append_possible(stem_to_try, suffix):
    # Record a candidate derived lemma along with the suffix that formed it.
    possible.append((stem_to_try.lower() + suffix, suffix))
  # Try -ный/-ной, -ка, -ко
  for stem, palatal in stems:
    stems_to_try = []
    def frob(stem):
      # Normalize a stem for these suffixes: palatalize, soften final -л,
      # append -й after a final vowel.
      stem = first_palatalization(stem)
      if stem.endswith(u"л"):
        stem += u"ь"
      if re.search("[" + rulib.vowel + "]$", stem):
        stem += u"й"
      return stem
    # Base stem plus the two dereduced (inserted-vowel) variants.
    to_try_1 = frob(stem)
    to_try_2 = rulib.dereduce_stem(stem, False)
    if to_try_2:
      to_try_2 = frob(rulib.remove_accents(to_try_2))
    to_try_3 = rulib.dereduce_stem(stem, True)
    if to_try_3:
      to_try_3 = frob(rulib.remove_accents(to_try_3))
    stems_to_try.append(to_try_1)
    if to_try_2:
      stems_to_try.append(to_try_2)
    if to_try_3 and to_try_3 != to_try_2:
      stems_to_try.append(to_try_3)
    for stem_to_try in stems_to_try:
      append_possible(stem_to_try, u"ный")
      append_possible(stem_to_try, u"ной")
      append_possible(stem_to_try, u"ский")
      append_possible(stem_to_try, u"ской")
      append_possible(stem_to_try, u"ник")
      append_possible(stem_to_try, u"чик")
      append_possible(stem_to_try, u"щик")
      append_possible(stem_to_try, u"ка")
      append_possible(stem_to_try, u"ко")
      append_possible(stem_to_try, u"ство")
  # Try -овый/-евый/-ёвый/-овой/-евой, -ик, -ок/-ек/-ёк
  for stem, palatal in stems:
    stems_to_try = []
    stems_to_try.append(stem)
    reduced = rulib.reduce_stem(stem)
    if reduced:
      stems_to_try.append(reduced)
    for stem_to_try in stems_to_try:
      if stem_to_try.endswith(u"й"):
        stem_to_try = stem_to_try[:-1]
      append_possible(stem_to_try, u"овый")
      append_possible(stem_to_try, u"евый")
      append_possible(stem_to_try, u"ёвый")
      append_possible(stem_to_try, u"овой")
      append_possible(stem_to_try, u"евой")
      # The remaining suffixes attach to the palatalized stem.
      stem_to_try = first_palatalization(stem_to_try)
      append_possible(stem_to_try, u"еский")
      append_possible(stem_to_try, u"ический")
      append_possible(stem_to_try, u"ество")
      append_possible(stem_to_try, u"ик")
      append_possible(stem_to_try, u"ок")
      append_possible(stem_to_try, u"ек")
      append_possible(stem_to_try, u"ёк")
      append_possible(stem_to_try, u"ец")
  # If derived adverbs, try -о, -е, -и
  if adverbs:
    for stem, palatal in stems:
      stems_to_try = []
      stems_to_try.append(stem)
      for stem_to_try in stems_to_try:
        append_possible(stem_to_try, u"о")
        append_possible(stem_to_try, u"е")
        append_possible(stem_to_try, u"и")
  # Cheap pre-check before fetching any page text.
  would_output = False
  for possible_derived, suffix in possible:
    if possible_derived in all_derived_lemmas:
      would_output = True
  if not would_output:
    return
  text = unicode(page.text)
  if rulib.check_for_alt_yo_terms(text, pagemsg):
    return
  base_lemmas = []
  for possible_derived, suffix in possible:
    if possible_derived in all_derived_lemmas:
      derived_section = blib.find_lang_section(possible_derived, "Russian",
        pagemsg, errandpagemsg)
      if not derived_section:
        errandpagemsg("WARNING: Couldn't find Russian section for derived term %s" %
          possible_derived)
        continue
      # Only report derived terms still lacking an etymology.
      if "==Etymology" in derived_section:
        pagemsg("Skipping derived term %s because it already has an etymology" %
          possible_derived)
        continue
      derived_defns = rulib.find_defns(derived_section)
      if not derived_defns:
        errandpagemsg("WARNING: Couldn't find definitions for derived term %s" %
          possible_derived)
        continue
      derived_parsed = blib.parse_text(derived_section)
      # Collect the derived term's lemma forms (nouns plus adj/adv headwords),
      # with translit appended as LEMMA//TR when present.
      derived_lemmas = find_noun_lemmas(derived_parsed, possible_derived,
        errandpagemsg,
        lambda tempcall: blib.expand_text(tempcall, possible_derived, pagemsg,
          verbose))
      for t in derived_parsed.filter_templates():
        if tname(t) in ["ru-adj", "ru-adv"]:
          lemmas = blib.fetch_param_chain(t, "1", "head", possible_derived)
          trs = blib.fetch_param_chain(t, "tr", "tr")
          if trs:
            lemmas = [
              "%s//%s" % (lemma, tr) for lemma, tr in zip(lemmas, trs)
            ]
          for lemma in lemmas:
            add_if_not(derived_lemmas, lemma)
      if not derived_lemmas:
        errandpagemsg("WARNING: No derived term lemmas for %s" % possible_derived)
        return
      # Lazily gather the base page's lemmas/definitions once, on the first
      # derived term that needs them.
      if not base_lemmas:
        base_parsed = blib.parse_text(text)
        base_lemmas = find_noun_lemmas(base_parsed, pagetitle, errandpagemsg,
          expand_text)
        for t in base_parsed.filter_templates():
          if tname(t) in ["ru-verb", "ru-adj"]:
            lemmas = blib.fetch_param_chain(t, "1", "head", pagetitle)
            trs = blib.fetch_param_chain(t, "tr", "tr")
            if trs:
              lemmas = [
                "%s//%s" % (lemma, tr) for lemma, tr in zip(lemmas, trs)
              ]
            for lemma in lemmas:
              add_if_not(base_lemmas, lemma)
        if not base_lemmas:
          errandpagemsg("WARNING: No base lemmas")
          return
        base_lemmas = [
          rulib.remove_monosyllabic_accents(x) for x in base_lemmas
        ]
        warnings = []
        if len(base_lemmas) > 1:
          warnings.append("multiple-lemmas")
        if any("//" in lemma for lemma in base_lemmas):
          warnings.append("translit-in-lemma")
        base_section = blib.find_lang_section_from_text(text, "Russian", pagemsg)
        if not base_section:
          errandpagemsg("WARNING: Couldn't find Russian section for base")
          return
        base_defns = rulib.find_defns(base_section)
        if not base_defns:
          errandpagemsg("WARNING: Couldn't find definitions for base")
          return
      def concat_defns(defns):
        # Pack definitions into one field: spaces -> _, literal _ -> \u.
        return ";".join(defns).replace("_", r"\u").replace(" ", "_")
      # Report the suffix in whichever stress variants actually occur at the
      # end of the derived lemmas.
      suffixes_with_stress = []
      for suf in [
        suffix, rulib.make_beginning_stressed_ru(suffix),
        rulib.make_ending_stressed_ru(suffix)
      ]:
        for derived_lemma in derived_lemmas:
          if derived_lemma.endswith(suf):
            add_if_not(suffixes_with_stress, suf)
      msg("%s %s+-%s%s no-etym possible-suffixed %s //// %s" % (
        ",".join(derived_lemmas), ",".join(base_lemmas),
        ",".join(suffixes_with_stress),
        " WARNING:%s" % ",".join(warnings) if warnings else "",
        concat_defns(base_defns), concat_defns(derived_defns)))
def process_page_section(index, page, section, verbose):
  """Synchronize the {{ru-noun+}}/{{ru-proper noun+}} headword template of one
  page section with its {{ru-noun-table}} declension template.

  Near-duplicate of the sibling process_page_section above, with two
  differences: it calls ru.split_generate_args (the sibling uses
  blib.split_generate_args — NOTE(review): confirm 'ru' is the module this
  script imports), and it only overwrites the headword when the page is in
  the free variable 'lemmas' (presumably a module-level restriction list —
  TODO confirm; empty/None means "no restriction").

  Returns None to signal "skip this section", otherwise a 5-tuple:
    (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
     ru_noun_changed, ru_proper_noun_changed) with 0/1 flags.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None
  parsed = blib.parse_text(section)
  noun_table_templates = []
  noun_old_templates = []
  # A ru-decl-noun-see pointer means the declension lives elsewhere: bail out.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)
  # Only handle sections with exactly one (new-style) declension table.
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
        ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0
  # Old-style (manual) headwords are not handled by this converter.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0
  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # NOTE(review): decl_templates is built but not used below.
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]
  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))
  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)
  # Preserve gender/m/f/notrcat from the headword; they are re-added at the end.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  # Headword minus gender-ish params, for comparison against the table.
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)
  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  # g=/m=/f= don't belong in the declension table; drop them in place.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
        unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1
  # Working copy of the table that will become the new headword params.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)
  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
        generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
          ndef_args["n"])
  # What the headword would look like if rebuilt from the table.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
    unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      # Headword carries wiki-links the table lacks; if that's the only
      # difference, copy the linked params into the table instead of
      # overwriting the headword (which would lose the links).
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually" % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s" % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True
  # 'lemmas' is a free variable (module-level restriction list) — see docstring.
  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    # Rebuild the headword from the adjusted table, restoring the preserved
    # gender/m/f/notrcat params.
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
  #genders = runoun.check_old_noun_headword_forms(headword_template, args,
  #  subpagetitle, pagemsg)
  #if genders == None:
  #  return None
  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))
  #params_to_preserve = runoun.fix_old_headword_params(headword_template,
  #  new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None
  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (
      orig_noun_table_template, new_noun_table_template))
  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (
      orig_headword_template, new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1
  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def process_text_on_page(index, pagetitle, text):
  # Analysis pass over one wiki page: read Belarusian noun headword templates
  # ({{be-noun}}, {{be-proper noun}}) and the declension tables that follow
  # ({{be-decl-noun}}, {{be-decl-noun-unc}}, {{be-decl-noun-pl}}), and log a
  # tab-separated summary plus an inferred compact declension spec
  # (gender/accent pattern/reducibility/vowel alternation/animacy).
  # Output is only via pagemsg(); nothing is returned and the page text is
  # never modified.
  def pagemsg(txt):
    # Prefix every log line with the page index and title.
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  # State carried from the most recent headword template to the declension
  # template(s) that follow it on the page.
  heads = None
  plurale_tantum = False
  animacy = "unknown"
  gender = "unknown"
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["be-noun", "be-proper noun"]:
      # Headword template: remember heads and parse the gender spec(s),
      # which look like e.g. "m-in", "f-an-p" (gender[-animacy[-plural]]).
      heads = blib.fetch_param_chain(t, "1", "head")
      gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
      plurale_tantum = False
      animacy = []
      gender = []
      if gender_and_animacy:
        for ga in gender_and_animacy:
          gender_and_animacy_parts = ga.split("-")
          g = gender_and_animacy_parts[0]
          if g not in gender:
            gender.append(g)
          if len(gender_and_animacy_parts) > 1:
            a = gender_and_animacy_parts[1]
            if a not in animacy:
              animacy.append(a)
          if len(gender_and_animacy_parts) > 2 and gender_and_animacy_parts[2] == "p":
            plurale_tantum = True
      if not animacy:
        animacy = "unknown"
      elif len(animacy) > 1:
        pagemsg("WARNING: Multiple animacies: %s" % ",".join(animacy))
        animacy = animacy[0]
      # NOTE(review): when exactly one animacy is listed, `animacy` stays a
      # one-element list here (the [0] extraction only happens in the
      # multiple-animacy branch), so later comparisons like
      # `animacy in ["pr", "anml"]` can never match — looks like a bug;
      # confirm against the upstream script.
      if not gender:
        gender = "unknown"
      elif set(gender) == {"m", "f"}:
        gender = "MF"
      else:
        if len(gender) > 1:
          pagemsg("WARNING: Multiple genders: %s" % ",".join(gender))
        gender = gender[0]
        # Normalize to the uppercase single-letter codes used below.
        if gender in ["m", "f", "n"]:
          gender = gender.upper()
        else:
          pagemsg("WARNING: Unknown gender: %s" % gender)
          gender = "unknown"
    def fetch(param):
      # Fetch a declension-table cell: strip links and trailing footnote
      # symbols, normalize stress marking, and rejoin multiple
      # comma-separated forms as ", "-separated.
      val = getparam(t, param).strip()
      val = blib.remove_links(val)
      vals = re.split(r",\s*", val)
      retval = []
      for v in vals:
        # Remove final footnote symbols as per [[Module:table tools]].
        v = re.sub(ur"[*~@#$%^&+0-9_\u00A1-\u00BF\u00D7\u00F7\u2010-\u2027\u2030-\u205E\u2070-\u20CF\u2100-\u2B5F\u2E00-\u2E3F]*$", "", v)
        v = be.mark_stressed_vowels_in_unstressed_syllables(v, pagemsg)
        retval.append(be.add_monosyllabic_accent(v))
      return ", ".join(retval)
    def matches(is_end_stressed, should_be_end_stressed):
      # True when an observed end-stress value is compatible with an accent
      # pattern's requirement (None in the pattern = "don't care").
      return (is_end_stressed == "mixed" or should_be_end_stressed is None or
        is_end_stressed == should_be_end_stressed)
    def fetch_endings(param, endings):
      # Map each form in a cell to the first matching ending from `endings`
      # (ignoring the acute accent); join the findings with ":".
      paramval = fetch(param)
      values = re.split(", *", paramval)
      found_endings = []
      for v in values:
        v = v.replace(be.AC, "")
        for ending in endings:
          if v.endswith(ending):
            found_endings.append(ending)
            break
        else: # no break
          pagemsg("WARNING: Couldn't recognize ending for %s=%s: %s" % (
            param, paramval, unicode(t)))
      return ":".join(found_endings)
    def canon(val):
      # Undo the internal stress-marking for display; "/"-join variants.
      values = re.split(", *", val)
      return "/".join(be.undo_mark_stressed_vowels_in_unstressed_syllables(v)
        for v in values)
    def stress(endstressed):
      # Render an end-stress flag for the log line; presumably
      # param_is_end_accented() returns True/False/"mixed" — TODO confirm.
      return ("endstressed" if endstressed == True else
        "stemstressed" if endstressed == False else "mixed")
    def check_multi_stressed(maxparam):
      # Sanity-check numbered params 1..maxparam for double or missing
      # stress marks.
      for i in xrange(1, maxparam + 1):
        val = getparam(t, str(i))
        vals = re.split(r",\s*", val)
        for v in vals:
          if be.is_multi_stressed(v):
            pagemsg("WARNING: Param %s=%s has multiple stresses: %s" % (
              (str(i), val, unicode(t))))
          if be.needs_accents(v):
            pagemsg("WARNING: Param %s=%s has missing stress: %s" % (
              (str(i), val, unicode(t))))
    def ins_sg_note(ins_sg):
      # Show the instrumental singular for feminines in hushing/soft
      # consonants, where it is unpredictable.
      # NOTE(review): `gender` was uppercased to "F" above, so
      # `gender == "f"` can never be true and this always returns "" —
      # looks like a bug; confirm against the upstream script.
      if re.search(u"[чшжрць]$", heads[0]) and gender == "f":
        return "ins_sg=%s " % canon(ins_sg)
      else:
        return ""
    def truncate_extra_forms(form):
      # Keep only the first of several comma-separated forms.
      return re.sub(",.*", "", form)
    def infer_animacy(nom_pl, gen_pl, acc_pl):
      # Slavic rule of thumb: acc pl = nom pl -> inanimate;
      # acc pl = gen pl -> animate.
      nom_pl_vals = set(nom_pl.split(", "))
      gen_pl_vals = set(gen_pl.split(", "))
      acc_pl_vals = set(acc_pl.split(", "))
      if acc_pl_vals == nom_pl_vals:
        return "in"
      elif acc_pl_vals == gen_pl_vals:
        return "an"
      else:
        pagemsg("WARNING: Can't infer animacy: nom_pl=%s, gen_pl=%s, acc_pl=%s" % (
          nom_pl, gen_pl, acc_pl))
        return "unknown"
    def infer_gender(lemma):
      # Default gender from the lemma's ending; None = can't tell.
      if re.search(u"[оеё]́?$", lemma) or re.search(u"мя́?$", lemma):
        return "N"
      elif re.search(u"[цс]тва$", lemma):
        return "N"
      elif re.search(u"[ая]́?$", lemma) or re.search(u"асць$", lemma):
        return "F"
      elif re.search(u"ь$", lemma):
        return None
      elif re.search(be.cons_c + "$", lemma):
        return "M"
      else:
        pagemsg("WARNING: Unrecognized lemma ending: %s" % lemma)
        return None
    def default_stress(lemma, gender, reducible):
      # Accent pattern that the declension module would assume by default,
      # so that redundant pattern letters can be omitted from the spec.
      if re.search(u"я́$", lemma) and gender == "N":
        return "b"
      elif re.search(AC + "$", lemma):
        return "d"
      elif "*" in reducible and re.search(
          u"[еоэаё]́" + be.cons_c + u"ь?$", lemma):
        return "b"
      else:
        return "a"
    def infer_alternations(nom_sg, nom_pl):
      # Detect a vowel alternation (e.g. а/э, а/о, ы/о) between the singular
      # and plural stems; returns the alternation code or None.
      nom_sg = truncate_extra_forms(nom_sg)
      nom_pl = truncate_extra_forms(nom_pl)
      if re.search(u"^.*[аяеёо]́$", nom_sg):
        # Vowel-ending, end-stressed singular: compare each candidate
        # alternation of the sg stem against the pl stem.
        m = re.search(u"^(.*)[ыіая]$", nom_pl)
        if m:
          pl_stem = m.group(1)
          for valt in possible_vowel_alternations:
            valt_nom_sg = be.apply_vowel_alternation(nom_sg, valt)
            if valt_nom_sg:
              valt_nom_sg = re.sub(u"[аяеёо]́$", "", valt_nom_sg)
              valt_nom_sg = be.maybe_accent_final_syllable(valt_nom_sg)
              valt_nom_sg = be.destress_vowels_after_stress_movement(valt_nom_sg)
              if valt_nom_sg == be.undo_mark_stressed_vowels_in_unstressed_syllables(pl_stem):
                return valt
      # Consonant-ending singular: check for the а/е ("ae") alternation
      # against an end-stressed plural.
      m = re.search(u"^(.*" + be.cons_c + u")ь?$", nom_sg)
      if m:
        nom_sg = m.group(1)
        nom_sg = re.sub(u"й$", "", nom_sg)
        if re.search(u"я" + be.cons_c + "*" + be.vowel_c + AC + be.cons_c + "*$", nom_sg):
          nom_sg = be.apply_vowel_alternation(nom_sg, "ae")
        m = re.search(u"^.*([ыіая]́)$", nom_pl)
        if m:
          nom_sg = be.remove_accents(nom_sg) + m.group(1)
          nom_sg = be.destress_vowels_after_stress_movement(nom_sg)
          if nom_sg == be.undo_mark_stressed_vowels_in_unstressed_syllables(nom_pl):
            return "ae"
      return None
    def vowel_stem_from_vowel_ending_nom_sg(nom_sg):
      # Strip the final vowel (plus optional accent) to get the stem; a
      # stem ending in a vowel gains -й.
      m = re.search(u"^(.*)[аяеоё]́?$", nom_sg)
      assert m
      vowel_stem = m.group(1)
      if re.search(be.vowel_c + AC + "?$", vowel_stem):
        vowel_stem += u"й"
      return vowel_stem
    def compare_stems(marked_stem, unmarked_stem):
      # Compare a stress-marked candidate stem with an observed stem.
      return (be.destress_vowels_after_stress_movement(marked_stem) ==
        be.undo_mark_stressed_vowels_in_unstressed_syllables(unmarked_stem))
    def infer_reducible(nom_sg, gen_sg, gen_pl, gender, seen_patterns):
      # Infer the reducible/dereducible indicator ("*", "#", "(ў)", "(-)"
      # combinations) by comparing vowel-stem and nonvowel-stem forms.
      # Needs exactly one accent pattern to know where epenthetic stress
      # would fall; returns a comma-joined spec or None.
      if len(seen_patterns) > 1:
        pagemsg("WARNING: Multiple patterns %s, not inferring reducible" %
          ",".join(seen_patterns))
        return None
      if len(seen_patterns) == 0:
        pagemsg("WARNING: No patterns, not inferring reducible")
        return None
      seen_pattern = seen_patterns[0]
      nom_sg = truncate_extra_forms(nom_sg)
      gen_sg = truncate_extra_forms(gen_sg)
      gen_pls = gen_pl and re.split(", *", gen_pl) or []
      if re.search(u"[аяеоё]́?$", nom_sg):
        # Vowel-ending lemma: the bare (nonvowel) stem shows up in the
        # genitive plural; compare each gen pl against the sg stem with
        # and without dereduction and -ав/-оў extensions.
        epenthetic_stress = seen_pattern in ["b", "c", "e", "f"]
        vowel_stem = vowel_stem_from_vowel_ending_nom_sg(nom_sg)
        if seen_pattern in ["b", "d"]:
          vowel_stem = be.maybe_accent_final_syllable(vowel_stem)
        else:
          vowel_stem = be.maybe_accent_initial_syllable(vowel_stem)
        retvals = []
        for gen_pl in gen_pls:
          nonvowel_stem = re.sub(u"ў$", u"в", re.sub(u"ь$", "", gen_pl))
          if compare_stems(vowel_stem, nonvowel_stem):
            retvals.append("(-)" if gender == "N" else "")
            continue
          if compare_stems(be.dereduce(vowel_stem, epenthetic_stress) or "",
              nonvowel_stem):
            retvals.append("*(-)" if gender == "N" else "*")
            continue
          if compare_stems(be.dereduce(vowel_stem, not epenthetic_stress) or "",
              nonvowel_stem):
            retvals.append("*#(-)" if gender == "N" else "*#")
            continue
          if (compare_stems(vowel_stem + u"ав", nonvowel_stem) or
              compare_stems(vowel_stem + u"яв", nonvowel_stem)):
            if epenthetic_stress:
              retvals.append("#" if gender == "N" else u"#(ў)")
            else:
              retvals.append("" if gender == "N" else u"(ў)")
            continue
          if (compare_stems(be.remove_accents(vowel_stem) + u"о́в", nonvowel_stem) or
              compare_stems(be.remove_accents(vowel_stem) + u"ё́в", nonvowel_stem)):
            if epenthetic_stress:
              retvals.append("" if gender == "N" else u"(ў)")
            else:
              retvals.append("#" if gender == "N" else u"#(ў)")
            continue
          #for valt in possible_vowel_alternations:
          #  valt_nom_sg = be.apply_vowel_alternation(nom_sg, valt)
          #  if valt_nom_sg:
          #    valt_vowel_stem = vowel_stem_from_vowel_ending_nom_sg(valt_nom_sg)
          #    if be.remove_accents(valt_vowel_stem) == be.remove_accents(nonvowel_stem):
          #      retvals.append("(-)" if gender == "N" else "")
          #      break
          #    if (be.remove_accents(valt_vowel_stem) + u"ав" == be.remove_accents(nonvowel_stem) or
          #      be.remove_accents(valt_vowel_stem) + u"яв" == be.remove_accents(nonvowel_stem)):
          #      retvals.append("" if gender == "N" else u"(ў)")
          #      break
          #else: # no break
          pagemsg("WARNING: Unable to determine relationship between nom_sg %s and gen_pl %s" % (
            nom_sg, gen_pl))
        return ",".join(retvals)
      else:
        # Consonant-ending lemma: the vowel stem shows up in the genitive
        # singular; check for reduction there.
        orig_nom_sg = nom_sg
        nonvowel_stem = re.sub(u"ь$", "", nom_sg)
        vowel_stem = re.sub(u"в$", u"ў", re.sub(u"[аяуюыі]́?$", "", gen_sg))
        if re.search(be.vowel_c + AC + "?$", vowel_stem):
          vowel_stem += u"й"
        if compare_stems(be.reduce(nonvowel_stem) or "", vowel_stem):
          return "*"
        nom_sg = re.sub(u"[йь]$", "", nom_sg)
        nom_sg = re.sub(u"ў$", u"в", nom_sg)
        m = re.search(u"([аяуюыі]́?)$", gen_sg)
        if not m:
          pagemsg("WARNING: Unrecognized genitive singular ending: %s" % gen_sg)
          return None
        ending = m.group(1)
        if be.is_accented(ending):
          nom_sg = be.remove_accents(nom_sg)
        nom_sg += ending
        if (be.destress_vowels_after_stress_movement(nom_sg) ==
            be.undo_mark_stressed_vowels_in_unstressed_syllables(gen_sg)):
          return ""
        pagemsg("WARNING: Unable to determine relationship between nom_sg %s and gen_sg %s" % (
          orig_nom_sg, gen_sg))
        return None
    def construct_defaulted_seen_patterns(seen_patterns, lemma, gender, reducible):
      # Combine observed accent patterns with the reducible spec(s),
      # dropping a pattern letter when it equals the module default.
      defaulted_seen_patterns = []
      # Canonical ordering of pattern pairs.
      if seen_patterns == ["b", "c"]:
        seen_patterns = ["c", "b"]
      elif seen_patterns == ["b", "d"]:
        seen_patterns = ["d", "b"]
      if len(seen_patterns) > 1 and "," in reducible:
        pagemsg("WARNING: Multiple accent patterns %s and reducible specs %s, not taking Cartesian product" % (
          ",".join(seen_patterns), reducible))
        reducible = ""
      for pattern in seen_patterns:
        for red in reducible.split(","):
          defstress = default_stress(lemma, gender, red)
          if defstress == pattern:
            if len(seen_patterns) > 1:
              defaulted_seen_patterns.append(pattern + red)
            else:
              # Pattern is the default: the spec only needs the
              # reducible part.
              defaulted_seen_patterns.append(red)
          else:
            defaulted_seen_patterns.append(pattern + red)
      return ",".join(defaulted_seen_patterns)
    if tn == "be-decl-noun":
      # Full singular+plural table: params 1..14 = nom/gen/dat/acc/ins/loc
      # sg and pl interleaved (odd = sg, even = pl).
      check_multi_stressed(14)
      nom_sg = fetch("1")
      gen_sg = fetch("3")
      gen_sg_end_stressed = param_is_end_accented(gen_sg)
      dat_sg = fetch("5")
      dat_sg_end_stressed = param_is_end_accented(dat_sg, dative_singular_endings)
      acc_sg = fetch("7")
      acc_sg_end_stressed = param_is_end_accented(acc_sg)
      ins_sg = fetch("9")
      ins_sg_end_stressed = param_is_end_accented(ins_sg, instrumental_singular_endings)
      loc_sg = fetch("11")
      loc_sg_end_stressed = param_is_end_accented(loc_sg, locative_singular_endings)
      nom_pl = fetch("2")
      nom_pl_end_stressed = param_is_end_accented(nom_pl)
      gen_pl = fetch("4")
      gen_pl_end_stressed = param_is_end_accented(gen_pl)
      acc_pl = fetch("8")
      acc_pl_end_stressed = param_is_end_accented(acc_pl)
      ins_pl = fetch("10")
      ins_pl_end_stressed = param_is_end_accented(ins_pl, instrumental_plural_endings)
      loc_pl = fetch("12")
      loc_pl_end_stressed = param_is_end_accented(loc_pl)
      if (gen_sg_end_stressed == "unknown" or
          acc_sg_end_stressed == "unknown" or
          nom_pl_end_stressed == "unknown" or
          loc_pl_end_stressed == "unknown"):
        pagemsg("WARNING: Missing stresses, can't determine accent pattern: %s" %
          unicode(t))
        continue
      # Collect every accent pattern consistent with the four diagnostic
      # cells (ins sg, acc sg, nom pl, loc pl).
      seen_patterns = []
      for pattern, accents in accent_patterns:
        if (matches(ins_sg_end_stressed, accents["inssg"]) and
            matches(acc_sg_end_stressed, accents["accsg"]) and
            matches(nom_pl_end_stressed, accents["nompl"]) and
            matches(loc_pl_end_stressed, accents["locpl"])):
          seen_patterns.append(pattern)
      if "a" in seen_patterns and "b" in seen_patterns:
        # If a and b apply, most others can apply as well
        seen_patterns = ["a", "b"]
      elif "a" in seen_patterns and "c" in seen_patterns:
        # If a and c apply, e can apply as well
        seen_patterns = ["a", "c"]
      elif "a" in seen_patterns and "d" in seen_patterns:
        # If a and d apply, d' can apply as well
        seen_patterns = ["a", "d"]
      elif "b" in seen_patterns and "d" in seen_patterns:
        # If b and d apply, f can apply as well
        seen_patterns = ["b", "d"]
      gen_sg_endings = fetch_endings("3", genitive_singular_endings)
      dat_sg_endings = fetch_endings("5", dative_singular_endings)
      ins_sg_endings = fetch_endings("9", instrumental_singular_endings)
      loc_sg_endings = fetch_endings("11", locative_singular_endings)
      nom_pl_endings = fetch_endings("2", nominative_plural_endings)
      gen_pl_endings = fetch_endings("4", genitive_plural_endings)
      if not heads:
        pagemsg("WARNING: No head found")
        heads = [pagetitle]
      # One big machine-readable summary line (tab-separated fields then a
      # wiki-table fragment).
      pagemsg("%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tgen_pl:%s\tnumber:both\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tnom_pl:%s\tgen_pl:%s\t| %s || \"?\" || %s || %s || %s || %s || %s || %s|| " % (
        "/".join(heads), gender, animacy, ":".join(seen_patterns),
        stress(gen_sg_end_stressed), stress(dat_sg_end_stressed),
        stress(loc_sg_end_stressed), stress(gen_pl_end_stressed),
        gen_sg_endings, dat_sg_endings, loc_sg_endings, nom_pl_endings,
        gen_pl_endings, canon(nom_sg), canon(gen_sg), canon(loc_sg),
        canon(nom_pl), canon(gen_pl), canon(ins_pl), ins_sg_note(ins_sg)))
      if len(heads) > 1:
        pagemsg("WARNING: Multiple heads, not inferring declension: %s" %
          ",".join(heads))
        continue
      if gender == "unknown" or animacy == "unknown":
        pagemsg("WARNING: Unknown gender or animacy, not inferring declension")
        continue
      defan = infer_animacy(nom_pl, gen_pl, acc_pl)
      if not (defan == "in" and animacy == "in" or
          defan == "an" and animacy in ["pr", "anml"]):
        pagemsg("WARNING: Inferred animacy %s != explicit animacy %s, not inferring declension" % (
          defan, animacy))
        continue
      # Build the declension spec: only non-default pieces are appended.
      lemma = heads[0]
      parts = []
      defg = infer_gender(lemma)
      if gender != defg:
        parts.append(gender)
      alternation = infer_alternations(nom_sg, nom_pl)
      def apply_alternations(form):
        # Apply the detected vowel alternation to every variant of a form,
        # leaving variants untouched where it doesn't apply.
        forms = re.split(", *", form)
        forms = [be.apply_vowel_alternation(form, alternation) or form
          for form in forms]
        return ", ".join(forms)
      nom_sg = apply_alternations(nom_sg)
      reducible = infer_reducible(nom_sg, gen_sg, gen_pl, gender, seen_patterns) or ""
      defaulted_seen_patterns = construct_defaulted_seen_patterns(
        seen_patterns, lemma, gender, reducible)
      if defaulted_seen_patterns:
        parts.append(defaulted_seen_patterns)
      if animacy != "in":
        parts.append(animacy)
      if alternation in ["ae", "ao", "yo"]:
        parts.append(alternation)
      if gender == "M":
        # Masculine gen sg in -у/-ю needs an explicit indicator.
        if re.search(u"у́?$", gen_sg):
          parts.append("genu")
        elif re.search(u"ю́?$", gen_sg):
          parts.append("genju")
      pagemsg("Inferred declension %s<%s>" % (lemma, ".".join(parts)))
    elif tn == "be-decl-noun-unc":
      # Singular-only table: params 1..7 = nom/gen/dat/acc/ins/loc sg.
      check_multi_stressed(7)
      nom_sg = fetch("1")
      gen_sg = fetch("2")
      gen_sg_end_stressed = param_is_end_accented(gen_sg)
      dat_sg = fetch("3")
      dat_sg_end_stressed = param_is_end_accented(dat_sg, dative_singular_endings)
      acc_sg = fetch("4")
      acc_sg_end_stressed = param_is_end_accented(acc_sg)
      ins_sg = fetch("5")
      ins_sg_end_stressed = param_is_end_accented(ins_sg, instrumental_singular_endings)
      loc_sg = fetch("6")
      loc_sg_end_stressed = param_is_end_accented(loc_sg, locative_singular_endings)
      if (gen_sg_end_stressed == "unknown" or
          acc_sg_end_stressed == "unknown"):
        pagemsg("WARNING: Missing stresses, can't determine accent pattern: %s" %
          unicode(t))
        continue
      if not heads:
        pagemsg("WARNING: No head found")
        heads = [pagetitle]
      lemma = heads[0]
      # Singular-only: only patterns a and b (or d for vowel-ending
      # lemmas) are distinguishable.
      seen_patterns = []
      for pattern, accents in accent_patterns:
        if pattern not in ["a", "d" if re.search(u"[аяеёо]́?$", lemma) else "b"]:
          continue
        if (matches(ins_sg_end_stressed, accents["inssg"]) and
            matches(acc_sg_end_stressed, accents["accsg"])):
          seen_patterns.append(pattern)
      if "a" in seen_patterns and "b" in seen_patterns:
        seen_patterns = ["a", "b"]
      if "a" in seen_patterns and "d" in seen_patterns:
        seen_patterns = ["a", "d"]
      gen_sg_endings = fetch_endings("2", genitive_singular_endings)
      dat_sg_endings = fetch_endings("3", dative_singular_endings)
      ins_sg_endings = fetch_endings("5", instrumental_singular_endings)
      loc_sg_endings = fetch_endings("6", locative_singular_endings)
      pagemsg("%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tgen_pl:-\tnumber:sg\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tnom_pl:-\tgen_pl:-\t| %s || \"?\" || %s || %s || - || - || - || %s|| " % (
        "/".join(heads), gender, animacy, ":".join(seen_patterns),
        stress(gen_sg_end_stressed), stress(dat_sg_end_stressed),
        stress(loc_sg_end_stressed), gen_sg_endings, dat_sg_endings,
        loc_sg_endings, canon(nom_sg), canon(gen_sg), canon(loc_sg),
        ins_sg_note(ins_sg)))
      if len(heads) > 1:
        pagemsg("WARNING: Multiple heads, not inferring declension: %s" %
          ",".join(heads))
        continue
      if gender == "unknown" or animacy == "unknown":
        pagemsg("WARNING: Unknown gender or animacy, not inferring declension")
        continue
      parts = []
      defg = infer_gender(lemma)
      if gender != defg:
        parts.append(gender)
      reducible = infer_reducible(nom_sg, gen_sg, None, gender, seen_patterns) or ""
      defaulted_seen_patterns = construct_defaulted_seen_patterns(
        seen_patterns, lemma, gender, reducible)
      if defaulted_seen_patterns:
        parts.append(defaulted_seen_patterns)
      if animacy != "in":
        parts.append(animacy)
      parts.append("sg")
      # genu/genju only flagged for capitalized masculines (proper nouns)
      # in the singular-only case.
      if gender == "M" and re.search("^" + be.uppercase_c, lemma):
        if re.search(u"у́?$", gen_sg):
          parts.append("genu")
        elif re.search(u"ю́?$", gen_sg):
          parts.append("genju")
      pagemsg("Inferred declension %s<%s>" % (lemma, ".".join(parts)))
    elif tn == "be-decl-noun-pl":
      # Plural-only table: params 1..7 = nom/gen/dat/acc/ins/loc pl.
      check_multi_stressed(7)
      nom_pl = fetch("1")
      nom_pl_end_stressed = param_is_end_accented(nom_pl)
      gen_pl = fetch("2")
      gen_pl_end_stressed = param_is_end_accented(gen_pl)
      ins_pl = fetch("5")
      ins_pl_end_stressed = param_is_end_accented(ins_pl, instrumental_plural_endings)
      loc_pl = fetch("6")
      loc_pl_end_stressed = param_is_end_accented(loc_pl)
      if (nom_pl_end_stressed == "unknown" or
          loc_pl_end_stressed == "unknown"):
        pagemsg("WARNING: Missing stresses, can't determine accent pattern: %s" %
          unicode(t))
        continue
      # Plural-only: only patterns a, b and e are distinguishable.
      seen_patterns = []
      for pattern, accents in accent_patterns:
        if pattern not in ["a", "b", "e"]:
          continue
        if (matches(nom_pl_end_stressed, accents["nompl"]) and
            matches(loc_pl_end_stressed, accents["locpl"])):
          seen_patterns.append(pattern)
      if "a" in seen_patterns and "b" in seen_patterns:
        seen_patterns = ["a", "b"]
      nom_pl_endings = fetch_endings("1", nominative_plural_endings)
      gen_pl_endings = fetch_endings("2", genitive_plural_endings)
      if not heads:
        pagemsg("WARNING: No head found")
        heads = [pagetitle]
      pagemsg("%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:-\tdat_sg:-\tloc_sg:-\tgen_pl:%s\tnumber:pl\tgen_sg:-\tdat_sg:-\tloc_sg:-\tnom_pl:%s\tgen_pl:%s\t| %s || \"?\" || - || - || %s || %s || %s || || " % (
        "/".join(heads), gender, animacy, ":".join(seen_patterns),
        stress(gen_pl_end_stressed), nom_pl_endings, gen_pl_endings,
        canon(nom_pl), canon(nom_pl), canon(gen_pl), canon(ins_pl)))
def process_text_on_page(index, pagetitle, text):
  """Clean up Sanskrit noun declension templates on one page.

  Pairs each {{sa-noun}} headword with the {{sa-decl-noun-*}} template(s)
  following it, then: fills in or repairs the declension's transliteration
  param (preferring an accented translit from the headword, falling back to
  {{xlit}} of the page title), strips hyphens and stray spaces from
  translits, replaces Devanagari mistakenly placed in the translit param,
  and converts old gendered templates (via old_template_to_gender) to the
  new {{sa-decl-noun-<g>}} form.  Logs warnings for every inconsistency
  (missing heads, gender mismatches, raw {{sa-decl}}, mono=, etc.).

  Returns (new_text, notes) on a processed mainspace page, or None when the
  page is skipped (no relevant templates, or non-mainspace title).
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  # Cheap pre-filter before parsing the whole page.
  if "sa-noun" not in text and "sa-decl-noun" not in text:
    return
  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  headt = None       # most recent {{sa-noun}} headword template
  saw_decl = False   # did we see a declension for that headword yet?
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "sa-noun":
      pagemsg("Saw headt=%s" % unicode(t))
      if headt and not saw_decl:
        pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" % (
          unicode(headt), unicode(t)))
      headt = t
      saw_decl = False
      continue
    if tn in ["sa-decl-noun", "sa-decl"]:
      pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" % (
        tn, unicode(t), headt and unicode(headt) or None))
      continue
    if tn.startswith("sa-decl-noun-"):
      pagemsg("Saw declt=%s" % unicode(t))
      if not headt:
        pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
        continue
      saw_decl = True
      # Work out the best available transliteration and whether it carries
      # a Vedic accent (acute/grave after NFD decomposition; s+acute is ś,
      # not an accent, so it is masked first).
      tr = getparam(headt, "tr")
      accented_tr = False
      if not tr:
        tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
        pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" % (
          unicode(headt), tr, unicode(t)))
      else:
        if "-" in tr:
          pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" % (
            tr, unicode(headt), unicode(t)))
          tr = tr.replace("-", "")
        decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
        if AC not in decomptr and GR not in decomptr:
          pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" % (
            tr, unicode(headt), unicode(t)))
        else:
          accented_tr = True
      # Normalize headword genders to single letters, expanding combined
      # codes (mf, mn) and stripping plural/bysense markers.
      genders = blib.fetch_param_chain(headt, "g")
      genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
      genders = [g for gs in genders for g in (
        ["m", "f"] if gs in ["mf", "fm"] else
        ["m", "n"] if gs in ["mn", "nm"] else [gs])]
      if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
        # New-style gendered template: param 1 should hold the translit.
        tg = tn[-1]
        if tg not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            tg, ",".join(genders), unicode(headt), unicode(t)))
          continue
        decltr = getparam(t, "1")
        if not decltr:
          if not accented_tr:
            pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (
              tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
          else:
            pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (
              tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add accented translit %s to {{%s}}" % (tr, tn))
        elif re.search(u"[\u0900-\u097F]", decltr):
          # Param 1 is actually Devanagari, not a translit.
          # FIX: both notes below formerly interpolated % (tr, tn) into a
          # template-name-first format string, swapping the two values.
          if not accented_tr:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (
              tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
          else:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (
              tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
        else:
          # Existing translit: upgrade to the accented head translit if
          # possible, then strip hyphens and surrounding whitespace.
          decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
          subbed = False
          if AC not in decompdecltr and GR not in decompdecltr:
            if accented_tr:
              pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s" % (
                decltr, tr, unicode(headt), unicode(t)))
              t.add("1", tr)
              notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (
                decltr, tr, tn))
              subbed = True
            else:
              pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s" % (
                decltr, unicode(headt), unicode(t)))
          if not subbed and "-" in decltr:
            pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" % (
              decltr, unicode(headt), unicode(t)))
            notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
            decltr = decltr.replace("-", "")
            t.add("1", decltr)
            subbed = True
          stripped_decltr = decltr.strip()
          if "\n" not in decltr and stripped_decltr != decltr:
            pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" % (
              decltr, unicode(headt), unicode(t)))
            notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (
              decltr, tn))
            decltr = stripped_decltr
            t.add("1", decltr)
            subbed = True
        continue
      if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
        # mono= has no equivalent in the new templates; leave untouched.
        pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" % (
          unicode(headt), unicode(t)))
        continue
      if tn in old_template_to_gender:
        # Old-style template: verify it only carries known params, check
        # its implied gender against the headword, then rename and rebuild
        # it with the translit as sole param.
        must_continue = False
        for param in t.params:
          pn = pname(param)
          if pn not in ["1", "2", "3", "4", "n"]:
            pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" % (
              pn, unicode(param.value), unicode(t), unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        g = old_template_to_gender[tn]
        if g not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            g, ",".join(genders), unicode(headt), unicode(t)))
          continue
        blib.set_template_name(t, "sa-decl-noun-%s" % g)
        rmparam(t, "n")
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        t.add("1", tr)
        notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
      else:
        pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  # FIX: formerly `if headt:`, which warned even when the final headword
  # did have a declension; mirror the in-loop check.
  if headt and not saw_decl:
    pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))
  return unicode(parsed), notes
if not headword_template: pagemsg("WARNING: Can't find headword template, skipping") return pagemsg("Found headword template: %s" % unicode(headword_template)) headword_is_proper = unicode(headword_template.name) == "ru-proper noun" if getparam( headword_template, "3" ) == "-" or "[[Category:Russian indeclinable nouns]]" in page.text: pagemsg("WARNING: Indeclinable noun, skipping") return headword_trs = blib.fetch_param_chain(headword_template, "tr", "tr") if headword_trs: pagemsg("WARNING: Found headword manual translit, skipping: %s" % ",".join(headword_trs)) return headword = getparam(headword_template, "1") for badparam in ["head2", "gen2", "pl2"]: val = getparam(headword_template, badparam) if val: pagemsg( "WARNING: Found extra param, can't handle, skipping: %s=%s" % (badparam, val)) return # Here we use a capturing split, and treat what we want to capture as
def process_text_on_page(index, pagetitle, text):
  """Clean up Spanish noun and adjective headword templates on one page.

  Driven by command-line flags in the global `args`:

  * args.remove_redundant_noun_args: in {{es-noun}}, replace plurals and
    masculine/feminine equivalents that match what the headword module
    would generate by default with '+' (or a '+<special>' indicator), and
    drop redundant explicit m/f plurals.
  * args.make_multiword_plural_explicit: for multiword {{es-noun}} lemmas
    (and m=/f= values with spaces), expand the old default plural via
    {{#invoke:es-headword|make_plural_noun}} and store it explicitly.
  * Converts the old adjective template (old_adj_template) to the new
    {{es-adj}}/{{es-adj-inv}} format, collapsing default feminines and
    plurals to '+', lemma-identical forms to '#', and whole default
    paradigms to sp=<special>.

  Returns (new_text, notes), or None when the page is skipped.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  # Cheap pre-filter before parsing the whole page.
  if old_adj_template not in text and "es-noun" not in text:
    return
  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "es-noun" and args.remove_redundant_noun_args:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      if not getparam(t, "2") and (getparam(t, "pl2") or getparam(t, "pl3")):
        pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" % unicode(t))
        continue
      g = getparam(t, "1")
      # Multiword m=/f= without explicit plurals fell under the old
      # default pluralization algorithm; just warn.
      ms = blib.fetch_param_chain(t, "m", "m")
      space_in_m = False
      for m in ms:
        if " " in m:
          space_in_m = True
      mpls = blib.fetch_param_chain(t, "mpl", "mpl")
      if space_in_m and not mpls and not g.endswith("-p"):
        pagemsg("WARNING: Space in m=%s and old default noun algorithm applying" %
          ",".join(ms))
      fs = blib.fetch_param_chain(t, "f", "f")
      space_in_f = False
      for f in fs:
        if " " in f:
          space_in_f = True
      fpls = blib.fetch_param_chain(t, "fpl", "fpl")
      if space_in_f and not fpls and not g.endswith("-p"):
        pagemsg("WARNING: Space in f=%s and old default noun algorithm applying" %
          ",".join(fs))
      pls = blib.fetch_param_chain(t, "2", "pl")
      if not pls and not g.endswith("-p"):
        # No explicit plurals to clean up.
        if " " in lemma:
          pagemsg("WARNING: Space in headword and old default noun algorithm applying")
        continue
      # Compare explicit plurals against the default; pls_with_def is the
      # explicit list with default entries collapsed to "+".
      pls_with_def = []
      defpl = make_plural(lemma)
      if not defpl:
        continue
      if len(defpl) > 1:
        if set(pls) == set(defpl):
          pls_with_def = ["+"]
        elif set(pls) < set(defpl):
          pagemsg("WARNING: pls=%s subset of defpls=%s, replacing with default" % (
            ",".join(pls), ",".join(defpl)))
          pls_with_def = ["+"]
        else:
          pls_with_def = pls
      else:
        for pl in pls:
          if pl == defpl[0]:
            pls_with_def.append("+")
          else:
            pls_with_def.append(pl)
      # Check whether the plurals match one of the "special" pluralization
      # modes (e.g. first-word-only) instead.
      actual_special = None
      for special in all_specials:
        special_pl = make_plural(lemma, special)
        if special_pl is None:
          continue
        if len(special_pl) > 1 and set(pls) < set(special_pl):
          pagemsg("WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing" % (
            special, ",".join(pls), ",".join(special_pl)))
          actual_special = special
          break
        if set(pls) == set(special_pl):
          pagemsg("Found special=%s with special_pl=%s" % (
            special, ",".join(special_pl)))
          actual_special = special
          break
      if pls_with_def == ["+"]:
        notes.append("remove redundant plural%s %s from {{es-noun}}" % (
          "s" if len(pls) > 1 else "", ",".join(pls)))
        blib.remove_param_chain(t, "2", "pl")
      elif actual_special:
        notes.append("replace plural%s %s with +%s in {{es-noun}}" % (
          "s" if len(pls) > 1 else "", ",".join(pls), actual_special))
        blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
      elif pls_with_def != pls:
        notes.append("replace default plural %s with '+' in {{es-noun}}" %
          ",".join(defpl))
        blib.set_param_chain(t, pls_with_def, "2", "pl")
      def handle_mf(mf, mf_full, make_mf):
        # Collapse a redundant m=/f= chain (and its plurals) to '+' or a
        # '+<special>' indicator when it matches what make_mf/make_plural
        # would generate.  Closes over t, lemma, notes.
        mfs = blib.fetch_param_chain(t, mf, mf)
        mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
        if mfs and not any(x.startswith("+") for x in mfs):
          defmf = make_mf(lemma)
          if set(mfs) == {defmf}:
            defpls = make_plural(defmf)
            ok = False
            if not mfpls or set(mfpls) == set(defpls):
              ok = True
            elif set(mfpls) < set(defpls):
              # FIX: format string formerly began "WARNING: %pl=%s ..."
              # — '%p' is an invalid conversion and raised ValueError
              # whenever this branch was reached.
              pagemsg("WARNING: %spl=%s subset of default=%s, allowing" % (
                mf, ",".join(mfpls), ",".join(defpls)))
              ok = True
            if ok:
              notes.append("replace %s=%s with '+' in {{es-noun}}" % (
                mf, ",".join(mfs)))
              blib.set_param_chain(t, ["+"], mf, mf)
              blib.remove_param_chain(t, mf + "pl", mf + "pl")
              return
          actual_special = None
          for special in all_specials:
            special_mf = make_mf(lemma, special)
            if special_mf is None:
              continue
            if mfs == [special_mf]:
              pagemsg("Found special=%s with special_mf=%s" % (special, special_mf))
              actual_special = special
              break
          if actual_special:
            if not mfpls:
              pagemsg("WARNING: Explicit %s=%s matches special=%s but no %s plural" % (
                mf, ",".join(mfs), actual_special, mf_full))
            else:
              special_mfpl = make_plural(special_mf, actual_special)
              if special_mfpl:
                if len(special_mfpl) > 1 and set(mfpls) < set(special_mfpl):
                  pagemsg("WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls),
                    mf, ",".join(special_mfpl)))
                elif set(mfpls) == set(special_mfpl):
                  pagemsg("Found %s=%s and special=%s, %spls=%s matches special_%spl" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf))
                else:
                  pagemsg("WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls),
                    mf, ",".join(special_mfpl)))
                  actual_special = None
            if actual_special:
              notes.append("replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural" % (
                mf_full, ",".join(mfs), actual_special, mf_full))
              blib.set_param_chain(t, ["+%s" % actual_special], mf, mf)
              blib.remove_param_chain(t, mf + "pl", mf + "pl")
          if not actual_special:
            # No special mode: collapse individual default values to '+'
            # and drop/replace redundant explicit plurals.
            defmf = make_mf(lemma)
            mfs_with_def = ["+" if x == defmf else x for x in mfs]
            if mfs_with_def != mfs:
              notes.append("replace default %s %s with '+' in {{es-noun}}" % (
                mf_full, defmf))
              blib.set_param_chain(t, mfs_with_def, mf, mf)
            if mfpls:
              defpl = [x for y in mfs for x in (make_plural(y) or [])]
              ok = False
              if set(defpl) == set(mfpls):
                ok = True
              elif len(defpl) > 1 and set(mfpls) < set(defpl):
                pagemsg("WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing" % (
                  mf, ",".join(mfs), mf, ",".join(mfpls), ",".join(defpl)))
                ok = True
              if ok:
                pagemsg("Found %s=%s, %spl=%s matches default pl" % (
                  mf, ",".join(mfs), mf, ",".join(mfpls)))
                notes.append("remove redundant explicit %s plural %s in {{es-noun}}" % (
                  mf_full, ",".join(mfpls)))
                blib.remove_param_chain(t, mf + "pl", mf + "pl")
              else:
                for special in all_specials:
                  defpl = [x for y in mfs for x in (make_plural(y, special) or [])]
                  if set(defpl) == set(mfpls):
                    pagemsg("Found %s=%s, %spl=%s matches special=%s" % (
                      mf, ",".join(mfs), mf, ",".join(mfpls), special))
                    notes.append("replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}" % (
                      mf_full, ",".join(mfpls), special))
                    blib.set_param_chain(t, ["+%s" % special], mf + "pl", mf + "pl")
      handle_mf("f", "feminine", make_feminine)
      handle_mf("m", "masculine", make_masculine)
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      else:
        pagemsg("No changes to %s" % unicode(t))
    if tn == "es-noun" and args.make_multiword_plural_explicit:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
      if " " in lemma and not getparam(t, "2"):
        g = getparam(t, "1")
        if not g.endswith("-p"):
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" % (lemma, g))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string")
            continue
          plurals = explicit_pl.split(",")
          blib.set_param_chain(t, plurals, "2", "pl")
          notes.append("add explicit plural to multiword noun")
      ms = blib.fetch_param_chain(t, "m", "m")
      space_in_m = False
      for m in ms:
        if " " in m:
          space_in_m = True
      mpls = blib.fetch_param_chain(t, "mpl", "mpl")
      if space_in_m and not mpls:
        mpls = []
        for m in ms:
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" % (blib.remove_links(m)))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string" % m)
            continue
          this_mpls = explicit_pl.split(",")
          mpls.extend(this_mpls)
        blib.set_param_chain(t, mpls, "mpl", "mpl")
        notes.append("add explicit plural to m=%s" % ",".join(ms))
      fs = blib.fetch_param_chain(t, "f", "f")
      space_in_f = False
      for f in fs:
        if " " in f:
          space_in_f = True
      # (a duplicated fetch of fpl before this loop was removed)
      fpls = blib.fetch_param_chain(t, "fpl", "fpl")
      if space_in_f and not fpls:
        fpls = []
        for f in fs:
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" % (blib.remove_links(f)))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string" % f)
            continue
          this_fpls = explicit_pl.split(",")
          fpls.extend(this_fpls)
        blib.set_param_chain(t, fpls, "fpl", "fpl")
        notes.append("add explicit plural to f=%s" % ",".join(fs))
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    if tn == old_adj_template:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      deff = make_feminine(pagetitle)
      defmpl = make_plural(pagetitle)
      # Collapse feminines: default -> '+', lemma-identical -> '#'.
      fs = []
      fullfs = []
      f = getparam(t, "f") or pagetitle
      fullfs.append(f)
      if f == deff:
        f = "+"
      elif f == lemma:
        f = "#"
      fs.append(f)
      f2 = getparam(t, "f2")
      if f2:
        fullfs.append(f2)
        if f2 == deff:
          # FIX: formerly `f2 == "+"`, a no-op comparison — a default
          # second feminine was never collapsed to '+'.
          f2 = "+"
        fs.append(f2)
      mpls = []
      mpl = getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
      mpls.append(mpl)
      mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
      if mpl2:
        mpls.append(mpl2)
      fullmpls = mpls
      # should really check for subsequence but it never occurs
      if set(mpls) == set(defmpl):
        mpls = ["+"]
      elif set(mpls) < set(defmpl):
        pagemsg("WARNING: mpls=%s subset of defmpl=%s, replacing with default" % (
          ",".join(mpls), ",".join(defmpl)))
        mpls = ["+"]
      mpls = ["#" if x == lemma else x for x in mpls]
      deffpl = [x for f in fullfs for x in make_plural(f)]
      fpls = []
      fpl = getparam(t, "fpl") or getparam(t, "pl") or (getparam(t, "f") or pagetitle) + "s"
      fpls.append(fpl)
      fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
      if fpl2:
        fpls.append(fpl2)
      fullfpls = fpls
      # should really check for subsequence but it never occurs
      if set(fpls) == set(deffpl):
        fpls = ["+"]
      elif set(fpls) < set(deffpl):
        pagemsg("WARNING: fpls=%s subset of deffpl=%s, replacing with default" % (
          ",".join(fpls), ",".join(deffpl)))
        fpls = ["+"]
      fpls = ["#" if x == lemma else x for x in fpls]
      # Does the whole paradigm match one of the special modes?
      actual_special = None
      for special in all_specials:
        deff = make_feminine(pagetitle, special)
        if deff is None:
          continue
        defmpl = make_plural(pagetitle, special)
        deffpl = make_plural(deff, special)
        deff = [deff]
        if fullfs == deff and fullmpls == defmpl and fullfpls == deffpl:
          actual_special = special
          break
      head = getparam(t, "head")
      must_continue = False
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn == "1" and pv in ["m", "mf"]:
          pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" % (
            pn, pv, unicode(t)))
          continue
        if pn not in ["head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl", "fpl2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s in %s" % (
            pn, pv, unicode(t)))
          must_continue = True
          break
      if must_continue:
        continue
      # Rebuild the template from scratch in the new format.
      del t.params[:]
      if head:
        t.add("head", head)
      if fullfs == [pagetitle] and fullmpls == [pagetitle] and fullfpls == [pagetitle]:
        blib.set_template_name(t, "es-adj-inv")
      else:
        blib.set_template_name(t, "es-adj")
        if actual_special:
          t.add("sp", actual_special)
        else:
          if fs != ["+"]:
            blib.set_param_chain(t, fs, "f", "f")
          if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
            # masc and fem pl the same
            if mpls != ["+"]:
              blib.set_param_chain(t, mpls, "pl", "pl")
          else:
            if mpls != ["+"]:
              blib.set_param_chain(t, mpls, "mpl", "mpl")
            if fpls != ["+"]:
              blib.set_param_chain(t, fpls, "fpl", "fpl")
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("convert {{%s}} to new {{%s}} format" % (
          old_adj_template, tname(t)))
      else:
        pagemsg("No changes to %s" % unicode(t))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert {{it-IPA}} plus surrounding Pronunciation-section lines on an
  Italian page into a single consolidated {{it-pr}} call.

  Scans the Italian section of `text` for ==Pronunciation== subsections,
  parses the {{it-IPA}} line along with any {{rhyme}}, {{hyph}}, {{audio}},
  {{homophone}}, {{rfap}} and {{wikipedia}} lines, and folds redundant or
  explicit data into inline <ref:>/<qual:>/<audio:>/<rhyme:>/<hyph:>/<hmp:>
  modifiers on the {{it-pr}} arguments. Rhymes/hyphenations that {{it-pr}}
  would auto-generate are dropped rather than carried over.

  Parameters:
    index: numeric page index (used only for log messages).
    pagetitle: page title (unicode).
    text: full wikitext of the page.

  Returns (newtext, notes) on success, or None (bare return) when no
  modifiable Italian section is found. `notes` is a list of change
  descriptions for the edit summary.

  NOTE(review): relies on many module-level helpers not visible here
  (program_args, acute_to_grave, pron_sign_c, normalize_bare_arg,
  adjust_initial_capital, syllabify_from_spelling, remove_accents, TIE, C,
  etc.); their exact semantics are assumed from usage.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    # Expand a template call server-side (used for to_phonemic_bot below).
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)

  def verify_template_is_full_line(tn, line):
    """Return the template object if `line` consists of exactly one template
    whose name is `tn` (or is in the list `tn`); otherwise warn and return
    None. Used to make sure we only consume lines we fully understand."""
    templates = list(blib.parse_text(line).filter_templates())
    if type(tn) is list:
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" % (
        tntext, tntext, line))
      return None
    if unicode(t) != line:
      # Extra text on the line (qualifiers etc.) that we can't safely fold in.
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian",
    pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Split on L2+ headers; even indices are section bodies, odd indices the
  # headers themselves (capturing split).
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  # Index of the section body where a stray {{wikipedia}} line should be
  # moved (top of the current etym section, else top of the language section).
  sect_for_wiki = 0
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]):
      sect_for_wiki = k + 1
    elif re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_it_IPA = 0
      saw_it_pr = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["it-pr", "it-pronunciation"]:
          saw_it_pr = True
          break
        if tn == "it-IPA":
          num_it_IPA += 1
      if saw_it_pr:
        # Already converted; nothing to do.
        pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t))
        continue
      if num_it_IPA == 0:
        pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping")
        continue
      if num_it_IPA > 1:
        # Multiple {{it-IPA}} (e.g. per-etymology pronunciations) are too
        # ambiguous to merge automatically.
        pagemsg("WARNING: Saw multiple {{it-IPA}} in Pronunciation section, skipping")
        continue
      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rfap_lines = []
      rhyme_lines = []
      must_continue = False
      audioarg = ""
      args = []          # final {{it-pr}} arguments, with inline modifiers
      bare_args = []     # raw respellings from {{it-IPA}}
      args_for_hyph = [] # respellings usable for auto-hyphenation
      lines_so_far = []
      for lineind, line in enumerate(lines):
        origline = line
        lines_so_far.append(line)
        # In case of "* {{it-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line)
        if line.startswith("{{it-IPA"):
          if args:
            pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline)
            must_continue = True
            break
          outer_ref_arg = None
          # A trailing <ref>...</ref> outside the template gets folded into
          # the last argument below.
          m = re.search("^(.*?) *<ref>(.*?)</ref>$", line)
          if m:
            line, outer_ref_arg = m.groups()
          ipat = verify_template_is_full_line("it-IPA", line)
          if ipat is None:
            must_continue = True
            break
          bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"]
          # "+" means "same as pagetitle" in {{it-pr}}.
          bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args]
          bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args]
          # Italian spells stress with grave accents; normalize acute ones.
          bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args]
          normalized_bare_args = [
            normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline)))
            for arg in bare_args
          ]
          if None in normalized_bare_args:
            must_continue = True
            break
          args = [x for x in bare_args]

          args_for_hyph = []
          for arg in normalized_bare_args:
            # Undo pronunciation-only respelling tricks so the result can be
            # compared against the page title for hyphenation purposes.
            hypharg = (
              arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z")
              .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z")
            )
            hypharg = re.sub(pron_sign_c, "", hypharg)
            putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", ""))
            putative_pagetitle = remove_non_final_accents(putative_pagetitle)
            # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized
            # pronunciation for hyphenation. If a word in the page title is a single syllable, it may or may not
            # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation
            # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want
            # pronunciation rè to match page title ré or vice versa.)
            if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle:
              args_for_hyph.append(hypharg)

          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              continue
            m = re.search("^(ref|qual)([0-9]*)$", pn)
            if m:
              # refN=/qualN= become inline <ref:>/<qual:> on argument N.
              parampref, argnum = m.groups()
              argnum = int(argnum or "1") - 1
              if argnum >= len(args):
                pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % (
                  pn, pv, origline))
                must_continue = True
                break
              args[argnum] += "<%s:%s>" % (parampref, pv)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if outer_ref_arg:
            if "<ref:" in args[-1]:
              pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s" % (
                outer_ref_arg, args[-1], origline))
              must_continue = True
              break
            else:
              args[-1] += "<ref:%s>" % outer_ref_arg
              extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}")
          continue
        if line.startswith("{{rfap"):
          # Normalize bare {{rfap}} so the generic "* " stripping below works.
          line = "* " + line
        if line.startswith("{{wiki"):
          subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki]
          # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we
          # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate
          # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section.
          del lines_so_far[-1]
          subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:])
          notes.append("move {{wikipedia}} line to top of etym section")
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s" % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append("* " + line)
        elif line.startswith("{{homophone"):
          homophone_lines.append("* " + line)
        elif line.startswith("{{rfap"):
          rfap_lines.append(line)
        elif line.startswith("{{audio"):
          audiot = verify_template_is_full_line("audio", line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "it":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if audiogloss in ["Audio", "audio"]:
            # Placeholder gloss carries no information; drop it.
            audiogloss = ""
          if audiogloss:
            audiogloss = ";%s" % audiogloss
          audiopart = "<audio:%s%s>" % (audiofile, audiogloss)
          audioarg += audiopart
          pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart))
          extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot))
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        elif remove_accents(line) == remove_accents(pagetitle):
          pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if must_continue:
        continue

      if rhyme_lines:
        rhyme_error = False
        rhyme_pronuns = []
        # Compute the rhyme {{it-pr}} would auto-generate from each respelling:
        # everything from the last stress mark, minus the leading onset.
        for bare_arg in normalized_bare_args:
          pronun = expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg))
          if not pronun:
            rhyme_error = True
            break
          rhyme_pronun = (
            re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "")
            .replace(".", ""))
          if rhyme_pronun not in rhyme_pronuns:
            rhyme_pronuns.append(rhyme_pronun)
        if not rhyme_error:
          saw_non_matching_rhyme = False
          normalized_rhymes = []
          rhyme_line_text = ", ".join(rhyme_lines)
          normalized_bare_arg_text = ",".join(normalized_bare_args)
          rhyme_pronun_text = ",".join(rhyme_pronuns)
          for rhyme_line in rhyme_lines:
            rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
            if not rhymet:
              break
            if getparam(rhymet, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
              break
            rhymes = []
            must_break = False
            num_syl = ""                  # global s= syllable count modifier
            rhyme_specific_num_syl = []   # per-rhyme sN= syllable counts
            for param in rhymet.params:
              pn = pname(param)
              pv = unicode(param.value)
              if not re.search("^s?[0-9]*$", pn):
                pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (
                  pn, pv, tname(rhymet), rhyme_line))
                must_break = True
                break
              if pn == "s":
                num_syl = "<s:%s>" % pv
              elif pn.startswith("s"):
                rhyme_no = int(pn[1:]) - 1
                rhyme_specific_num_syl.append((rhyme_no, pv))
              elif int(pn) > 1:
                if pv:
                  rhymes.append([pv, ""])
            if must_break:
              break
            for rhyme_no, this_num_syl in rhyme_specific_num_syl:
              if rhyme_no >= len(rhymes):
                pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s" % (
                  rhyme_no + 1, this_num_syl, rhyme_line))
                must_break = True
                break
              rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl
            if must_break:
              break
            for rhyme, this_num_syl in rhymes:
              # Normalize spelling-based rhyme notation toward phonemic form
              # (glides, voicing) before comparing with the auto rhyme.
              normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm")
              normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme)
              this_num_syl = this_num_syl or num_syl
              if this_num_syl and not args_for_hyph and not hyph_lines:
                # Keep the explicit syllable count only when there's no
                # hyphenation for {{it-pr}} to derive it from.
                pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or explicit hyphenation: %s" % (
                  this_num_syl, rhyme, rhyme_line_text))
                saw_non_matching_rhyme = True
                normalized_rhymes.append(normalized_rhyme + this_num_syl)
              else:
                normalized_rhymes.append(normalized_rhyme)
              if rhyme in rhyme_pronuns:
                pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s" % (
                  rhyme, normalized_bare_arg_text, rhyme_line_text))
              elif normalized_rhyme in rhyme_pronuns:
                pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s" % (
                  rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text))
              elif rhyme != normalized_rhyme:
                pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s" % (
                  rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                saw_non_matching_rhyme = True
              else:
                pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s" % (
                  rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                saw_non_matching_rhyme = True
          else: # no break
            if saw_non_matching_rhyme:
              pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s" % (
                ",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
              args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes)
              extra_notes.append("incorporate non-default rhymes into {{it-pr}}")
            else:
              extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}")
            rhyme_lines = []

      if not args:
        pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?")
        continue

      args[-1] += audioarg

      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
        else:
          assert hyph_lines[0].startswith("* ")
          hyph_line = hyph_lines[0][2:]
          hyph_templates = re.split(", *", hyph_line)
          hyphs = []
          for hyph_template in hyph_templates:
            hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template)
            if not hypht:
              break
            syls = []
            if getparam(hypht, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template))
              break
            else:
              must_break = False
              for param in hypht.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn) and pn != "nocaption":
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (
                    pn, pv, tname(hypht), hyph_line))
                  must_break = True
                  break
                if pn != "nocaption" and int(pn) > 1:
                  if not pv:
                    # An empty positional param separates two hyphenations.
                    hyphs.append(syls)
                    syls = []
                  else:
                    syls.append(pv)
              if must_break:
                break
              if syls:
                hyphs.append(syls)
          else: # no break
            if hyphs:
              specified_hyphenations = [".".join(syls) for syls in hyphs]
              specified_hyphenations = [
                re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [
                adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations]
              # eliminate i circumflex in hyphenation
              specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations]
              hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph]
              if set(specified_hyphenations) < set(hyphenations):
                pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s" % (
                  ",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
              elif set(specified_hyphenations) != set(hyphenations):
                # Not identical; try progressively weaker notions of "same"
                # before giving up and keeping the explicit hyphenation.
                hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations]
                rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations]
                def indices_of_syllable_markers(hyph):
                  # Get the character indices of the syllable markers, but not counting the syllable markers themselves
                  # (i.e. return the number of characters preceding the syllable marker).
                  raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."]
                  adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)]
                  return set(adjusted_indices)
                if set(specified_hyphenations) == set(hyphenations_without_accents):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s" % (
                    ",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif set(rehyphenated_specified_hyphenations) == set(hyphenations):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s" % (
                    ",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1
                    and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "")
                    and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s" % (
                    ",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                else:
                  if not hyphenations:
                    pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s" % (
                      ",".join(specified_hyphenations), hyph_line))
                  else:
                    pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s" % (
                      ",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                  args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations)
                  extra_notes.append("incorporate non-default hyphenations into {{it-pr}}")
              else:
                pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line)
                extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}")
              hyph_lines = []

      if homophone_lines:
        if len(homophone_lines) > 1:
          pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines))
        else:
          assert homophone_lines[0].startswith("* ")
          homophone_line = homophone_lines[0][2:]
          homophones = {}            # position -> homophone
          homophone_qualifiers = {}  # position -> qualifier
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if hmpt:
            if getparam(hmpt, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            else:
              for param in hmpt.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^q?[0-9]+$", pn):
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (
                    pn, pv, tname(hmpt), homophone_line))
                  break
                if pn.startswith("q"):
                  homophone_qualifiers[int(pn[1:])] = pv
                elif int(pn) > 1:
                  homophones[int(pn) - 1] = pv
              else: # no break
                hmp_args = []
                for pn, pv in sorted(homophones.items()):
                  hmp_args.append(pv)
                  if pn in homophone_qualifiers:
                    hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn]
                args[-1] += "<hmp:%s>" % ",".join(hmp_args)
                extra_notes.append("incorporate homophones into {{it-pr}}")
                homophone_lines = []

      if args == ["+"]:
        # Sole default argument can be omitted entirely.
        it_pr = "{{it-pr}}"
      else:
        it_pr = "{{it-pr|%s}}" % ",".join(args)
      pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr))

      # Any lines we could not fold in are kept after the new {{it-pr}}.
      all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines)
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes
        notes.extend(this_notes)
        subsections[k + 1] = newsubsec

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Convert Bulgarian noun-form templates to standard ones and add accents.

  Three passes over the page's templates:
  1. Replace {{bg-noun-form}} with {{head|bg|noun form}}, preserving head=
     and gender, supplying head= from the page title when it is
     unambiguously accentable.
  2. Convert {{bg-noun form of}} and language-specific shortcut templates
     ({{inflection of|bg}}, {{definite singular of|bg}}, etc.) to
     {{inflection of}}, looking up the accented lemma and — when the
     inflection slot can be determined — copying the matching accented form
     into the preceding {{head|bg|noun form}}.
  3. Convert the templates listed in template_to_infl_codes to
     {{inflection of}} with the corresponding tag sequence.

  Parameters:
    page: pywikibot page object (title used for messages/accent checks).
    index: numeric page index for log messages.
    parsed: parsed wikitext (template list is mutated in place).

  Returns (newtext, notes) where notes describes the changes made.

  BUGFIX: the slot computed by infls_to_slot() was assigned to a variable
  `saw_infls` that was never read where it mattered; the dispatch below
  tests `saw_infl`, which stayed False, so the accented-form logic (and the
  already-computed `forms`/`already_fetched_forms` state) was dead code.
  Now consistently assigned to `saw_infl`.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        if bglib.needs_accents(pagetitle):
          # Multisyllabic titles need an accent we can't infer.
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" % unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

  # Pass 2: convert inflection templates, tracking the preceding head
  # template so accented forms can be copied into it.
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    saw_infl = False
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      # Translate the legacy positional params into {{inflection of}} tags.
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)
      # was: saw_infls = ... (never read by the `if saw_infl:` dispatch below)
      saw_infl = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infl:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      # was: saw_infls = ... (same dead-variable bug as above)
      saw_infl = infls_to_slot(infls)
      if not saw_infl:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      if not already_fetched_forms:
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      if saw_infl == "def_sg":
        # "def_sg" is ambiguous between subject and object form; only safe
        # when both are identical.
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" % (
          saw_infl, format_forms(forms)))
        continue
      form = form.split(",")
      # Only keep forms that correspond to this page.
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" % unicode(headt))
          continue
        if not any(needs_accents):
          # Heads already accented; verify they agree rather than overwrite.
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))

  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

  # Pass 3: shortcut templates -> {{inflection of}} with fixed tag lists.
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)

  return unicode(parsed), notes
pagemsg("WARNING: Found ru-pre-reform template, skipping") return if not headword_template: pagemsg("WARNING: Can't find headword template, skipping") return pagemsg("Found headword template: %s" % unicode(headword_template)) headword_is_proper = unicode(headword_template.name) == "ru-proper noun" if getparam(headword_template, "3") == "-" or "[[Category:Russian indeclinable nouns]]" in page.text: pagemsg("WARNING: Indeclinable noun, skipping") return headword_trs = blib.fetch_param_chain(headword_template, "tr", "tr") if headword_trs: pagemsg("WARNING: Found headword manual translit, skipping: %s" % ",".join(headword_trs)) return headword = getparam(headword_template, "1") for badparam in ["head2", "gen2", "pl2"]: val = getparam(headword_template, badparam) if val: pagemsg("WARNING: Found extra param, can't handle, skipping: %s=%s" % ( badparam, val)) return # Here we use a capturing split, and treat what we want to capture as # the splitting text, backwards from what you'd expect. The separators
def process_page(page, index, parsed):
  """Convert old-style {{la-noun}}/{{la-proper noun}} headword params to the
  new combined lemma<decl> notation (e.g. "aqua<1>", "rex/reg<3>").

  For each Latin noun headword template with old-style params (explicit
  genitive/gender/declension), builds the new-style first parameter from the
  lemma and declension, verifies the genitive against forms generated by
  {{la-ndecl}}, decides whether an explicit gender is still needed, then
  rewrites the template in place, carrying over any unrelated params.

  Parameters:
    page: pywikibot page object.
    index: numeric page index for log messages.
    parsed: parsed wikitext (templates mutated in place).

  Returns (newtext, notes).
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  pagemsg("Processing")

  notes = []

  for t in parsed.filter_templates():
    tn = tname(t)
    if tn not in ["la-noun", "la-proper noun"]:
      continue
    origt = unicode(t)
    def render_headword():
      # Marker format used in the conversion logs.
      return "headword template <from> %s <to> %s <end>" % (origt, origt)
    if getparam(t, "indecl"):
      pagemsg("Skipping indeclinable noun: %s" % render_headword())
      continue
    # Already new-style if none of the old positional/named params are present.
    new_style_headword_template = (
      not getparam(t, "head2") and not getparam(t, "2") and not getparam(t, "3")
      and not getparam(t, "4") and not getparam(t, "decl"))
    if new_style_headword_template:
      pagemsg("Skipping new-style template: %s" % render_headword())
      continue
    lemma = blib.fetch_param_chain(t, ["1", "head", "head1"], "head") or [pagetitle]
    genitive = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
    noun_gender = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
    noun_decl = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")
    if " " in lemma[0]:
      # Multiword lemmas need per-word declension specs; not handled here.
      pagemsg("WARNING: Space in lemma %s, skipping: %s" % (lemma[0], render_headword()))
      continue
    if len(lemma) > 1:
      pagemsg("WARNING: Multiple lemmas %s, skipping: %s" % (",".join(lemma), render_headword()))
      continue
    lemma = lemma[0]
    # Old spelled-out declension name -> new numeric/irreg code.
    noun_decl_to_decl_type = {
      "first": "1",
      "second": "2",
      "third": "3",
      "fourth": "4",
      "fifth": "5",
      "irregular": "irreg",
    }
    if len(noun_decl) == 0:
      pagemsg("WARNING: No declension, skipping: %s" % render_headword())
      continue
    if len(noun_decl) > 1:
      pagemsg("WARNING: Multiple decls %s, skipping: %s" % (",".join(noun_decl), render_headword()))
      continue
    noun_decl = noun_decl[0]
    if noun_decl not in noun_decl_to_decl_type:
      pagemsg("WARNING: Unrecognized declension %s, skipping: %s" % (noun_decl, render_headword()))
      continue
    decl_type = noun_decl_to_decl_type[noun_decl]
    if decl_type in ["1", "2", "4", "5"]:
      # Regular declensions: stem inferable from the lemma alone.
      param1 = "%s<%s>" % (lemma, decl_type)
    elif decl_type == "3":
      # Third declension needs the genitive to determine the stem and
      # (for plural-only nouns) the subtype.
      if len(genitive) == 0:
        pagemsg(
          "WARNING: No genitives with decl 3 lemma %s, skipping: %s" % (lemma, render_headword()))
        continue
      elif len(genitive) > 1:
        pagemsg(
          "WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s" % (
            ",".join(genitive), lemma, render_headword()))
        continue
      else:
        gen1 = genitive[0]
        if gen1.endswith("is"):
          stem = gen1[:-2]
          if lalib.infer_3rd_decl_stem(lemma) == stem:
            # Stem is the default inference; no need to spell it out.
            param1 = "%s<3>" % lemma
          else:
            param1 = "%s/%s<3>" % (lemma, stem)
        elif gen1.endswith("ium"):
          # Genitive plural in -ium: plural-only noun (i-stem for -ēs).
          if lemma.endswith("ia"):
            param1 = "%s<3.pl>" % lemma
          elif lemma.endswith(u"ēs"):
            param1 = "%s<3.I.pl>" % lemma
          else:
            pagemsg(
              "WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s" % (
                lemma, render_headword()))
            continue
        elif gen1.endswith("um"):
          # Genitive plural in -um: plural-only noun.
          if lemma.endswith("a") or lemma.endswith(u"ēs"):
            param1 = "%s<3.pl>" % lemma
          else:
            pagemsg(
              "WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s" % (
                lemma, render_headword()))
            continue
        else:
          pagemsg(
            "WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s" % (
              gen1, lemma, render_headword()))
          continue
    elif decl_type == "irreg":
      pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" % render_headword())
      continue
    else:
      pagemsg(
        "WARNING: Something wrong, unrecognized decl_type %s, skipping: %s" % (
          decl_type, render_headword()))
      continue
    # Sanity-check the new spec by generating the declension and comparing
    # its genitive/gender with the old headword's values.
    la_ndecl = "{{la-ndecl|%s}}" % param1
    noun_props = convert_la_headword_noun.new_generate_noun_forms(
      la_ndecl, errandpagemsg, expand_text, include_props=True)
    if noun_props is None:
      continue
    decl_gender = noun_props.get("g", None)
    if not convert_la_headword_noun.compare_headword_decl_forms(
        "genitive", genitive, ["gen_sg", "gen_pl"], noun_props,
        render_headword(), pagemsg, adjust_for_missing_gen_forms=True,
        adjust_for_e_ae_gen=True, remove_headword_links=True):
      continue
    if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
      # Declension already produces the right gender; omit g=.
      need_explicit_gender = False
    else:
      need_explicit_gender = True
      if len(noun_gender) > 1:
        pagemsg(
          "WARNING: Saw multiple headword genders %s, please verify: %s" % (
            ",".join(noun_gender), render_headword()))
      elif (noun_gender and noun_gender[0].startswith("n") != (decl_gender == "n")):
        # Neuter mismatch affects forms, not just labeling; don't convert.
        pagemsg(
          "WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" % (
            noun_gender[0], decl_gender, render_headword()))
        continue
    # Fetch remaining params from headword template
    # NOTE(review): local `pname` shadows the module-level pname() helper;
    # harmless here since the helper isn't called below.
    headword_params = []
    for param in t.params:
      pname = unicode(param.name)
      if pname.strip() in ["1", "2", "3", "4"] or re.search(
          "^(head|gen|g|decl)[0-9]*$", pname.strip()):
        continue
      headword_params.append((pname, param.value, param.showkey))
    # Erase all params
    del t.params[:]
    # Add param1
    t.add("1", param1)
    # Add explicit gender if needed
    if need_explicit_gender:
      explicit_genders = []
      for ng in noun_gender:
        ng = ng[0]  # strip any combining tags, keep bare m/f/n
        if ng not in explicit_genders:
          explicit_genders.append(ng)
      blib.set_param_chain(t, explicit_genders, "g", "g")
    # Copy remaining params from headword template
    for name, value, showkey in headword_params:
      t.add(name, value, showkey=showkey, preserve_spacing=False)
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")

  return unicode(parsed), notes
def process_page(index, page, save, verbose, fixdirecs):
  """Find unpaired transitive imperfective Russian verbs that lack a past
  passive participle (PPP) and, when a directive for the page is present in
  FIXDIRECS, fix up the {{ru-conj}}/{{ru-conj-old}} template accordingly.

  INDEX and PAGE are the page index and pywikibot page object; SAVE and
  VERBOSE are command-line flags; FIXDIRECS maps page titles to one of
  "fixed", "paired", "intrans", "+p" or "|ppp=-", saying how (or whether)
  to fix the verb. Saves the page in place; returns nothing."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  # True when the most recently seen {{ru-verb}} headword has a perfective
  # or imperfective counterpart sharing its first two letters (i.e. the
  # verb is "paired"); paired verbs are not flagged below.
  saw_paired_verb = False
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-verb":
      # Reset per headword; a later conjugation template belongs to the
      # most recent headword.
      saw_paired_verb = False
      if getparam(t, "2") in ["impf", "both"]:
        verb = getparam(t, "1")
        pfs = blib.fetch_param_chain(t, "pf", "pf")
        impfs = blib.fetch_param_chain(t, "impf", "impf")
        for otheraspect in pfs + impfs:
          if verb[0:2] == otheraspect[0:2]:
            saw_paired_verb = True
    if (unicode(t.name) in ["ru-conj", "ru-conj-old"] and
        getparam(t, "1") == "impf" and not saw_paired_verb):
      if getparam(t, "ppp") or getparam(t, "past_pasv_part"):
        # PPP already explicitly specified; nothing to do.
        pass
      elif [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        pass
      # Skip if the conjugation spec already requests a PPP (+p) or uses
      # variant (7) or (8), possibly bracketed.
      # BUG FIX: original line had an extra closing parenthesis after
      # getparam(t, "2"), a syntax error.
      elif re.search(r"\+p|\[?\([78]\)\]?", getparam(t, "2")):
        pass
      else:
        pagemsg("Apparent unpaired transitive imperfective without PPP")
        if pagetitle in fixdirecs:
          direc = fixdirecs[pagetitle]
          assert direc in ["fixed", "paired", "intrans", "+p", "|ppp=-"]
          origt = unicode(t)
          if direc == "+p":
            # Request auto-generation of the PPP in the conjugation spec.
            t.add("2", getparam(t, "2") + "+p")
            notes.append("add missing past passive participle to transitive unpaired imperfective verb")
            pagemsg("Add missing PPP, replace %s with %s" % (origt, unicode(t)))
          elif direc == "|ppp=-":
            # Explicitly record that this verb has no PPP.
            t.add("ppp", "-")
            notes.append("note transitive unpaired imperfective verb as lacking past passive participle")
            pagemsg("Note no PPP, replace %s with %s" % (origt, unicode(t)))
          elif direc == "paired":
            pagemsg("Verb actually is paired")
          elif direc == "fixed":
            pagemsg("WARNING: Unfixed verb marked as fixed")
          elif direc == "intrans":
            pagemsg("WARNING: Transitive verb marked as intrans")

  new_text = unicode(parsed)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed):
  """Convert old-style Latin noun headword templates ({{la-noun}},
  {{la-proper noun}}, {{la-location}}, {{head|la|noun}} and
  {{head|la|proper noun}}) to the new style, in which the headword
  template carries the same declension spec as the accompanying
  {{la-ndecl}} template.

  PAGE is the pywikibot page, INDEX the page index; PARSED is unused here
  because each level-3 subsection is re-parsed individually. Returns
  (new_page_text, notes) on success, or (None, None) if no Latin section
  could be located."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text
  notes = []

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval

  # Split the Latin section on level-3 headers; even indices >= 2 are
  # subsection bodies, odd indices are the === headers themselves.
  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)

  saw_a_template = False
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    # Locate exactly one headword template and one {{la-ndecl}} per
    # subsection; bail on duplicates.
    la_noun_template = None
    la_ndecl_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-ndecl":
        if la_ndecl_template:
          pagemsg("WARNING: Saw multiple noun declension templates in subsection, %s and %s, skipping" % (
            unicode(la_ndecl_template), unicode(t)))
          must_continue = True
          break
        la_ndecl_template = t
        saw_a_template = True
      if tn in ["la-noun", "la-proper noun", "la-location"] or (
          tn == "head" and getparam(t, "1") == "la" and
          getparam(t, "2") in ["noun", "proper noun"]):
        if la_noun_template:
          pagemsg("WARNING: Saw multiple noun headword templates in subsection, %s and %s, skipping" % (
            unicode(la_noun_template), unicode(t)))
          must_continue = True
          break
        la_noun_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_noun_template and not la_ndecl_template:
      continue

    # A new-style {{la-noun}}/{{la-proper noun}} has no genitive (2=),
    # gender (3=), decl (4=/decl=) or head2= params.
    new_style_headword_template = (
      la_noun_template and
      tname(la_noun_template) in ["la-noun", "la-proper noun"] and
      not getparam(la_noun_template, "head2") and
      not getparam(la_noun_template, "2") and
      not getparam(la_noun_template, "3") and
      not getparam(la_noun_template, "4") and
      not getparam(la_noun_template, "decl")
    )

    if la_noun_template and not la_ndecl_template:
      if (tname(la_noun_template) in ["la-noun", "la-proper noun"] and
          getparam(la_noun_template, "indecl")):
        # Indeclinable nouns legitimately lack {{la-ndecl}}; convert the
        # gender in 3= to g= and drop the old genitive param 2=.
        if new_style_headword_template:
          pagemsg("Found new-style indeclinable noun headword template, skipping: %s" % unicode(la_noun_template))
          continue
        if (getparam(la_noun_template, "head2") or
            getparam(la_noun_template, "decl") or
            getparam(la_noun_template, "2") and
            getparam(la_noun_template, "2") != getparam(la_noun_template, "1") or
            not getparam(la_noun_template, "3")):
          pagemsg("WARNING: Found old-style indeclinable noun headword template and don't know how to convert: %s" % unicode(la_noun_template))
          continue
        gender = getparam(la_noun_template, "3")
        orig_la_noun_template = unicode(la_noun_template)
        # Use only the base gender letter (e.g. "f" from "f-p").
        la_noun_template.add("g", gender[0], before="3")
        rmparam(la_noun_template, "3")
        rmparam(la_noun_template, "2")
        pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
        notes.append("convert indeclinable {{la-noun}}/{{la-proper noun}} template to new style")
        subsections[k] = unicode(parsed)
        continue
      else:
        pagemsg("WARNING: Saw noun headword template but no declension template: %s" % unicode(la_noun_template))
        continue
    if la_ndecl_template and not la_noun_template:
      pagemsg("WARNING: Saw noun declension template but no headword template: %s" % unicode(la_ndecl_template))
      continue
    orig_la_noun_template = unicode(la_noun_template)
    if new_style_headword_template:
      pagemsg("Found new-style noun headword template, skipping: %s" % orig_la_noun_template)
      continue

    def render_headword_and_decl():
      # Render both templates for warnings, in <from>...<to>...<end>
      # format so the templates can be located and replaced later.
      return "headword template <from> %s <to> %s <end>, declension template <from> %s <to> %s <end>" % (
        orig_la_noun_template, orig_la_noun_template,
        unicode(la_ndecl_template), unicode(la_ndecl_template))

    # Determine the explicit headword lemma(s); fall back to page title.
    if tname(la_noun_template) == "head":
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["head", "head1"], "head")
    elif tname(la_noun_template) == "la-location":
      explicit_head_param_head = [getparam(la_noun_template, "1")]
    else:
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["1", "head", "head1"], "head")
    # Placed after the if/elif/else so that `lemma` is bound on the
    # {{la-location}} path too (it is used in that branch below).
    lemma = explicit_head_param_head or [pagetitle]

    if "[[" in lemma[0]:
      # Headword contains wikilinks; try to carry them over into the
      # declension spec so the links aren't lost on conversion.
      if len(lemma) > 1:
        pagemsg("WARNING: Multiple lemmas %s and lemmas with links in them, can't handle, skipping: %s" % (
          ",".join(lemma), render_headword_and_decl()))
        continue
      ndecl_lemma = getparam(la_ndecl_template, "1")
      if "[[" not in ndecl_lemma:
        must_continue = False
        for m in re.finditer(r"(\[\[.*?\]\])", lemma[0]):
          link = m.group(1)
          plainlink = blib.remove_links(link)
          if plainlink not in ndecl_lemma:
            pagemsg("WARNING: Can't interpolate link %s into declension template, skipping: %s" % (
              link, render_headword_and_decl()))
            must_continue = True
            break
          ndecl_lemma = ndecl_lemma.replace(plainlink, link, 1)
        if must_continue:
          continue
        # Work on a parsed copy so the on-page {{la-ndecl}} is untouched.
        new_ndecl_template = blib.parse_text(unicode(la_ndecl_template)).filter_templates()[0]
        new_ndecl_template.add("1", ndecl_lemma)
        pagemsg("Adding links to decl template %s to produce %s" % (
          unicode(la_ndecl_template), unicode(new_ndecl_template)))
        la_ndecl_template = new_ndecl_template

    # Expand the declension template to get all generated forms/props.
    noun_props = new_generate_noun_forms(unicode(la_ndecl_template),
      errandpagemsg, expand_text, include_props=True)
    if noun_props is None:
      continue
    decl_gender = noun_props.get("g", None)

    # Fetch headword gender(s); the parameter differs per template type.
    if tname(la_noun_template) == "head":
      noun_gender = blib.fetch_param_chain(la_noun_template, ["g", "g1"], "g")
      if not noun_gender and not decl_gender:
        pagemsg("WARNING: No gender in {{head|la|...}} and no declension gender, can't proceed, skipping: %s" % render_headword_and_decl())
        continue
    elif tname(la_noun_template) == "la-location":
      noun_gender = [getparam(la_noun_template, "4")]
    else:
      noun_gender = blib.fetch_param_chain(la_noun_template, ["3", "g", "g1"], "g")
      if not noun_gender:
        pagemsg("WARNING: No gender in old-style headword, skipping: %s" % render_headword_and_decl())
        continue

    def do_compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        adjust_for_missing_gen_forms=False, remove_headword_links=False):
      # Check that the headword's forms agree with the declension's forms.
      return compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        noun_props, render_headword_and_decl(), pagemsg,
        adjust_for_missing_gen_forms=adjust_for_missing_gen_forms,
        remove_headword_links=remove_headword_links)

    def check_headword_vs_decl_decls(regularized_noun_decl):
      # Verify the headword's declension class(es) equal or are a subset
      # of the classes in the {{la-ndecl}} spec. Returns truthy when the
      # subsection must be skipped.
      must_continue = False
      decl_lemma = getparam(la_ndecl_template, "1")
      if "((" in decl_lemma:
        pagemsg("WARNING: (( in decl_lemma, can't handle, skipping: %s" % render_headword_and_decl())
        must_continue = True
        # BUG FIX: was a bare `return` (returned None, which is falsy),
        # so the caller never actually skipped despite the warning.
        return must_continue
      segments = re.split(r"([^<> -]+<[^<>]*>)", decl_lemma)
      decl_decls = []
      # Odd-indexed segments are the captured STEM<DECL.SUBTYPES> specs.
      for i in xrange(1, len(segments) - 1, 2):
        m = re.search("^([^<> -]+)<([^<>]*)>$", segments[i])
        stem_spec, decl_and_subtype_spec = m.groups()
        decl_and_subtypes = decl_and_subtype_spec.split(".")
        decl_decl = decl_and_subtypes[0]
        decl_decls.append(decl_decl)
      if set(regularized_noun_decl) != set(decl_decls):
        if set(regularized_noun_decl) <= set(decl_decls):
          pagemsg("headword decl %s subset of declension decl %s, allowing: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls), render_headword_and_decl()))
        else:
          pagemsg("WARNING: headword decl %s not same as or subset of declension decl %s, skipping: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls), render_headword_and_decl()))
          must_continue = True
      return must_continue

    def check_headword_vs_decl_gender():
      # Compare headword gender(s) against the declension gender.
      # Returns (need_explicit_gender, must_continue).
      must_continue = False
      if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
        need_explicit_gender = False
      else:
        need_explicit_gender = True
        if len(noun_gender) > 1:
          pagemsg("WARNING: Saw multiple headword genders %s, please verify: %s" % (
            ",".join(noun_gender), render_headword_and_decl()))
        elif (noun_gender and
            noun_gender[0].startswith("n") != (decl_gender == "n")):
          # Neuter/non-neuter mismatch can't be papered over with g=.
          pagemsg("WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" % (
            noun_gender[0], decl_gender, render_headword_and_decl()))
          must_continue = True
      return need_explicit_gender, must_continue

    def erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender):
      # Replace all headword params with the decl template's params,
      # plus explicit g=/g2=... if the gender can't be inferred.
      # Erase all params
      del la_noun_template.params[:]
      # Copy params from decl template
      for param in la_ndecl_template.params:
        pname = unicode(param.name)
        la_noun_template.add(pname, param.value, showkey=param.showkey,
          preserve_spacing=False)
      # Add explicit gender if needed
      if need_explicit_gender:
        explicit_genders = []
        for ng in noun_gender:
          # Use only the base gender letter (e.g. "m" from "m-p").
          ng = ng[0]
          if ng not in explicit_genders:
            explicit_genders.append(ng)
        blib.set_param_chain(la_noun_template, explicit_genders, "g", "g")

    if tname(la_noun_template) == "head":
      if explicit_head_param_head and not do_compare_headword_decl_forms(
          "lemma", explicit_head_param_head, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Check for extraneous {{head|la|...}} parameters
      must_continue = False
      is_proper_noun = getparam(la_ndecl_template, "2") == "proper noun"
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2"] or re.search("^(head|g)[0-9]*$", pname.strip()):
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{head}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      # Copy params from decl template
      blib.set_template_name(la_noun_template,
        "la-proper noun" if is_proper_noun else "la-noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{head|la|...}} to new-style {{la-noun}}/{{la-proper noun}} template")
    elif tname(la_noun_template) == "la-location":
      noun_decl = [getparam(la_noun_template, "6")]
      if not noun_decl:
        pagemsg("WARNING: No noun decl in {{la-location}}, skipping: %s" % render_headword_and_decl())
        continue
      genitive = [getparam(la_noun_template, "2")]
      if not do_compare_headword_decl_forms("lemma", lemma,
          ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive,
          ["gen_sg", "gen_pl"], adjust_for_missing_gen_forms=True,
          remove_headword_links=True):
        continue
      # Map headword decl names to canonical decl types.
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Check for extraneous {{la-location}} parameters
      must_continue = False
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4", "5", "6"]:
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{la-location}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      blib.set_template_name(la_noun_template, "la-proper noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-location}} to new-style {{la-proper noun}} template")
    else:
      # old-style {{la-noun}} or {{la-proper noun}}
      noun_decl = blib.fetch_param_chain(la_noun_template, ["4", "decl", "decl1"], "decl")
      if not noun_decl:
        pagemsg("WARNING: No noun decl in old-style headword, skipping: %s" % render_headword_and_decl())
        continue
      genitive = blib.fetch_param_chain(la_noun_template, ["2", "gen", "gen1"], "gen")
      if not do_compare_headword_decl_forms("lemma", lemma,
          ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive,
          ["gen_sg", "gen_pl"], adjust_for_missing_gen_forms=True,
          remove_headword_links=True):
        continue
      # Map headword decl names to canonical decl types.
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Fetch remaining params from headword template
      headword_params = []
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4"] or re.search("^(head|gen|g|decl)[0-9]*$", pname.strip()):
          continue
        headword_params.append((pname, param.value, param.showkey))
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      # Copy remaining params from headword template
      for name, value, showkey in headword_params:
        la_noun_template.add(name, value, showkey=showkey, preserve_spacing=False)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")
    subsections[k] = unicode(parsed)

  if not saw_a_template:
    pagemsg("WARNING: Saw no noun headword or declension templates")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page_section(index, page, section, verbose):
  """Convert a Russian {{ru-noun}}/{{ru-proper noun}} headword in SECTION
  to {{ru-noun+}}/{{ru-proper noun+}}, transferring manual transliteration
  into the {{ru-noun-table}} declension, fixing a=bi animacy specs, and
  cleaning up n= handling for proper nouns.

  Returns None to signal "skip this section", otherwise a tuple
  (new_section_text, ru_noun_changed, ru_proper_noun_changed,
  bian_replaced, frobbed_manual_translit)."""
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None
  parsed = blib.parse_text(section)
  noun_table_templates = []
  noun_old_templates = []
  # Pages with {{ru-decl-noun-see}} defer declension elsewhere; skip.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None
  # Collect declension templates; we can only handle exactly one of each.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
        ", ".join(unicode(x) for x in noun_old_templates))
    # Nothing to convert; return unchanged with zero counters.
    return unicode(parsed), 0, 0, 0, []
  # Already-converted sections need no work.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []
  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # Will hold [headword_tr] if manual translit was moved into the decl.
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" % headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
    for i in xrange(2, 10):
      if getparam(headword_template, "tr%s" % i):
        pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
          i, unicode(headword_template)))
        return None
    # NOTE(review): `decl_template` below is the leftover loop variable
    # from the loop above (i.e. the LAST declension template) — verify
    # this is intended when both new and old decl templates are present.
    # If param 1 is a stress marker, the lemma is in param 2.
    if runoun.arg1_is_stress(getparam(decl_template, "1")):
      lemma_arg = "2"
    else:
      lemma_arg = "1"
    lemmaval = getparam(decl_template, lemma_arg)
    if not lemmaval:
      lemmaval = subpagetitle
    if "//" in lemmaval:
      # Decl already carries LEMMA//TRANSLIT; it must match the headword.
      m = re.search("^(.*?)//(.*)$", lemmaval)
      if m.group(2) != headword_tr:
        pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
          lemmaval, headword_tr))
        return None
      else:
        pagemsg("Already found manual translit in decl template %s" % lemmaval)
    else:
      # Append the headword translit to the decl lemma as LEMMA//TRANSLIT.
      lemmaval += "//" + headword_tr
      orig_decl_template = unicode(decl_template)
      decl_template.add(lemma_arg, lemmaval)
      pagemsg("Replacing decl %s with %s" % (orig_decl_template, unicode(decl_template)))
    frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0
  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      # First positions of inanimate / animate among headword genders.
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template, unicode(decl_template)))

  # Expand {{ru-generate-noun-args}} (same args as the decl template) to
  # get the generated forms/properties for validation.
  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
    unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)
  # Validate headword forms against generated forms; returns the genders
  # to use in the new-style headword, or None on mismatch.
  genders = runoun.check_old_noun_headword_forms(headword_template, args,
    subpagetitle, pagemsg)
  if genders == None:
    return None
  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))
  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
    new_params, genders, pagemsg)
  if params_to_preserve == None:
    return None
  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Re-expand with ndef=sg and any explicit n=s... stripped to see
      # what the number defaults to without the override.
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
        generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" % ndef_args["n"])
  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  # Finally rename the headword template to the new-style name.
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1
  pagemsg("Replacing headword %s with %s" % (orig_headword_template,
    unicode(headword_template)))
  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit