def process_page(index, page, save, verbose):
    """Drop gender params from Russian adjective-form headword templates.

    Within the ``==Russian==`` section of ``page``, every ``{{head}}``
    template whose first param is ``ru`` and second param is
    ``adjective form`` has its ``g``, ``g2``, ``g3`` and ``g4`` params
    removed. Pages with a colon in the title or with more than one Russian
    section are skipped with a warning.

    index   -- number used only to prefix log messages
    page    -- page object; ``title()``, ``text`` and ``save()`` are used
    save    -- when true the page is saved, otherwise the edit summary that
               would be used is only logged
    verbose -- when true the full old and new page text are logged
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []
    seen_russian = False

    # Splitting on a capturing group keeps the headers: odd indices hold the
    # "==Language==" lines, the following even index holds that section's body.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for body_idx in xrange(2, len(sections), 2):
        if sections[body_idx - 1] != "==Russian==\n":
            continue
        if seen_russian:
            pagemsg("WARNING: Found multiple Russian sections, skipping page")
            return
        seen_russian = True

        # Remove gender from adjective forms
        parsed = blib.parse_text(sections[body_idx])
        for tmpl in parsed.filter_templates():
            is_adj_form_head = (
                unicode(tmpl.name) == "head"
                and getparam(tmpl, "1") == "ru"
                and getparam(tmpl, "2") == "adjective form"
            )
            if not is_adj_form_head:
                continue
            before = unicode(tmpl)
            for gender_param in ("g", "g2", "g3", "g4"):
                rmparam(tmpl, gender_param)
            after = unicode(tmpl)
            if before != after:
                pagemsg("Replaced %s with %s" % (before, after))
                notes.append("remove gender from adjective forms")
        sections[body_idx] = unicode(parsed)

    new_text = "".join(sections)
    if new_text == text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change must have been recorded as a note above.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = new_text
    page.save(comment=comment)
def process_page(index, page, save, verbose, romaji_to_keep):
    """Clean up the params of Japanese headword templates on a page.

    For every ``ja-noun``, ``ja-adj``, ``ja-verb`` and ``ja-pos`` template:

    * an old script code in param 1 is removed and the remaining numbered
      params are shifted down by one;
    * ``hira=`` and ``kata=`` are turned into numbered params, with all
      numbered params placed before the named ones and blank numbered
      params dropped;
    * ``rom=`` is removed unless the page title is in ``romaji_to_keep``;
    * ``hidx=`` is removed.

    index          -- number used only to prefix log messages
    page           -- page object; ``title()``, ``text`` and ``save()`` are used
    save           -- when true the page is saved, otherwise the would-be
                      edit summary is only logged
    verbose        -- when true the full old and new page text are logged
    romaji_to_keep -- container of page titles whose ``rom=`` is preserved
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def is_numbered(name):
        return re.search(r"^[0-9]+$", name)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for tmpl in parsed.filter_templates():
        tname = unicode(tmpl.name)
        if tname not in ("ja-noun", "ja-adj", "ja-verb", "ja-pos"):
            continue
        before = unicode(tmpl)

        # Remove old script code
        script_code = getparam(tmpl, "1")
        if script_code in ("r", "h", "ka", "k", "s", "ky", "kk"):
            pagemsg("Removing 1=%s: %s" % (script_code, unicode(tmpl)))
            notes.append("remove 1=%s from %s" % (script_code, tname))
            rmparam(tmpl, "1")
            # Shift the remaining numbered params down to fill the gap.
            for param in tmpl.params:
                key = unicode(param.name)
                if is_numbered(key):
                    param.name = str(int(key) - 1)
                    param.showkey = False

        # Convert hira= and/or kata= to numbered params. The template is
        # rebuilt from scratch so that the numbered params always come
        # before the non-numbered ones.
        if tmpl.has("hira") or tmpl.has("kata"):
            positional = []
            named = []
            for param in tmpl.params:
                key = unicode(param.name)
                if is_numbered(key):
                    value = unicode(param.value)
                    if value:
                        positional.append(value)
                elif key not in ("hira", "kata"):
                    named.append((key, param.value))

            kana_steps = (
                ("hira", "Moving hira=%s to %s=: %s", "move hira= to %s= in %s"),
                ("kata", "Moving kata=%s to %s=: %s", "move kata= to %s= in %s"),
            )
            for kana_key, log_fmt, note_fmt in kana_steps:
                kana = getparam(tmpl, kana_key)
                if kana:
                    positional.append(kana)
                    pagemsg(log_fmt % (kana, len(positional), unicode(tmpl)))
                    notes.append(note_fmt % (len(positional), tname))

            del tmpl.params[:]
            # Put back numbered params, then non-numbered params.
            for position, value in enumerate(positional, 1):
                tmpl.add(str(position), value)
            for key, value in named:
                tmpl.add(key, value)

        # Remove rom= if not in list of pages to keep rom=
        if tmpl.has("rom"):
            if pagetitle in romaji_to_keep:
                pagemsg("Keeping rom=%s because in romaji_to_keep: %s" % (
                    getparam(tmpl, "rom"), unicode(tmpl)))
            else:
                pagemsg("Removing rom=%s: %s" % (getparam(tmpl, "rom"), unicode(tmpl)))
                rmparam(tmpl, "rom")
                notes.append("remove rom= from %s" % tname)

        # Remove hidx=
        if tmpl.has("hidx"):
            pagemsg("Removing hidx=%s: %s" % (getparam(tmpl, "hidx"), unicode(tmpl)))
            rmparam(tmpl, "hidx")
            notes.append("remove hidx= from %s" % tname)

        after = unicode(tmpl)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change is expected to have produced at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = new_text
    page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Canonicalize Russian ``{{inflection of}}`` templates for nominals.

    Three passes are run over the ``==Russian==`` section, each re-parsing
    the text produced by the previous one:

    1. rebuild each template as lang=, 1=, tr= (if any), 2=, then the
       non-blank form codes renumbered from 3, then the remaining named
       params (``nocat=`` is dropped);
    2. abbreviate long form codes ('nominative' -> 'nom', etc.);
    3. reorder a two-code ``number|case`` pair to ``case|number``.

    index   -- number used only to prefix log messages
    page    -- page object; ``title()``, ``text`` and ``save()`` are used
    save    -- when true the page is saved, otherwise the would-be edit
               summary is only logged
    verbose -- when true the full old and new page text are logged
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def ru_inflection_templates(parsed_section):
        # Lazily yield the Russian 'inflection of' templates of a section.
        for candidate in parsed_section.filter_templates():
            if (unicode(candidate.name) == "inflection of"
                    and getparam(candidate, "lang") == "ru"):
                yield candidate

    form_code_abbrevs = (
        ("nominative", "nom"),
        ("accusative", "acc"),
        ("genitive", "gen"),
        ("dative", "dat"),
        ("instrumental", "ins"),
        ("prep", "pre"),
        ("prepositional", "pre"),
        ("vocative", "voc"),
        ("locative", "loc"),
        ("partitive", "par"),
        ("singular", "s"),
        ("(singular)", "s"),
        ("plural", "p"),
        ("(plural)", "p"),
        ("inanimate", "in"),
        ("animate", "an"),
    )
    number_codes = ("s", "p")
    case_codes = ("nom", "gen", "dat", "acc", "ins", "pre", "voc", "loc", "par")

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []
    seen_russian = False

    # Odd indices hold "==Language==" headers, the next even index the body.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for sec in xrange(2, len(sections), 2):
        if sections[sec - 1] != "==Russian==\n":
            continue
        if seen_russian:
            pagemsg("WARNING: Found multiple Russian sections, skipping page")
            return
        seen_russian = True

        # Remove blank form codes and canonicalize position of lang=, tr=
        parsed = blib.parse_text(sections[sec])
        for tmpl in ru_inflection_templates(parsed):
            before = unicode(tmpl)

            # Non-blank form codes from params 3 through 19.
            form_codes = []
            for num in xrange(3, 20):
                code = getparam(tmpl, str(num))
                if code:
                    form_codes.append(code)

            arg1 = getparam(tmpl, "1")
            arg2 = getparam(tmpl, "2")
            translit = getparam(tmpl, "tr")
            nocat = getparam(tmpl, "nocat")

            # Named params other than the ones handled specially.
            other_named = []
            for param in tmpl.params:
                key = unicode(param.name)
                if re.search(r"^[0-9]+$", key):
                    continue
                if key in ("lang", "nocat", "tr"):
                    continue
                other_named.append((key, param.value))

            # Erase everything and rebuild in canonical order: lang, 1, tr,
            # 2, renumbered form codes, then the other named params.
            del tmpl.params[:]
            tmpl.add("lang", "ru")
            tmpl.add("1", arg1)
            if translit:
                tmpl.add("tr", translit)
            tmpl.add("2", arg2)
            for offset, code in enumerate(form_codes):
                tmpl.add(str(offset + 3), code)
            for key, value in other_named:
                tmpl.add(key, value)

            after = unicode(tmpl)
            if before != after:
                pagemsg("Replaced %s with %s" % (before, after))
                notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
                if nocat:
                    notes.append("removed nocat=")
        sections[sec] = unicode(parsed)

        # Convert 'prep' to 'pre', etc.
        parsed = blib.parse_text(sections[sec])
        for tmpl in ru_inflection_templates(parsed):
            for long_code, short_code in form_code_abbrevs:
                before = unicode(tmpl)
                for num in xrange(3, 20):
                    slot = str(num)
                    if getparam(tmpl, slot) == long_code:
                        tmpl.add(slot, short_code)
                after = unicode(tmpl)
                if before != after:
                    pagemsg("Replaced %s with %s" % (before, after))
                    notes.append("converted '%s' form code to '%s'" % (long_code, short_code))
        sections[sec] = unicode(parsed)

        # Rearrange order of s|gen, p|nom etc. to gen|s, nom|p etc.
        parsed = blib.parse_text(sections[sec])
        for tmpl in ru_inflection_templates(parsed):
            is_number_case_pair = (
                getparam(tmpl, "3") in number_codes
                and getparam(tmpl, "4") in case_codes
                and not getparam(tmpl, "5")
            )
            if not is_number_case_pair:
                continue
            before = unicode(tmpl)
            number = getparam(tmpl, "3")
            case = getparam(tmpl, "4")
            tmpl.add("3", case)
            tmpl.add("4", number)
            after = unicode(tmpl)
            if before != after:
                pagemsg("Replaced %s with %s" % (before, after))
                notes.append("converted '%s|%s' to '%s|%s'" % (number, case, case, number))
        sections[sec] = unicode(parsed)

    new_text = "".join(sections)
    if new_text == text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change is expected to have produced at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
        pagemsg("Would save with comment = %s" % comment)
        return
    pagemsg("Saving with comment = %s" % comment)
    page.text = new_text
    page.save(comment=comment)
def process_page(index, page, fix, save, verbose):
    """Check (and optionally fix) the aspect of Russian verb conjugations.

    Walks the templates of ``page`` in order. Each ``{{ru-verb}}`` headword
    sets the current aspect(s) from its param 2 ('pf', 'impf', or 'both');
    each following ``{{ru-conj}}`` / ``{{ru-conj-old}}`` has the aspect
    prefix of its param 1 compared against them, and mismatches are logged
    as warnings.

    When ``fix`` is true and the page has exactly one headword with exactly
    one aspect, the aspect prefix of every conjugation template is
    overwritten with the headword's aspect.

    index   -- number used only to prefix log messages
    page    -- page object; ``title()``, ``text`` and ``save()`` are used
    fix     -- when true, rewrite conjugation aspects as described above
    save    -- when true the page is saved, otherwise the would-be edit
               summary is only logged
    verbose -- when true the full old and new page text are logged
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []
    parsed = blib.parse(page)
    conj_templates = ["ru-conj", "ru-conj-old"]

    headword_aspects = set()
    # Tracked separately from headword_aspects: a headword whose aspect is
    # '?' or invalid leaves the aspect set empty, but it is still a headword
    # and a later one must count as "multiple". Otherwise the fixer would
    # apply the later headword's aspect to the earlier verb's conjugation.
    found_headword = False
    found_multiple_headwords = False
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname == "ru-verb":
            if found_headword:
                found_multiple_headwords = True
            found_headword = True
            headword_aspects = set()
            aspect = getparam(t, "2")
            if aspect in ["pf", "impf"]:
                headword_aspects.add(aspect)
            elif aspect == "both":
                headword_aspects.add("pf")
                headword_aspects.add("impf")
            elif aspect == "?":
                pagemsg("WARNING: Found aspect '?'")
            else:
                pagemsg("WARNING: Found bad aspect value '%s' in ru-verb" % aspect)
        elif tname in conj_templates:
            # Param 1 looks like "pf", "impf-intr", ...; keep only the part
            # before the first hyphen.
            aspect = re.sub("-.*", "", getparam(t, "1"))
            if aspect not in ["pf", "impf"]:
                pagemsg("WARNING: Found bad aspect value '%s' in ru-conj" %
                        getparam(t, "1"))
            elif not headword_aspects:
                pagemsg("WARNING: No ru-verb preceding ru-conj: %s" % unicode(t))
            elif aspect not in headword_aspects:
                # Sorted so that the log output is deterministic.
                pagemsg("WARNING: ru-conj aspect %s not in ru-verb aspect %s" % (
                    aspect, ",".join(sorted(headword_aspects))))

    if fix:
        if found_multiple_headwords:
            pagemsg("WARNING: Multiple ru-verb headwords, not fixing")
        elif not headword_aspects:
            pagemsg("WARNING: No ru-verb headwords, not fixing")
        elif len(headword_aspects) > 1:
            pagemsg("WARNING: Multiple aspects in ru-verb, not fixing")
        else:
            headword_aspect = list(headword_aspects)[0]
            for t in parsed.filter_templates():
                origt = unicode(t)
                tname = unicode(t.name)
                if tname in conj_templates:
                    param1 = getparam(t, "1")
                    # Replace only the aspect prefix, keeping any "-..." tail.
                    param1 = re.sub("^(pf|impf)((-.*)?)$",
                                    r"%s\2" % headword_aspect, param1)
                    t.add("1", param1)
                newt = unicode(t)
                if origt != newt:
                    pagemsg("Replaced %s with %s" % (origt, newt))
                    notes.append("overrode conjugation aspect with %s" %
                                 headword_aspect)

    new_text = unicode(parsed)

    if new_text != text:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (text, new_text))
        # Any textual change is expected to have produced at least one note.
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = new_text
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Convert Russian verb-form definitions to canonical 'inflection of'.

    Operates on the ``==Russian==`` section of ``page`` in a fixed sequence
    of passes; each pass works on the text produced by the previous one:

    1. rename ``{{conjugation of|lang=ru}}`` to ``{{inflection of}}``;
    2. split a definition line using 'present or future' into two lines;
    3. abbreviate long form codes ('indicative' -> 'ind', etc.);
    4. drop blank form codes and rebuild params in a canonical order;
    5. canonicalize imperative, past and present/future form-code sets;
    6. replace raw-text participle, past, imperative and present/future
       definitions with ``{{inflection of}}`` templates.

    Pages with a colon in the title or with more than one Russian section
    are skipped with a warning.

    index   -- number used only to prefix log messages
    page    -- page object; ``title()``, ``text`` and ``save()`` are used
    save    -- when true the page is saved, otherwise the would-be edit
               summary is only logged
    verbose -- when true the full old and new page text are logged
    """
    pagetitle = unicode(page.title())
    # NOTE(review): subpagetitle is not used anywhere in this function.
    subpagetitle = re.sub("^.*:", "", pagetitle)
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []

    foundrussian = False
    # Because the pattern has a capturing group, the headers are kept: odd
    # indices hold "==Language==" lines, the next even index the section body.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for j in xrange(2, len(sections), 2):
        if sections[j-1] == "==Russian==\n":
            if foundrussian:
                pagemsg("WARNING: Found multiple Russian sections, skipping page")
                return
            foundrussian = True

            # Try to canonicalize existing 'conjugation of'
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
                    origt = unicode(t)
                    t.name = "inflection of"
                    newt = unicode(t)
                    if origt != newt:
                        pagemsg("Replaced %s with %s" % (origt, newt))
                        notes.append("converted 'conjugation of' to 'inflection of'")
            sections[j] = unicode(parsed)

            # Try to split 'inflection of' containing 'present or future' into two
            # defns
            newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
                r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
                sections[j], 0, re.M)
            if newsec != sections[j]:
                notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
            sections[j] = newsec

            # Convert 'indc' to 'ind', 'futr' to 'fut', 'perfective' and
            # '(perfective)' to 'pfv', 'imperfective' and '(imperfective)' to 'impfv',
            # 'impr' to 'imp'
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    for frm, to in [
                        ("indc", "ind"), ("indicative", "ind"),
                        ("futr", "fut"), ("future", "fut"),
                        ("impr", "imp"), ("imperative", "imp"),
                        ("perfective", "pfv"), ("(perfective)", "pfv"),
                        ("imperfective", "impfv"), ("(imperfective)", "impfv"),
                        ("singular", "s"), ("(singular)", "s"),
                        ("plural", "p"), ("(plural)", "p"),
                        ("masculine", "m"), ("(masculine)", "m"),
                        ("feminine", "f"), ("(feminine)", "f"),
                        ("neuter", "n"), ("(neuter)", "n"),
                        ("neutral", "n"), ("(neutral)", "n"),
                    ]:
                        # Snapshot per mapping so that each conversion gets
                        # its own log line and note.
                        origt = unicode(t)
                        for i in xrange(3,20):
                            val = getparam(t, str(i))
                            if val == frm:
                                t.add(str(i), to)
                        newt = unicode(t)
                        if origt != newt:
                            pagemsg("Replaced %s with %s" % (origt, newt))
                            notes.append("converted '%s' form code to '%s'" % (frm, to))
            sections[j] = unicode(parsed)

            # Remove blank form codes and canonicalize position of lang=, tr=
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    origt = unicode(t)
                    # Fetch the numbered params starting with 3, skipping blank ones
                    numbered_params = []
                    for i in xrange(3,20):
                        val = getparam(t, str(i))
                        if val:
                            numbered_params.append(val)
                    # Fetch param 1 and param 2, and non-numbered params except lang=
                    # and nocat=.
                    param1 = getparam(t, "1")
                    param2 = getparam(t, "2")
                    tr = getparam(t, "tr")
                    nocat = getparam(t, "nocat")
                    non_numbered_params = []
                    for param in t.params:
                        pname = unicode(param.name)
                        if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
                            non_numbered_params.append((pname, param.value))
                    # Erase all params.
                    del t.params[:]
                    # Put back lang, param 1, param 2, tr, then the replacements for the
                    # higher numbered params, then the non-numbered params.
                    t.add("lang", "ru")
                    t.add("1", param1)
                    t.add("2", param2)
                    if tr:
                        t.add("tr", tr)
                    for i, param in enumerate(numbered_params):
                        t.add(str(i+3), param)
                    for name, value in non_numbered_params:
                        t.add(name, value)
                    newt = unicode(t)
                    if origt != newt:
                        pagemsg("Replaced %s with %s" % (origt, newt))
                        notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
                        if nocat:
                            notes.append("removed nocat=")
            sections[j] = unicode(parsed)

            # Try to canonicalize 'inflection of' involving the imperative,
            # present, future
            parsed = blib.parse_text(sections[j])
            for t in parsed.filter_templates():
                if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
                    # Fetch the numbered params starting with 3
                    numbered_params = []
                    for i in xrange(3,20):
                        val = getparam(t, str(i))
                        if val:
                            numbered_params.append(val)
                    # Blank values were already skipped above, so this loop
                    # never removes anything here.
                    while len(numbered_params) > 0 and not numbered_params[-1]:
                        del numbered_params[-1]
                    # Now canonicalize
                    numparamstr = "/".join(numbered_params)
                    numparamset = set(numbered_params)
                    canon_params = []
                    # Single-pass loop: the body runs once and always ends
                    # with the break below.
                    while True:
                        if numparamset == {'s', 'pfv', 'imp'}:
                            canon_params = ['2', 's', 'pfv', 'imp']
                        elif numparamset == {'s', 'impfv', 'imp'}:
                            canon_params = ['2', 's', 'impfv', 'imp']
                        elif numparamset == {'s', 'imp'}:
                            canon_params = ['2', 's', 'imp']
                        elif numparamset == {'p', 'pfv', 'imp'}:
                            canon_params = ['2', 'p', 'pfv', 'imp']
                        elif numparamset == {'p', 'impfv', 'imp'}:
                            canon_params = ['2', 'p', 'impfv', 'imp']
                        elif numparamset == {'p', 'imp'}:
                            canon_params = ['2', 'p', 'imp']
                        elif numparamset == {'m', 's', 'past'}:
                            canon_params = ['m', 's', 'past', 'ind']
                        elif numparamset == {'f', 's', 'past'}:
                            canon_params = ['f', 's', 'past', 'ind']
                        elif numparamset == {'n', 's', 'past'}:
                            canon_params = ['n', 's', 'past', 'ind']
                        elif numparamset == {'p', 'past'}:
                            canon_params = ['p', 'past', 'ind']
                        else:
                            # Unlike the set comparisons above, this case is
                            # order-sensitive: person/number/tense exactly.
                            m = re.search(r"^([123])/([sp])/(pres|fut)$", numparamstr)
                            if m:
                                canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
                        break
                    if canon_params:
                        origt = unicode(t)
                        # Fetch param 1 and param 2. Erase all numbered params.
                        # Put back param 1 and param 2 (this will put them after lang=ru),
                        # then the replacements for the higher params.
                        param1 = getparam(t, "1")
                        param2 = getparam(t, "2")
                        for i in xrange(19,0,-1):
                            rmparam(t, str(i))
                        t.add("1", param1)
                        t.add("2", param2)
                        for i, param in enumerate(canon_params):
                            t.add(str(i+3), param)
                        newt = unicode(t)
                        if origt != newt:
                            pagemsg("Replaced %s with %s" % (origt, newt))
                            notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
                        else:
                            pagemsg("Apparently already canonicalized: %s" % newt)
            sections[j] = unicode(parsed)

            # Try to add 'inflection of' to raw-specified participial inflection
            def add_participle_inflection_of(m):
                # Regex callback. Groups: 1 = line prefix ("# " or "("),
                # 2 = tense, 3 = voice, 4 = lemma.
                prefix = m.group(1)
                tense = m.group(2).lower()
                if tense == "present":
                    tense = "pres"
                voice = m.group(3).lower()
                if voice == "active":
                    voice = "act"
                elif voice == "passive":
                    voice = "pass"
                elif voice == "adverbial":
                    voice = "adv"
                lemma = m.group(4)
                retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
                return retval
            # Two word orders are handled: "participle VOICE of" and
            # "VOICE participle of". The second substitution runs on the
            # output of the first.
            newsec = re.sub(r"(# |\()'*(present|past) participle (active|passive|adverbial) of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*(present|past) (active|passive|adverbial) participle of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified past inflection
            def add_past_inflection_of(m):
                # Regex callback. Groups: 1 = line prefix, 2 = gender or
                # "plural", 3 = lemma.
                prefix = m.group(1)
                gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
                    "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
                lemma = m.group(3)
                # The 's' code is emitted only for the gendered forms, not
                # for the plural.
                retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender, gender != "p" and "|s" or "")
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender, gender != "p" and "/s" or ""))
                return retval
            newsec = re.sub(r"(# |\()'*(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified imperative inflection
            def add_imper_inflection_of(m):
                # Regex callback. Groups: 1 = line prefix, 2 = number,
                # 3 = lemma. The person code is always emitted as '2'.
                prefix = m.group(1)
                number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
                lemma = m.group(3)
                retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
                return retval
            newsec = re.sub(r"(# |\()'*(singular|plural) imperative (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
                sections[j], 0, re.I)
            newsec = re.sub(r"(# |\()'*imperative (singular|plural) (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
                newsec, 0, re.I)
            sections[j] = newsec

            # Try to add 'inflection of' to raw-specified finite pres/fut inflection
            def add_pres_fut_inflection_of(m):
                # Regex callback. Groups: 1 = line prefix, 2 = ordinal person
                # ("1st"/"2nd"/"3rd"), 3 = number, 4 = tense, 5 = lemma.
                prefix = m.group(1)
                # First character of the ordinal gives the person digit.
                person = m.group(2)[0]
                number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
                tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
                lemma = m.group(5)
                retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
                pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
                notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
                return retval
            newsec = re.sub(r"(# |\()'*(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_pres_fut_inflection_of,
                sections[j], 0, re.I)
            sections[j] = newsec

    new_text = "".join(sections)

    if new_text != text:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (text, new_text))
        # Every pass that changes text is expected to have recorded a note.
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = new_text
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)
def process_line(index, line, add_passive_of, override_etym, save, verbose):
    """Add (or replace) the Etymology section of one Russian lemma page.

    ``line`` has two whitespace-separated fields: the accented term and an
    etymology spec. Underscores in either field stand for spaces. A line
    starting with '#' is ignored; a leading '!' forces ``override_etym``.

    Etymology specs handled here:

    * ``?``                -- rejected as an error
    * ``-``                -- request-for-etymology template
    * ``--``               -- no etymology text
    * ``part/adj/partadj`` followed by f, n or p and a colon -- builds
      inflection text only, no etymology text
    * ``acr:``, ``deverb:``, ``back:``, ``raw:`` -- special formats
    * ``LANG:TERM`` without '+' -- borrowing
    * anything else        -- '+'-separated affix components, optionally
      prefixed by ``LANG:``

    In the last two forms a leading '?' or '<<' selects the "Perhaps" or
    "Ultimately" wording.

    index          -- number used only to prefix log messages
    line           -- the input line described above
    add_passive_of -- when true, also append a 'passive of' definition line
    override_etym  -- when true, replace the body of an existing Etymology
                      section instead of skipping the page
    save           -- when true the page is saved, otherwise the would-be
                      edit summary is only logged
    verbose        -- when true the full old and new page text are logged

    Raises AssertionError (after logging) on malformed input lines.
    """
    def error(text):
        errmsg("ERROR: Processing line: %s" % line)
        errmsg("ERROR: %s" % text)
        # Raised explicitly rather than with a bare assert so that the abort
        # still happens when Python runs with -O.
        raise AssertionError(text)

    # NOTE(review): check_stress is not called anywhere in this function.
    def check_stress(word):
        # Strip a vertical bar and everything after it. The bar must be
        # escaped; unescaped it is regex alternation and nothing is removed.
        word = re.sub(r"\|.*", "", word)
        if word.startswith("-") or word.endswith("-"):
            # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
            return
        if rulib.needs_accents(word, split_dash=True):
            error("Word %s missing an accent" % word)

    # Skip lines consisting entirely of comments
    if line.startswith("#"):
        return
    if line.startswith("!"):
        override_etym = True
        line = line[1:]
    # If the second element (the etymology) begins with raw:, allow spaces in
    # the remainder to be included as part of the second element.
    els = do_split(r"\s+", line, 1)
    if len(els) != 2:
        error("Expected two fields, saw %s" % len(els))
    if not els[1].startswith("raw:"):
        els = do_split(r"\s+", line)
    # Replace _ with space and \u with a literal underscore
    els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
    if len(els) != 2:
        error("Expected two fields, saw %s" % len(els))
    accented_term = els[0]
    term = rulib.remove_accents(accented_term)
    etym = els[1]

    pagetitle = term

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    # Handle etymology
    # NOTE(review): adjformtext is computed below but never used afterwards.
    adjformtext = ""
    if etym == "?":
        error("Etymology consists of bare question mark")
    elif etym == "-":
        # etymbody is set here too so that override mode can replace an
        # existing section body with the request template.
        etymbody = "{{rfe|lang=ru}}\n\n"
        etymtext = "===Etymology===\n" + etymbody
    elif etym == "--":
        etymtext = ""
    elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
        m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
        forms = {"f": ["nom|f|s"], "n": ["nom|n|s", "acc|n|s"],
                 "p": ["nom|p", "in|acc|p"]}
        infleclines = ["# {{inflection of|lang=ru|%s||%s}}" % (m.group(3), form)
                       for form in forms[m.group(2)]]
        # NOTE(review): headterm and trtext are not defined in this function;
        # unless they are module-level names this branch raises NameError.
        # Confirm where they are meant to come from.
        # The header, headword line and definitions go on separate lines.
        if m.group(1) in ["adj", "partadj"]:
            adjinfltext = "===Adjective===\n{{head|ru|adjective form|head=%s%s}}\n%s\n\n" % (
                headterm, trtext, "\n".join(infleclines))
        else:
            adjinfltext = ""
        if m.group(1) in ["part", "partadj"]:
            partinfltext = "===Participle===\n{{head|ru|participle form|head=%s%s}}\n%s\n\n" % (
                headterm, trtext, "\n".join(infleclines))
        else:
            partinfltext = ""
        adjformtext = partinfltext + adjinfltext
        etymtext = ""
    else:
        if etym.startswith("acr:"):
            _, fullexpr, meaning = do_split(":", etym)
            etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
        elif etym.startswith("deverb:"):
            _, sourceterm = do_split(":", etym)
            etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
        elif etym.startswith("back:"):
            _, sourceterm = do_split(":", etym)
            etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
        elif etym.startswith("raw:"):
            # Normalize spacing after commas in the raw text.
            etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
        elif ":" in etym and "+" not in etym:
            # LANG:TERM with no '+' is a borrowing.
            if etym.startswith("?"):
                prefix = "Perhaps borrowed from "
                etym = re.sub(r"^\?", "", etym)
            elif etym.startswith("<<"):
                prefix = "Ultimately borrowed from "
                etym = re.sub(r"^<<", "", etym)
            else:
                prefix = "Borrowed from "
            m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
            if not m:
                error("Bad etymology form: %s" % etym)
            etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
        else:
            # '+'-separated components become an affix template.
            prefix = ""
            suffix = ""
            if etym.startswith("?"):
                prefix = "Perhaps from "
                suffix = "."
                etym = re.sub(r"^\?", "", etym)
            elif etym.startswith("<<"):
                prefix = "Ultimately from "
                suffix = "."
                etym = re.sub(r"^<<", "", etym)
            m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
            if m:
                langtext = "|lang1=%s" % m.group(1)
                etym = m.group(2)
            else:
                langtext = ""
            etymtext = "%s{{affix|ru|%s%s}}%s" % (
                prefix,
                "|".join(do_split(r"\+", re.sub(", *", ", ", etym))),
                langtext, suffix)
        etymbody = etymtext + "\n\n"
        etymtext = "===Etymology===\n" + etymbody

    if not etymtext:
        # NOTE(review): the message says "skipping" but processing continues;
        # left unchanged because with add_passive_of set the code below still
        # does useful work. Confirm whether a return is intended.
        pagemsg("No etymology text, skipping")

    # Load page
    page = pywikibot.Page(site, pagetitle)

    if not blib.try_repeatedly(lambda: page.exists(), pagemsg, "check page existence"):
        pagemsg("Page doesn't exist, can't add etymology")
        return

    pagemsg("Adding etymology")
    notes = []
    pagetext = unicode(page.text)

    # Split into sections
    splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
    # Extract off pagehead and recombine section headers with following text
    pagehead = splitsections[0]
    sections = []
    for i in xrange(1, len(splitsections)):
        if (i % 2) == 1:
            sections.append("")
        sections[-1] += splitsections[i]

    # Go through each section in turn, looking for existing Russian section
    for i in xrange(len(sections)):
        m = re.match("^==([^=\n]+)==$", sections[i], re.M)
        if not m:
            pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
        elif m.group(1) == "Russian":
            if override_etym:
                # NOTE(review): etymbody is not set for the '--' and
                # part/adj specs, so reaching the assignment below with one
                # of those would raise NameError.
                subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
                replaced_etym = False
                for j in xrange(2, len(subsections), 2):
                    if ("==Etymology==" in subsections[j - 1] or
                            "==Etymology 1==" in subsections[j - 1]):
                        subsections[j] = etymbody
                        replaced_etym = True
                        break
                if replaced_etym:
                    sections[i] = "".join(subsections)
                    # The text before the first language header must be
                    # kept, exactly as in the non-override path below.
                    newtext = pagehead + "".join(sections)
                    notes.append("replace Etymology section in Russian lemma with manually specified etymology")
                    break
                # No existing Etymology section: fall through and add one.

            if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
                errandpagemsg("WARNING: Already found etymology, skipping")
                return

            # subsections alternates text and headers; index 1 is the first
            # subsection header of the Russian section.
            subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
            insert_before = 1
            if (insert_before < len(subsections) and
                    "===Alternative forms===" in subsections[insert_before]):
                # Keep Alternative forms first; insert before the next header.
                insert_before += 2
            if insert_before >= len(subsections):
                # Nothing to insert before; skip rather than raise IndexError.
                errandpagemsg("WARNING: Can't find a subsection to insert the etymology before, skipping")
                return
            subsections[insert_before] = etymtext + subsections[insert_before]
            sections[i] = "".join(subsections)
            if add_passive_of:
                # Derive the active verb by stripping the reflexive ending.
                active_term = rulib.remove_monosyllabic_accents(
                    re.sub(u"с[яь]$", "", accented_term))
                # Append after the first run of definition lines only.
                sections[i] = re.sub(r"(^(#.*\n)+)",
                                     r"\1# {{passive of|lang=ru|%s}}\n" % active_term,
                                     sections[i], 1, re.M)

            newtext = pagehead + "".join(sections)
            notes.append("add (manually specified) Etymology section to Russian lemma")
            break
    else:
        # Loop finished without a break: no Russian section was handled.
        errandpagemsg("WARNING: Can't find Russian section, skipping")
        return

    if newtext != pagetext:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            # The new text must be assigned to the page before saving;
            # otherwise the unchanged page would be saved.
            page.text = newtext
            blib.safe_page_save(page, comment, errandpagemsg)
        else:
            pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
    """Canonicalize short-form adjective definitions in the Russian section.

    Existing ``{{inflection of|lang=ru}}`` templates whose form codes
    describe a short form are rewritten to ``short|GENDER|s`` or
    ``short|p``, and raw-text definitions such as "masculine short form of
    [[...]]" are replaced with an equivalent ``{{inflection of}}``
    template. Short-form text that could not be converted is reported with
    a warning.

    index   -- number used only to prefix log messages
    page    -- page object; ``title()``, ``text`` and ``save()`` are used
    save    -- when true the page is saved, otherwise the would-be edit
               summary is only logged
    verbose -- when true the full old and new page text are logged
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def first_match(patterns, string):
        # Try the patterns in order and return the first match, else None.
        for pattern in patterns:
            found = re.search(pattern, string)
            if found:
                return found
        return None

    def add_sing_inflection_of(m):
        # Regex callback. Groups: 1 = line prefix, 2 = gender word, 3 = lemma.
        gender_codes = {"masculine":"m", "male":"m", "feminine":"f",
                        "female":"f", "neuter":"n", "neutral":"n"}
        gender = gender_codes[m.group(2).lower()]
        replacement = m.group(1) + "{{inflection of|lang=ru|%s||short|%s|s}}" % (
            m.group(3), gender)
        pagemsg("Replaced <%s> with %s" % (m.group(0), replacement))
        notes.append("converted raw to 'inflection of' for short/%s/s" % gender)
        return replacement

    # Every ordering of gender / singular / short that is recognized as a
    # singular short form; group 1 is always the gender code.
    singular_short_patterns = (
        r"^([mfn])/(?:s|\(singular\))/short(?: form|)$",
        r"^(?:s|\(singular\))/([mfn])/short(?: form|)$",
        r"^short(?: form|)/([mfn])/(?:s|\(singular\))$",
        r"^short(?: form|)/(?:s|\(singular\))/([mfn])$",
        r"^([mfn])/short(?: form|)/(?:s|\(singular\))$",
        r"^(?:s|\(singular\))/short(?: form|)/([mfn])$",
        r"^([mfn])/short(?: form|)$",
        r"^short(?: form|)/([mfn])$",
    )
    plural_short_patterns = (
        r"^(?:p|\(plural\))/short(?: form|)$",
        r"^short(?: form|)/(?:p|\(plural\))$",
    )

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []
    already_canonicalized = False
    found_short_inflection_of = False
    warned_about_short = False
    seen_russian = False

    # Odd indices hold "==Language==" headers, the next even index the body.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for sec in xrange(2, len(sections), 2):
        if sections[sec - 1] != "==Russian==\n":
            continue
        if seen_russian:
            pagemsg("WARNING: Found multiple Russian sections, skipping page")
            return
        seen_russian = True

        # Try to canonicalize existing 'inflection of'
        parsed = blib.parse_text(sections[sec])
        for tmpl in parsed.filter_templates():
            if not (unicode(tmpl.name) == "inflection of"
                    and getparam(tmpl, "lang") == "ru"):
                continue

            # Params 3 through 19, with trailing blanks trimmed off.
            form_codes = [getparam(tmpl, str(num)) for num in xrange(3, 20)]
            while len(form_codes) > 0 and not form_codes[-1]:
                del form_codes[-1]
            joined_codes = "/".join(form_codes)

            canon_params = []
            singular = first_match(singular_short_patterns, joined_codes)
            if singular:
                found_short_inflection_of = True
                canon_params = ["short", singular.group(1), "s"]
            elif first_match(plural_short_patterns, joined_codes):
                found_short_inflection_of = True
                canon_params = ["short", "p"]
            elif "short" in form_codes or "short form" in form_codes:
                found_short_inflection_of = True
                warned_about_short = True
                pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
                        unicode(tmpl))

            if not canon_params:
                continue

            before = unicode(tmpl)
            # Fetch params 1 and 2, erase all numbered params, then put 1
            # and 2 back (this places them after lang=ru) followed by the
            # canonical form codes.
            arg1 = getparam(tmpl, "1")
            arg2 = getparam(tmpl, "2")
            for num in xrange(19, 0, -1):
                rmparam(tmpl, str(num))
            tmpl.add("1", arg1)
            tmpl.add("2", arg2)
            for offset, code in enumerate(canon_params):
                tmpl.add(str(offset + 3), code)
            after = unicode(tmpl)
            if before != after:
                pagemsg("Replaced %s with %s" % (before, after))
                notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
            else:
                pagemsg("Apparently already canonicalized: %s" % after)
                already_canonicalized = True
        sections[sec] = unicode(parsed)

        # Try to add 'inflection of' to raw-specified singular inflection
        converted = re.sub(r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*",
                           add_sing_inflection_of, sections[sec], 0, re.I)
        if converted != sections[sec]:
            found_short_inflection_of = True
        sections[sec] = converted

        # Anything still mentioning "short" with no template found or
        # created is reported so that it can be fixed by hand.
        if "short" in sections[sec] and not found_short_inflection_of:
            leftover = re.search("^(.*short.*)$", sections[sec], re.M)
            warned_about_short = True
            pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
                    (leftover.group(1) if leftover else "Can't get line?"))

    new_text = "".join(sections)

    if new_text != text:
        if verbose:
            pagemsg("Replacing <%s> with <%s>" % (text, new_text))
        # Any textual change is expected to have produced at least one note.
        assert notes
        comment = "; ".join(blib.group_notes(notes))
        if save:
            pagemsg("Saving with comment = %s" % comment)
            page.text = new_text
            page.save(comment=comment)
        else:
            pagemsg("Would save with comment = %s" % comment)

    if not notes and not already_canonicalized:
        pagemsg("Skipping, no short form found%s" % (
            " (warning issued)" if warned_about_short else " (no warning)"))
def process_page(index, page, save, verbose): pagetitle = unicode(page.title()) subpagetitle = re.sub("^.*:", "", pagetitle) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") if ":" in pagetitle: pagemsg("WARNING: Colon in page title, skipping page") return text = unicode(page.text) notes = [] foundrussian = False sections = re.split("(^==[^=]*==\n)", text, 0, re.M) def check_for_translation_italics(val, orig): val = val.replace("'', ''", ", ") if re.search("(?<!')''(?!')", val): pagemsg("WARNING: Italics in translation <<%s>>: <<%s>>" % (val, orig)) return val def check_for_stray_vertical_bar(val): split_on_paired_brackets_braces = re.split(r"\[\[[^\[\]]*\]\]|\{\{[^{}]*\}\}", val) for outside_bracket_brace in split_on_paired_brackets_braces: if "|" in outside_bracket_brace: pagemsg("WARNING: Stray vertical bar in Russian or English, can't handle: <<%s>>" % val) return True return False for j in xrange(2, len(sections), 2): if sections[j-1] == "==Russian==\n": if foundrussian: pagemsg("WARNING: Found multiple Russian sections, skipping page") return foundrussian = True # Try to convert multi-line usex using #: def multi_line_usex(m): ru, tr, en = m.groups() en = check_for_translation_italics(en, m.group(0)) if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(tr) or check_for_stray_vertical_bar(en): return m.group(0) retval = "#: {{ru-ux|%s|tr=%s|%s}}" % (ru, tr, en) pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval)) notes.append("converted raw multi-line usex to 'ru-ux'") return retval sections[j] = re.sub(r"^#: \{\{lang\|ru\|([^\n{}]*?)\}\}\n#:: (.*)\n#:::? 
(.*)$", multi_line_usex, sections[j], 0, re.M) # Try to convert multi-line usex using #* def multi_line_usex_hidden(m): ru, tr, en = m.groups() en = check_for_translation_italics(en, m.group(0)) if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(tr) or check_for_stray_vertical_bar(en): return m.group(0) retval = "%s#*: {{ru-ux|%s|tr=%s|%s}}" % (ru, tr, en) pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval)) notes.append("converted raw multi-line hidden usex to 'ru-ux'") return retval sections[j] = re.sub(r"^(#\* .*?\n)#\*: \{\{lang\|ru\|([^\n{}]*?)\}\}\n#\*:: ([^{}\n]*)\n#\*:::? ([^{}\n]*)$", multi_line_usex_hidden, sections[j], 0, re.M) # Try to convert single-line usex that uses {{lang}}, {{l}} or {{m}} for tempname in ["lang", "l", "m"]: def single_line_usex_lang_l_m(m): ru, en = m.groups() en = check_for_translation_italics(en, m.group(0)) if tempname == "lang" or "[" in ru: if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(en): return m.group(0) retval = "#: {{ru-ux|%s|%s|inline=y}}" % (ru, en) else: if "|tr=" in ru: pagemsg("WARNING: Found |tr= in link, can't handle: <<%s>>" % m.group(0)) return m.group(0) # A single vertical bar in ru is allowed here; it will be handled # correctly because we wrap it in a raw link if check_for_stray_vertical_bar(en): return m.group(0) retval = "#: {{ru-ux|[[%s]]|%s|inline=y}}" % (ru, en) pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval)) notes.append("converted raw single-line usex using {{%s}} to 'ru-ux'" % tempname) return retval # Version with ''...'' around the translation; do this first in case # we have bold (''') around the first Russian word and italics ('') # around the translation; in the opposite order, the bold will get # treated as italics sections[j] = re.sub(ur"^#:\*? 
\{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\ )(?:—|\—)(?: |\ )''(.*?)''$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M) # Version with ''...'' around the whole thing sections[j] = re.sub(ur"^#:\*? ''\{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\ )(?:—|\—)(?: |\ )(.*?)''$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M) # Version without ''...'' sections[j] = re.sub(ur"^#:\*? \{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\ )(?:—|\—)(?: |\ )(.*?)$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M) # Try to convert single-line usex that is raw, maybe allowing braces # in the right side for allow_braces_on_right in [False, True]: maybe_exclude_braces = "" if allow_braces_on_right else "{}" allow_braces_msg = ", allowing braces on right side" if allow_braces_on_right else "" def single_line_usex_raw(m): ru, en = m.groups() en = check_for_translation_italics(en, m.group(0)) if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(en): return m.group(0) retval = "#: {{ru-ux|%s|%s|inline=y}}" % (ru, en) pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval)) notes.append("converted pure raw single-line usex to 'ru-ux'%s" % allow_braces_msg) return retval # Version with ''...'' around the translation; do this first in case # we have bold (''') around the first Russian word and italics ('') # around the translation; in the opposite order, the bold will get # treated as italics sections[j] = re.sub(ur"^#:\*? ([^{}\n]*)(?: |\ )(?:—|-|\—)(?: |\ )''([^%s\n]*?)''$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M) # Version with ''...'' around the whole thing; the expression after # the '' is a disjunctive lookahead expression and says "(two single # quotes) either followed by 3 more quotes (combination bold+italic) # or not followed by any quote (to exclude bold = ''') sections[j] = re.sub(ur"^#:\*? 
''(?:(?!')|(?='''))([^{}\n]*)(?: |\ )(?:—|-|\—)(?: |\ )([^%s\n]*?)''$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M) # Version without ''...'' sections[j] = re.sub(ur"^#:\*? ([^{}\n]*)(?: |\ )(?:—|-|\—)(?: |\ )([^%s\n]*?)$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M) new_text = "".join(sections) if new_text != text: if verbose: pagemsg("Replacing <<%s>> with <<%s>>" % (text, new_text)) assert notes comment = "; ".join(blib.group_notes(notes)) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Convert {{head|ru|noun form}} to {{ru-noun form}} and canonicalize it.

  For each {{head|ru|noun form}} template, params 1 and 2 are dropped, the
  template is renamed to "ru-noun form", and head/g are re-added as
  positional params 1/2 followed by head2, g2, g3, tr, tr2. Existing
  {{ru-noun form}} templates have head=/g= folded into 1=/2= and their
  params put back in that same order.

  Processing of the whole page is abandoned (with a logged warning) if the
  title contains a colon, if a matching {{head}} has a param 3, if a
  {{ru-noun form}} has both 1= and head= or both 2= and g=, or if any param
  other than the recognized ones remains on a template.

  index -- value shown at the start of every log message for this page.
  page -- page object; page.title() and page.text are read and, when
      saving, page.text is assigned and page.save(comment=...) is called.
  save -- if true, save the modified text; otherwise only log.
  verbose -- if true, log the full old and new text when they differ.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  def put_back_params(tmpl, head, head2, g, g2, g3, tr, tr2):
    # Param 1 must be present (even if empty) whenever a gender is given,
    # since the gender goes into positional param 2.
    if head or g:
      tmpl.add("1", head)
    ordered = [("head2", head2), ("2", g), ("g2", g2), ("g3", g3),
        ("tr", tr), ("tr2", tr2)]
    for pname, pval in ordered:
      if pval:
        tmpl.add(pname, pval)

  def record_if_changed(tmpl, before, note):
    after = unicode(tmpl)
    if before != after:
      pagemsg("Replaced %s with %s" % (before, after))
      notes.append(note)

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    before = unicode(t)
    tname = unicode(t.name)
    if tname == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "noun form":
      if getparam(t, "3"):
        pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s" % unicode(t))
        return
      rmparam(t, "1")
      rmparam(t, "2")
      head = getrmparam(t, "head")
      head2 = getrmparam(t, "head2")
      tr = getrmparam(t, "tr")
      tr2 = getrmparam(t, "tr2")
      g = getrmparam(t, "g")
      g2 = getrmparam(t, "g2")
      g3 = getrmparam(t, "g3")
      # Anything still left on the template is a param we don't know how
      # to carry over.
      if t.params:
        pagemsg("WARNING: Extra params in noun form template: %s" % unicode(t))
        return
      t.name = "ru-noun form"
      put_back_params(t, head, head2, g, g2, g3, tr, tr2)
      record_if_changed(t, before,
          "convert {{head|ru|noun form}} to {{ru-noun form}}")
    elif tname == "ru-noun form":
      if getparam(t, "head") and getparam(t, "1"):
        pagemsg("WARNING: ru-noun form has both params 1= and head=: %s" % unicode(t))
        return
      if getparam(t, "g") and getparam(t, "2"):
        pagemsg("WARNING: ru-noun form has both params 2= and g=: %s" % unicode(t))
        return
      # The `or` short-circuits: the named param is only removed when the
      # positional one is empty.
      head = getrmparam(t, "1") or getrmparam(t, "head")
      head2 = getrmparam(t, "head2")
      tr = getrmparam(t, "tr")
      tr2 = getrmparam(t, "tr2")
      g = getrmparam(t, "2") or getrmparam(t, "g")
      g2 = getrmparam(t, "g2")
      g3 = getrmparam(t, "g3")
      if t.params:
        pagemsg("WARNING: Extra params in noun form template: %s" % unicode(t))
        return
      put_back_params(t, head, head2, g, g2, g3, tr, tr2)
      record_if_changed(t, before, "canonicalize ru-noun form")

  new_text = unicode(parsed)

  if new_text == text:
    return
  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Rename reference-*/usenet citation templates on a page and fix params.

  The page text is split on section headers. In a References section, and
  inside <ref> tags elsewhere, reference-journal/reference-news become
  cite-journal and reference-book becomes cite-book. Outside References
  sections, templates listed in the module-level `simple_replace` pairs are
  renamed, reference-journal/reference-news become quote-journal,
  reference-book becomes quote-book, and cite-usenet/quote-usenet become
  quote-newsgroup. Various params are renamed or cleaned up along the way
  (see the nested helpers).

  The page is skipped with a warning if it does not exist, or if its title
  has a colon and does not start with one of the explicitly allowed
  namespace prefixes.

  index -- value shown at the start of every log message for this page.
  page -- page object; page.exists(), page.title() and page.text are read
      and, when saving, page.text is assigned and page.save(comment=...) is
      called.
  save -- if true, save the modified text; otherwise only log.
  verbose -- if true, log the full old and new text when they differ.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # The capturing group keeps headers in the result: bodies are at even
  # indices, headers at odd indices.
  subsections = re.split("(^==.*==\n)", text, 0, re.M)

  def move_param(t, fr, to, frob_from=None):
    # Rename param FR of template T to TO, optionally transforming the value
    # with FROB_FROM (a falsy/blank result from it cancels the move). A blank
    # FR is simply removed; a non-blank existing TO blocks the move with a
    # warning.
    if t.has(fr):
      oldval = getparam(t, fr)
      if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return
      if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
          return
      else:
        newval = oldval

      if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
            % (fr, to, to, unicode(t)))
      elif oldval != newval:
        rmparam(t, to) # in case of blank param
        # If either old or new name is a number, use remove/add to automatically set the
        # showkey value properly; else it's safe to just change the name of the param,
        # which will preserve its location.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          tfr = t.get(fr)
          tfr.name = to
          tfr.value = newval
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
          newval.replace("\n", r"\n")))
      else:
        rmparam(t, to) # in case of blank param
        # See comment above.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          t.get(fr).name = to
        pagemsg("%s -> %s" % (fr, to))

  def fix_page_params(t):
    # Strip a leading "p." / "pp." from page= and pages=, then move the
    # value between page= and pages= depending on its shape. Returns True
    # if the template text changed.
    origt = unicode(t)
    for param in ["page", "pages"]:
      pageval = getparam(t, param)
      if re.search(r"^\s*pp?\.\s*", pageval):
        pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
        t.add(param, pageval)
        notes.append("remove p(p). from %s=" % param)
        pagemsg("remove p(p). from %s=" % param)
    if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
      move_param(t, "pages", "page")
    # NOTE(review): this byte-string pattern has non-ASCII dashes in a
    # character class but is matched against unicode text -- confirm it
    # behaves as intended under Python 2.
    if re.search(r"^[0-9]+[-–—]$", getparam(t, "page").strip()):
      move_param(t, "page", "pages")
    return origt != unicode(t)

  def fix_cite_book_params(t):
    # Normalize book-citation params: year/origyear -> year_published/year,
    # origdate -> date, origmonth -> month, id -> isbn, plus page fixes.
    # Returns True if the template text changed.
    origt = unicode(t)
    if getparam(t, "origyear").strip() and getparam(t, "year").strip():
      if getparam(t, "year_published"):
        pagemsg("WARNING: Would set year_published= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "year_published") # in case of blank param
        # Rename year first so that origyear can then take over the name.
        t.get("year").name = "year_published"
        t.get("origyear").name = "year"
        pagemsg("year -> year_published, origyear -> year")
    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    def frob_isbn(idval):
      # Strip a leading "ISBN"-style label; accept a value that already
      # starts with a digit; otherwise warn and cancel the move.
      isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
      if re.search(isbn_re, idval, re.I):
        return re.sub(isbn_re, r"\1", idval, 0, re.I)
      elif re.search(r"^[0-9]", idval.strip()):
        return idval
      else:
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
            idval.replace("\n", r"\n"))
        return None
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)
    return origt != unicode(t)

  def fix_cite_usenet_params(t):
    # Rename group= -> newsgroup= and link= -> url=. Returns True if the
    # template text changed.
    origt = unicode(t)
    move_param(t, "group", "newsgroup")
    move_param(t, "link", "url")
    return origt != unicode(t)

  def fix_quote_usenet_params(t):
    # Merge monthday= and year= into date=, rename group=/text=, and turn
    # positional params 1-6 into named ones. Returns True if the template
    # text changed.
    origt = unicode(t)
    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
      if getparam(t, "date"):
        pagemsg("WARNING: Would set date= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "date") # in case of blank param
        param = t.get("monthday")
        param.name = "date"
        # A numeric "N/N" monthday gets the year appended with a slash;
        # anything else gets it appended after a space.
        if re.search("^[0-9]+/[0-9]+$", monthday):
          param.value = "%s/%s" % (monthday, year)
        else:
          param.value = "%s %s" % (monthday, year)
        rmparam(t, "year")
        pagemsg("monthday/year -> date")
    move_param(t, "group", "newsgroup")
    move_param(t, "text", "passage")
    # Positional params are moved highest-first.
    move_param(t, "6", "passage")
    move_param(t, "5", "url")
    move_param(t, "4", "newsgroup")
    move_param(t, "3", "title")
    move_param(t, "2", "author")
    move_param(t, "1", "date")
    return origt != unicode(t)

  def replace_in_reference(parsed, in_what):
    # Rename reference-* templates in PARSED to their cite-* equivalents.
    # IN_WHAT is only used in log messages.
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "cite-journal", tname)
        pagemsg("%s -> cite-journal" % tname.strip())
        notes.append("%s -> cite-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s in %s" %
          (origt, unicode(t), in_what))
      if tname.strip() == "reference-book":
        set_template_name(t, "cite-book", tname)
        pagemsg("reference-book -> cite-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> cite-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s in %s" %
          (origt, unicode(t), in_what))

  for j in xrange(0, len(subsections), 2):
    parsed = blib.parse_text(subsections[j])
    if j > 0 and re.search(r"^===*References===*\n", subsections[j-1]):
      replace_in_reference(parsed, "==References== section")
      subsections[j] = unicode(parsed)
    else:
      # NOTE(review): everything below is treated as applying only outside
      # References sections (consistent with the "outside of reference
      # section" log messages) -- confirm against the upstream layout.
      for t in parsed.filter_tags():
        if unicode(t.tag) == "ref":
          # Wrap the tag so its templates can be filtered on their own; the
          # nodes are shared with PARSED, so changes show up there too.
          tagparsed = mw.wikicode.Wikicode([t])
          replace_in_reference(tagparsed, "<ref>")
          subsections[j] = unicode(parsed)
      need_to_replace_double_quote_prefixes = False
      for t in parsed.filter_templates():
        tname = unicode(t.name)
        origt = unicode(t)
        for fr, to in simple_replace:
          if tname.strip() == fr:
            set_template_name(t, to, tname)
            pagemsg("%s -> %s" % (fr, to))
            notes.append("%s -> %s" % (fr, to))
            fix_page_params(t)
            pagemsg("Replacing %s with %s" % (origt, unicode(t)))
        if tname.strip() in ["reference-journal", "reference-news"]:
          set_template_name(t, "quote-journal", tname)
          pagemsg("%s -> quote-journal" % tname.strip())
          notes.append("%s -> quote-journal" % tname.strip())
          fix_page_params(t)
          pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
        if tname.strip() == "reference-book":
          set_template_name(t, "quote-book", tname)
          # The log message and edit-summary note name the template that is
          # actually substituted here (quote-book, not cite-book).
          pagemsg("reference-book -> quote-book")
          fixed_params = fix_cite_book_params(t)
          notes.append("reference-book -> quote-book%s" % (
            fixed_params and " and fix book cite params" or ""))
          pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
        if tname.strip() in ["cite-usenet", "quote-usenet"]:
          if tname.strip() == "cite-usenet":
            fixed_params = fix_cite_usenet_params(t)
          else:
            fixed_params = fix_quote_usenet_params(t)
          set_template_name(t, "quote-newsgroup", tname)
          pagemsg("%s -> quote-newsgroup" % tname.strip())
          prefix = getparam(t, "prefix").strip()
          removed_prefix = False
          if prefix:
            if prefix in ["#", "#*"]:
              # Replace prefix= with a literal "#* " in front of the
              # template; a resulting doubled "#* #* " is collapsed below.
              parsed.insert_before(t, "#* ")
              rmparam(t, "prefix")
              pagemsg("remove prefix=%s, insert #* before template" % prefix)
              need_to_replace_double_quote_prefixes = True
              removed_prefix = True
            else:
              pagemsg("WARNING: Found prefix=%s, not # or #*: %s" %
                (prefix, unicode(t)))
          notes.append("%s -> quote-newsgroup%s%s" % (tname.strip(),
            removed_prefix and
              ", remove prefix=%s, insert #* before template" % prefix or "",
            fixed_params and ", fix params" or ""))
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
      subsections[j] = unicode(parsed)
      if need_to_replace_double_quote_prefixes:
        newval = re.sub("^#\* #\* ", "#* ", subsections[j], 0, re.M)
        if newval != subsections[j]:
          notes.append("remove double #* prefix")
          pagemsg("Removed double #* prefix")
        subsections[j] = newval

  newtext = "".join(subsections)

  if text != newtext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, newtext))
    # Every rename above appends a note, so changed text with no notes
    # indicates a logic error.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)