def rewrite_pages(refrom, reto, refs, cat, pages, pagefile, pagetitle_sub,
                  comment, filter_pages, save, verbose, startFrom, upTo):
    """Apply a series of regex substitutions to a set of wiki pages.

    The pages to edit come from the first non-empty source among ``pages``,
    ``pagefile`` and ``refs``; if none is given, ``cat`` is used.

    Args:
        refrom: Sequence of regex patterns, applied in order with ``re.sub``.
        reto: Sequence of replacement strings, parallel to ``refrom``.
        refs: Passed to ``blib.references`` to enumerate pages.
        cat: Passed to ``blib.cat_articles`` to enumerate pages.
        pages: Iterable of page titles.
        pagefile: Path of a UTF-8 file with one page title per line.
        pagetitle_sub: If non-empty, every occurrence of this string in a
            pattern/replacement is substituted with the page title
            (regex-escaped in the pattern, verbatim in the replacement).
        comment: Change-log text; when empty, one is generated from the
            from/to pairs.
        filter_pages: If non-empty, a regex; pages whose title does not
            match (``re.search``) are skipped.
        save, verbose: Forwarded to ``blib.do_edit``.
        startFrom, upTo: Forwarded to the ``blib`` page iterators.
    """
    def rewrite_one_page(page, index, text):
        text = unicode(text)
        text = reorder_shadda(text)
        # The pairs are walked twice (once to substitute, once to build the
        # default change log), so make sure they are a real list.
        zipped_fromto = list(zip(refrom, reto))
        for fromval, toval in zipped_fromto:
            if pagetitle_sub:
                pagetitle = unicode(page.title())
                fromval = fromval.replace(pagetitle_sub, re.escape(pagetitle))
                toval = toval.replace(pagetitle_sub, pagetitle)
            text = re.sub(fromval, toval, text)
        return text, comment or "replace %s" % (
            ", ".join("%s -> %s" % (f, t) for f, t in zipped_fromto))

    if pages:
        pages = ((pywikibot.Page(blib.site, page), index)
                 for page, index in blib.iter_pages(pages, startFrom, upTo))
    elif pagefile:
        # Read the whole list up front so the file is closed before editing.
        with codecs.open(pagefile, "r", "utf-8") as infile:
            lines = [x.strip() for x in infile]
        pages = ((pywikibot.Page(blib.site, page), index)
                 for page, index in blib.iter_pages(lines, startFrom, upTo))
    elif refs:
        pages = blib.references(refs, startFrom, upTo, includelinks=True)
    else:
        pages = blib.cat_articles(cat, startFrom, upTo)

    for page, index in pages:
        pagetitle = unicode(page.title())
        if filter_pages and not re.search(filter_pages, pagetitle):
            blib.msg("Skipping %s because doesn't match --filter-pages regex %s" %
                     (pagetitle, filter_pages))
        else:
            if verbose:
                blib.msg("Processing %s" % pagetitle)
            blib.do_edit(page, index, rewrite_one_page, save=save,
                         verbose=verbose)
def do_pages(createfn, iterfn=iter_pages):
    """Create each page produced by ``iterfn(createfn)``.

    ``iterfn(createfn)`` must yield ``(pagename, text, changelog)`` triples.
    In offline mode (``params.offline``) the text and change log are only
    printed; otherwise the page is created unless it already exists.
    """
    entries = blib.iter_pages(iterfn(createfn), startFrom, upTo,
                              key=lambda x: x[0])
    for (pagename, text, changelog), index in entries:
        # The page is stored under its diacritic-free title.
        pagetitle = remove_diacritics(pagename)

        if params.offline:
            msg("Text for %s: [[%s]]" % (pagename, text))
            msg("Changelog = %s" % changelog)
            continue

        page = pywikibot.Page(site, pagetitle)
        if page.exists():
            msg("Page %s %s: WARNING, page already exists, skipping" % (index, pagename))
            continue

        def save_text(page, index, parsed):
            # Ignore whatever is on the page and supply the generated text.
            return text, changelog

        blib.do_edit(page, index, save_text, save=params.save,
                     verbose=params.verbose)
def undo_greek_removal(save, verbose, direcfile, startFrom, upTo):
    """Re-add template parameters whose removal is recorded in ``direcfile``.

    Each line of ``direcfile`` (UTF-8) is expected to look like
    ``* [[PAGE]]: Removed PARAM=...: <nowiki>TEMPLATE</nowiki>``; lines that
    do not match are reported and skipped.

    Args:
        save, verbose: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_removals = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            m = re.match(r"\* \[\[(.*?)]]: Removed (.*?)=.*?: <nowiki>(.*?)</nowiki>$", line)
            if not m:
                msg("WARNING: Unable to parse line: [%s]" % line)
            else:
                template_removals.append(m.groups())

    for current, index in blib.iter_pages(template_removals, startFrom, upTo,
                                          # key is the page name
                                          key=lambda x: x[0]):
        pagename, removed_param, template_text = current

        def undo_one_page_greek_removal(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            # Three spellings of the template are derived from the logged one:
            #   orig_template: exactly as logged
            #   to_template:   as logged minus sc=polytonic (what we restore)
            #   from_template: additionally minus the removed parameter
            #                  (what we expect to find on the page now)
            template = blib.parse_text(template_text).filter_templates()[0]
            orig_template = unicode(template)
            if getparam(template, "sc") == "polytonic":
                template.remove("sc")
            to_template = unicode(template)
            param_value = getparam(template, removed_param)
            template.remove(removed_param)
            from_template = unicode(template)

            text = unicode(text)
            found_orig_template = orig_template in text
            newtext = text.replace(from_template, to_template)
            changelog = ""
            if newtext == text:
                if not found_orig_template:
                    pagemsg("WARNING: Unable to locate 'from' template when undoing Greek param removal: %s"
                            % from_template)
                else:
                    pagemsg("Original template found, taking no action")
            else:
                if found_orig_template:
                    pagemsg("WARNING: Undid removal, but original template %s already present!"
                            % orig_template)
                # A single replacement changes the length by exactly the
                # difference between the two template strings.
                if len(newtext) - len(text) != len(to_template) - len(from_template):
                    pagemsg("WARNING: Length mismatch when undoing Greek param removal, may have matched multiple templates: from=%s, to=%s" % (
                        from_template, to_template))
                changelog = "Undid removal of %s=%s in %s" % (removed_param, param_value, to_template)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" % (
                index, pagename))
        else:
            blib.do_edit(page, index, undo_one_page_greek_removal, save=save,
                         verbose=verbose)
def parse_log_file(fn, startFrom, upTo):
    """Re-emit the lines of log file ``fn``, renumbering the ``Page`` lines.

    ``yield_page_lines(fn)`` supplies ``(pageindex, pagename, lines)``
    triples, filtered through ``blib.iter_pages`` keyed on the page name.
    A line of the form ``Page N REST`` is printed as
    ``Page <pageindex>/N REST``; every other line is printed unchanged.
    """
    page_line_re = re.compile(r"^Page ([0-9/.-]+) (.*)$")
    groups = blib.iter_pages(yield_page_lines(fn), startFrom, upTo,
                             key=lambda x: x[1])
    for (pageindex, pagename, lines), index in groups:
        for line in lines:
            found = page_line_re.match(line)
            if not found:
                msg(line)
                continue
            old_index, remainder = found.groups()
            msg("Page %s/%s %s" % (pageindex, old_index, remainder))
def undo_ru_auto_accent(save, verbose, direcfile, startFrom, upTo):
    """Revert logged template replacements in ux/usex/ru-ux/lang templates.

    Each line of ``direcfile`` (UTF-8) is expected to look like
    ``Page N PAGENAME: Replaced {{ORIG}} with {{REPL}}``; lines that do not
    match are reported and skipped. Only entries whose original template
    starts with ``{{ux|``, ``{{usex|``, ``{{ru-ux|`` or ``{{lang|`` are
    processed.

    Args:
        save, verbose: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_removals = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            m = re.search(r"^Page [0-9]+ (.*?): Replaced (\{\{.*?\}\}) with (\{\{.*?\}\})$", line)
            if not m:
                msg("WARNING: Unable to parse line: [%s]" % line)
            else:
                template_removals.append(m.groups())

    for current, index in blib.iter_pages(template_removals, startFrom, upTo,
                                          # key is the page name
                                          key=lambda x: x[0]):
        pagename, orig_template, repl_template = current
        if not re.search(r"^\{\{(ux|usex|ru-ux|lang)\|", orig_template):
            continue

        def undo_one_page_ru_auto_accent(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            text = unicode(text)
            # Only act when the replacement template sits at the start of a
            # "#*" line (optionally followed by colons and spaces).
            if not re.search(r"^#\*:* *%s" % re.escape(repl_template), text, re.M):
                return None, ""
            found_orig_template = orig_template in text
            newtext = text.replace(repl_template, orig_template)
            changelog = ""
            if newtext == text:
                if not found_orig_template:
                    pagemsg("WARNING: Unable to locate 'repl' template when undoing Russian auto-accenting: %s"
                            % repl_template)
                else:
                    pagemsg("Original template found, taking no action")
            else:
                pagemsg("Replaced %s with %s" % (repl_template, orig_template))
                if found_orig_template:
                    pagemsg("WARNING: Undid replacement, but original template %s already present!"
                            % orig_template)
                # A single replacement changes the length by exactly the
                # difference between the two template strings.
                if len(newtext) - len(text) != len(orig_template) - len(repl_template):
                    pagemsg("WARNING: Length mismatch when undoing Russian auto-accenting, may have matched multiple templates: orig=%s, repl=%s" % (
                        orig_template, repl_template))
                changelog = "Undid auto-accenting (per Wikitiki89) of %s" % (orig_template)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" % (
                index, pagename))
        else:
            blib.do_edit(page, index, undo_one_page_ru_auto_accent, save=save,
                         verbose=verbose)
def find_russian_need_vowels(find_accents, cattype, direcfile, save, verbose,
                             startFrom, upTo):
    """Run ``process_template`` over Russian templates.

    With ``direcfile``, the templates come from the "Processing:" lines of
    that UTF-8 log file and each is fed to ``blib.process_links`` as page
    text. Without it, ``blib.process_links`` is run over ``cattype``.

    Args:
        find_accents, verbose: Forwarded to ``process_template``.
        cattype: Forwarded to ``blib.process_links`` when no directive file
            is given.
        direcfile: Path of the directive file, or a false value.
        save: Forwarded to ``blib.process_links``.
        startFrom, upTo: Page range; forwarded to ``blib.iter_pages`` (keyed
            on the page name) or to ``blib.process_links``.
    """
    if direcfile:
        processing_lines = []
        # Use a context manager so the directive file is closed once parsed.
        with codecs.open(direcfile, "r", encoding="utf-8") as infile:
            for line in infile:
                line = line.strip()
                m = re.match(r"^(Page [^ ]+ )(.*?)(: .*?:) Processing: (\{\{.*?\}\})( <- \{\{.*?\}\} \(\{\{.*?\}\}\))$",
                             line)
                # Lines in any other format are silently ignored.
                if m:
                    processing_lines.append(m.groups())

        for current, index in blib.iter_pages(processing_lines, startFrom, upTo,
                                              # key is the page name
                                              key=lambda x: x[1]):
            pagenum, pagename, tempname, repltext, rest = current

            def pagemsg(text):
                # Uses pagename: no `pagetitle` is bound in this scope (it
                # is only a parameter of the callback below).
                msg("Page %s(%s) %s: %s" % (pagenum, index, pagename, text))

            def check_template_for_missing_accent(pagetitle, index, template,
                                                  ruparam, trparam):
                def output_line(directive):
                    msg("* %s[[%s]]%s %s: <nowiki>%s%s</nowiki>" % (
                        pagenum, pagename, tempname, directive,
                        unicode(template), rest))
                return process_template(pagetitle, index, template, ruparam,
                                        trparam, output_line, find_accents,
                                        verbose)

            blib.process_links(save, verbose, "ru", "Russian", "pagetext", None,
                               None, check_template_for_missing_accent,
                               join_actions=join_changelog_notes,
                               split_templates=None,
                               pages_to_do=[(pagename, repltext)], quiet=True)
            if index % 100 == 0:
                output_stats(pagemsg)
    else:
        def check_template_for_missing_accent(pagetitle, index, template,
                                              ruparam, trparam):
            def pagemsg(text):
                msg("Page %s %s: %s" % (index, pagetitle, text))

            def output_line(directive):
                pagemsg("%s: %s" % (directive, unicode(template)))

            result = process_template(pagetitle, index, template, ruparam,
                                      trparam, output_line, find_accents,
                                      verbose)
            if index % 100 == 0:
                output_stats(pagemsg)
            return result

        blib.process_links(save, verbose, "ru", "Russian", cattype, startFrom,
                           upTo, check_template_for_missing_accent,
                           join_actions=join_changelog_notes,
                           split_templates=None)
for ten in sorted(cardinal_tens.keys())[:-1]: # Skip 100 for one in sorted(cardinal_ones.keys())[1:]: # Skip 0 yield ten + one def iter_specified_numerals(spec): for singlespec in re.split(",", spec): if "-" in singlespec: fro, to = re.split("-", singlespec) for num in range(int(fro), int(to) + 1): yield num else: yield int(singlespec) if params.numerals: pages = iter_specified_numerals(params.numerals) else: pages = iter_numerals() for current, index in blib.iter_pages(pages, startFrom, upTo, key=lambda x: str(x)): if params.offline: print "========== Text for #%s: ==========" % current print "" print generate_page(current).encode('utf-8') print "" else: process_page(index, current, params.save, params.verbose, params)
def undo_greek_removal(save, verbose, direcfile, startFrom, upTo):
    """Re-add template parameters whose removal is recorded in ``direcfile``.

    Each line of ``direcfile`` (UTF-8) is expected to look like
    ``* [[PAGE]]: Removed PARAM=...: <nowiki>TEMPLATE</nowiki>``; lines that
    do not match are reported and skipped.

    Args:
        save, verbose: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_removals = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            m = re.match(
                r"\* \[\[(.*?)]]: Removed (.*?)=.*?: <nowiki>(.*?)</nowiki>$",
                line)
            if not m:
                msg("WARNING: Unable to parse line: [%s]" % line)
            else:
                template_removals.append(m.groups())

    for current, index in blib.iter_pages(
            template_removals,
            startFrom,
            upTo,
            # key is the page name
            key=lambda x: x[0]):
        pagename, removed_param, template_text = current

        def undo_one_page_greek_removal(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            # Three spellings of the template are derived from the logged one:
            #   orig_template: exactly as logged
            #   to_template:   as logged minus sc=polytonic (what we restore)
            #   from_template: additionally minus the removed parameter
            #                  (what we expect to find on the page now)
            template = blib.parse_text(template_text).filter_templates()[0]
            orig_template = unicode(template)
            if getparam(template, "sc") == "polytonic":
                template.remove("sc")
            to_template = unicode(template)
            param_value = getparam(template, removed_param)
            template.remove(removed_param)
            from_template = unicode(template)

            text = unicode(text)
            found_orig_template = orig_template in text
            newtext = text.replace(from_template, to_template)
            changelog = ""
            if newtext == text:
                if not found_orig_template:
                    pagemsg(
                        "WARNING: Unable to locate 'from' template when undoing Greek param removal: %s"
                        % from_template)
                else:
                    pagemsg("Original template found, taking no action")
            else:
                if found_orig_template:
                    pagemsg(
                        "WARNING: Undid removal, but original template %s already present!"
                        % orig_template)
                # A single replacement changes the length by exactly the
                # difference between the two template strings.
                if len(newtext) - len(text) != len(to_template) - len(
                        from_template):
                    pagemsg(
                        "WARNING: Length mismatch when undoing Greek param removal, may have matched multiple templates: from=%s, to=%s"
                        % (from_template, to_template))
                changelog = "Undid removal of %s=%s in %s" % (
                    removed_param, param_value, to_template)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" %
                (index, pagename))
        else:
            blib.do_edit(page,
                         index,
                         undo_one_page_greek_removal,
                         save=save,
                         verbose=verbose)
def push_manual_changes(save, verbose, diff, direcfile, annotation, startFrom,
                        upTo):
    """Push manually edited template replacements listed in ``direcfile``.

    Four line formats are recognised in the UTF-8 directive file:

    * ``Page N PAGE: ...: {{REPL}} <- {{...}} ({{CURR}})``
    * ``* [Page N ][[PAGE]]: ...: <nowiki>{{REPL}} <- {{...}} ({{CURR}})</nowiki>...``
    * ``Page N PAGE: ... /// CURR /// REPL``
    * ``Page N PAGE: ...`` containing any number of
      ``<from> CURR <to> REPL <end>`` directives

    Every change is stored as ``(pagename, repl, curr)``; on each page all
    occurrences of ``curr`` are replaced with ``repl``.

    Args:
        save, verbose, diff: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        annotation: Text appended in parentheses to the change log.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_changes = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            repl_on_right = False
            m = re.match(
                r"^Page [^ ]+ (.*?): .*?: (\{\{.*?\}\}) <- \{\{.*?\}\} \((\{\{.*?\}\})\)$",
                line)
            if not m:
                m = re.match(
                    r"^\* (?:Page [^ ]+ )?\[\[(.*?)\]\]: .*?: <nowiki>(\{\{.*?\}\}) <- \{\{.*?\}\} \((\{\{.*?\}\})\)</nowiki>.*$",
                    line)
                if not m:
                    m = re.match(
                        r"^(?:Page [^ ]+ )(.*?): .* /// (.*?) /// (.*?)$",
                        line)
                    repl_on_right = True
            if m:
                # Normalise both group orders before comparing, so the
                # checks below always look at the *current* template.
                if repl_on_right:
                    pagename, curr, repl = m.groups()
                else:
                    pagename, repl, curr = m.groups()
                if curr == repl:
                    msg("WARNING: Ignoring line with from=to: %s" % line)
                    continue
                # If the current template is the same as the current template
                # of the previous entry, ignore the previous entry; otherwise
                # we won't be able to locate the current template the second
                # time around. This happens e.g. in the output of
                # find_russian_need_vowels.py when processing a template such
                # as cardinalbox or compound that has more than one
                # foreign-language parameter in it.
                if template_changes and template_changes[-1][2] == curr:
                    msg("Ignoring change for pagename %s, %s -> %s" %
                        template_changes[-1])
                    template_changes.pop()
                template_changes.append((pagename, repl, curr))
            else:
                mpage = re.search(r"^(?:Page [^ ]+ )(.*?): (.*)$", line)
                if not mpage:
                    msg("WARNING: Unable to parse line: [%s]" % line)
                    continue
                pagename, directives = mpage.groups()
                for m in re.finditer("<from> (.*?) <to> (.*?) <end>",
                                     directives):
                    curr, repl = m.groups()
                    if curr != repl:
                        template_changes.append((pagename, repl, curr))
                    else:
                        msg("WARNING: Ignoring line with from=to: %s" % line)

    for current, index in blib.iter_pages(
            template_changes,
            startFrom,
            upTo,
            # key is the page name
            key=lambda x: x[0]):
        pagename, repl_template, curr_template = current

        def push_one_manual_change(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            text = unicode(text)
            found_repl_template = repl_template in text
            newtext = text.replace(curr_template, repl_template)
            changelog = ""
            if newtext == text:
                if not found_repl_template:
                    pagemsg("WARNING: Unable to locate current template: %s" %
                            curr_template)
                else:
                    pagemsg(
                        "Replacement template already found, taking no action")
            else:
                if found_repl_template:
                    pagemsg(
                        "WARNING: Made change, but replacement template %s already present!"
                        % repl_template)
                # One replacement changes the text length by repl_curr_diff;
                # an integer multiple of it means several occurrences.
                repl_curr_diff = len(repl_template) - len(curr_template)
                newtext_text_diff = len(newtext) - len(text)
                if newtext_text_diff == repl_curr_diff:
                    pass
                elif repl_curr_diff == 0:
                    # Guard against dividing by zero below.
                    if newtext_text_diff != 0:
                        pagemsg(
                            "WARNING: Something wrong, no change in text length during replacement but expected change: Expected length change=%s, actual=%s, curr=%s, repl=%s"
                            % (repl_curr_diff, newtext_text_diff,
                               curr_template, repl_template))
                else:
                    ratio = float(newtext_text_diff) / repl_curr_diff
                    if ratio == int(ratio):
                        pagemsg(
                            "WARNING: Replaced %s occurrences of curr=%s with repl=%s"
                            % (int(ratio), curr_template, repl_template))
                    else:
                        pagemsg(
                            "WARNING: Something wrong, length mismatch during replacement: Expected length change=%s, actual=%s, ratio=%.2f, curr=%s, repl=%s"
                            % (repl_curr_diff, newtext_text_diff, ratio,
                               curr_template, repl_template))
                changelog = "replace <%s> with <%s> (%s)" % (
                    truncate(curr_template), truncate(repl_template),
                    annotation)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" %
                (index, pagename))
        else:
            blib.do_edit(page,
                         index,
                         push_one_manual_change,
                         save=save,
                         verbose=verbose,
                         diff=diff)
def undo_ru_auto_accent(save, verbose, direcfile, startFrom, upTo):
    """Revert logged template replacements in ux/usex/ru-ux/lang templates.

    Each line of ``direcfile`` (UTF-8) is expected to look like
    ``Page N PAGENAME: Replaced {{ORIG}} with {{REPL}}``; lines that do not
    match are reported and skipped. Only entries whose original template
    starts with ``{{ux|``, ``{{usex|``, ``{{ru-ux|`` or ``{{lang|`` are
    processed.

    Args:
        save, verbose: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_removals = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            m = re.search(
                r"^Page [0-9]+ (.*?): Replaced (\{\{.*?\}\}) with (\{\{.*?\}\})$",
                line)
            if not m:
                msg("WARNING: Unable to parse line: [%s]" % line)
            else:
                template_removals.append(m.groups())

    for current, index in blib.iter_pages(
            template_removals,
            startFrom,
            upTo,
            # key is the page name
            key=lambda x: x[0]):
        pagename, orig_template, repl_template = current
        if not re.search(r"^\{\{(ux|usex|ru-ux|lang)\|", orig_template):
            continue

        def undo_one_page_ru_auto_accent(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            text = unicode(text)
            # Only act when the replacement template sits at the start of a
            # "#*" line (optionally followed by colons and spaces).
            if not re.search(r"^#\*:* *%s" % re.escape(repl_template), text,
                             re.M):
                return None, ""
            found_orig_template = orig_template in text
            newtext = text.replace(repl_template, orig_template)
            changelog = ""
            if newtext == text:
                if not found_orig_template:
                    pagemsg(
                        "WARNING: Unable to locate 'repl' template when undoing Russian auto-accenting: %s"
                        % repl_template)
                else:
                    pagemsg("Original template found, taking no action")
            else:
                pagemsg("Replaced %s with %s" % (repl_template, orig_template))
                if found_orig_template:
                    pagemsg(
                        "WARNING: Undid replacement, but original template %s already present!"
                        % orig_template)
                # A single replacement changes the length by exactly the
                # difference between the two template strings.
                if len(newtext) - len(text) != len(orig_template) - len(
                        repl_template):
                    pagemsg(
                        "WARNING: Length mismatch when undoing Russian auto-accenting, may have matched multiple templates: orig=%s, repl=%s"
                        % (orig_template, repl_template))
                changelog = "Undid auto-accenting (per Wikitiki89) of %s" % (
                    orig_template)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" %
                (index, pagename))
        else:
            blib.do_edit(page,
                         index,
                         undo_one_page_ru_auto_accent,
                         save=save,
                         verbose=verbose)
def push_manual_changes(save, verbose, direcfile, annotation, startFrom, upTo):
    """Push manually edited template replacements listed in ``direcfile``.

    Two line formats are recognised in the UTF-8 directive file:

    * ``Page N PAGE: ...: {{REPL}} <- {{...}} ({{CURR}})``
    * ``* [Page N ][[PAGE]]: ...: <nowiki>{{REPL}} <- {{...}} ({{CURR}})</nowiki>...``

    On each page all occurrences of ``CURR`` are replaced with ``REPL``.

    Args:
        save, verbose: Forwarded to ``blib.do_edit``.
        direcfile: Path of the directive file.
        annotation: Text appended in parentheses to the change log.
        startFrom, upTo: Forwarded to ``blib.iter_pages``, keyed on the
            page name.
    """
    template_changes = []
    # Use a context manager so the directive file is closed once parsed.
    with codecs.open(direcfile, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            m = re.match(r"^Page [^ ]+ (.*?): .*?: (\{\{.*?\}\}) <- \{\{.*?\}\} \((\{\{.*?\}\})\)$",
                         line)
            if not m:
                m = re.match(r"^\* (?:Page [^ ]+ )?\[\[(.*?)\]\]: .*?: <nowiki>(\{\{.*?\}\}) <- \{\{.*?\}\} \((\{\{.*?\}\})\)</nowiki>.*$",
                             line)
                if not m:
                    msg("WARNING: Unable to parse line: [%s]" % line)
                    continue
            if m.group(2) == m.group(3):
                # Report no-op directives instead of dropping them silently.
                msg("WARNING: Ignoring line with from=to: %s" % line)
                continue
            # If the current template is the same as the current template of
            # the previous entry, ignore the previous entry; otherwise we
            # won't be able to locate the current template the second time
            # around. This happens e.g. in the output of
            # find_russian_need_vowels.py when processing a template such as
            # cardinalbox or compound that has more than one foreign-language
            # parameter in it.
            if template_changes and template_changes[-1][2] == m.group(3):
                msg("Ignoring change for pagename %s, %s -> %s" % template_changes[-1])
                template_changes.pop()
            template_changes.append(m.groups())

    for current, index in blib.iter_pages(template_changes, startFrom, upTo,
                                          # key is the page name
                                          key=lambda x: x[0]):
        pagename, repl_template, curr_template = current

        def push_one_manual_change(page, index, text):
            def pagemsg(txt):
                msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

            text = unicode(text)
            found_repl_template = repl_template in text
            newtext = text.replace(curr_template, repl_template)
            changelog = ""
            if newtext == text:
                if not found_repl_template:
                    pagemsg("WARNING: Unable to locate current template: %s"
                            % curr_template)
                else:
                    pagemsg("Replacement template already found, taking no action")
            else:
                if found_repl_template:
                    pagemsg("WARNING: Made change, but replacement template %s already present!"
                            % repl_template)
                # One replacement changes the text length by repl_curr_diff;
                # an integer multiple of it means several occurrences. When
                # both templates have equal length the two differences are
                # both zero, so the division below is never reached with a
                # zero divisor.
                repl_curr_diff = len(repl_template) - len(curr_template)
                newtext_text_diff = len(newtext) - len(text)
                if newtext_text_diff == repl_curr_diff:
                    pass
                else:
                    ratio = float(newtext_text_diff) / repl_curr_diff
                    if ratio == int(ratio):
                        pagemsg("WARNING: Replaced %s occurrences of curr=%s with repl=%s"
                                % (int(ratio), curr_template, repl_template))
                    else:
                        pagemsg("WARNING: Something wrong, length mismatch during replacement: Expected length change=%s, actual=%s, ratio=%.2f, curr=%s, repl=%s"
                                % (repl_curr_diff, newtext_text_diff, ratio,
                                   curr_template, repl_template))
                changelog = "Replaced %s with %s (%s)" % (curr_template, repl_template, annotation)
                pagemsg("Change log = %s" % changelog)
            return newtext, changelog

        page = pywikibot.Page(site, pagename)
        if not page.exists():
            msg("Page %s %s: WARNING, something wrong, does not exist" % (
                index, pagename))
        else:
            blib.do_edit(page, index, push_one_manual_change, save=save,
                         verbose=verbose)