def process_lemma(index, pagetitle, slots, program_args):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tn = tname(t)
    pos = None
    if tn == "la-conj":
      pos = "verb"
    elif tn == "la-ndecl":
      pos = "noun"
    elif tn == "la-adecl":
      pos = "adj"
    if pos:
      args = lalib.generate_infl_forms(pos, unicode(t), errandpagemsg, expand_text)
      for slot in args:
        matches = False
        for spec in slots:
          if spec == slot:
            matches = True
            break
          if lalib.slot_matches_spec(slot, spec):
            matches = True
            break
        if matches:
          for formpagename in re.split(",", args[slot]):
            if "[" in formpagename or "|" in formpagename:
              pagemsg("WARNING: Skipping page %s with links in it" % formpagename)
            else:
              formpagename = lalib.remove_macrons(formpagename)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Skipping dictionary form")
              else:
                def do_process_page(page, index, parsed):
                  return process_page(index, page, program_args)
                blib.do_edit(formpage, index, do_process_page, save=program_args.save,
                    verbose=program_args.verbose, diff=program_args.diff)

def process_page(index, pos, lemma, subs, infl, save, verbose):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
  pagemsg("Processing")
  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if args is None:
    return
  forms_to_delete = []
  for key, form in args.iteritems():
    forms_to_delete.extend(form.split(","))
  for formind, form in blib.iter_items(forms_to_delete):
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    blib.do_edit(pywikibot.Page(site, remove_macrons(form)), formind, handler,
        save=save, verbose=verbose)

def lookup_inflection(lemma_no_macrons, pos, expected_headtemps, expected_infltemps,
    pagemsg, errandpagemsg):
  global args
  lemma_pagetitle = lemma_no_macrons
  if lemma_pagetitle.startswith("*"):
    lemma_pagetitle = "Reconstruction:Latin/" + lemma_pagetitle[1:]
  orig_pagemsg = pagemsg
  orig_errandpagemsg = errandpagemsg
  def pagemsg(txt):
    orig_pagemsg("%s: %s" % (lemma_no_macrons, txt))
  def errandpagemsg(txt):
    orig_errandpagemsg("%s: %s" % (lemma_no_macrons, txt))
  # Template expansions and parsed lemma pages are cached so repeated lookups of
  # the same lemma don't hit the server again.
  def expand_text(tempcall):
    cache_key = (tempcall, lemma_pagetitle)
    if cache_key in expand_text_cache:
      retval = expand_text_cache[cache_key]
      if args.verbose:
        pagemsg("Found (%s, %s)=%s in expand_text_cache" % (tempcall, lemma_pagetitle, retval))
      return retval
    if args.verbose:
      pagemsg("Couldn't find (%s, %s) in expand_text_cache" % (tempcall, lemma_pagetitle))
    result = blib.expand_text(tempcall, lemma_pagetitle, pagemsg, args.verbose)
    expand_text_cache[cache_key] = result
    return result
  if lemma_pagetitle in heads_and_defns_cache:
    if args.verbose:
      pagemsg("Found %s in heads_and_defns_cache" % lemma_pagetitle)
    retval = heads_and_defns_cache[lemma_pagetitle]
  else:
    if args.verbose:
      pagemsg("Couldn't find %s in heads_and_defns_cache" % lemma_pagetitle)
    page = pywikibot.Page(site, lemma_pagetitle)
    try:
      exists = blib.try_repeatedly(lambda: page.exists(), pagemsg, "determine if page exists")
    except pywikibot.exceptions.InvalidTitle as e:
      pagemsg("WARNING: Invalid title %s, skipping" % lemma_pagetitle)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      traceback.print_exc(file=sys.stdout)
      return None
    if not exists:
      pagemsg("WARNING: Lemma %s doesn't exist" % lemma_no_macrons)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      return None
    retval = lalib.find_heads_and_defns(unicode(page.text), pagemsg)
    heads_and_defns_cache[lemma_pagetitle] = retval
  if retval == "nonexistent":
    pagemsg("WARNING: Lemma %s doesn't exist (cached)" % lemma_no_macrons)
    return None
  if retval is None:
    return None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  matched_head = False
  inflargs_sets = []
  seen_heads = []
  seen_infltns = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    heads = lalib.la_get_headword_from_template(ht, lemma_pagetitle, pagemsg, expand_text)
    for head in heads:
      if head not in seen_heads:
        seen_heads.append(head)
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn not in seen_infltns:
        seen_infltns.append(infltn)
    if tn in expected_headtemps:
      oright = unicode(ht)
      for head in heads:
        head_no_links = blib.remove_links(head)
        if lalib.remove_macrons(head_no_links) == lemma_no_macrons:
          break
      else: # no break
        continue
      this_inflargs = []
      for inflt in headword['infl_templates']:
        infltn = tname(inflt)
        if infltn not in expected_infltemps:
          pagemsg("WARNING: Saw bad declension template for %s, expected one of %s: %s" % (
            pos, ",".join("{{%s}}" % temp for temp in expected_infltemps), unicode(inflt)))
          continue
        originflt = unicode(inflt)
        inflargs = lalib.generate_infl_forms(pos, originflt, errandpagemsg, expand_text)
        if inflargs is None:
          continue
        this_inflargs.append(inflargs)
      matched_head = True
      inflargs_sets.append((heads, this_inflargs))
  if not matched_head:
    pagemsg("WARNING: Couldn't find any matching heads, even allowing macron differences (seen heads %s, seen infl template names %s)" % (
      ",".join(seen_heads), ",".join(seen_infltns)))
    return None
  return inflargs_sets

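# Hedged usage sketch (not part of the original script): lookup_inflection()
# returns None on failure, or a list of (heads, this_inflargs) pairs where each
# this_inflargs entry maps slot names to comma-separated forms. The helper name
# below and the "la-noun"/"la-ndecl" template names passed to it are
# illustrative assumptions; `msg` is the logging function used throughout.
def all_noun_forms_for_lemma(index, lemma_no_macrons):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma_no_macrons, txt))
  inflargs_sets = lookup_inflection(lemma_no_macrons, "noun", ["la-noun"],
      ["la-ndecl"], pagemsg, pagemsg)
  if inflargs_sets is None:
    return []
  forms = []
  for heads, this_inflargs in inflargs_sets:
    for inflargs in this_inflargs:
      for slot, formspec in inflargs.iteritems():
        forms.extend(formspec.split(","))
  return forms
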
def process_page(page, index):
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  part_headwords = []
  adj_headwords = []
  pn_headwords = []
  noun_headwords = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    if tn == "la-part" or (tn == "head" and getparam(ht, "1") == "la" and
        getparam(ht, "2") in ["participle", "participles"]):
      part_headwords.append(headword)
    elif tn == "la-adj" or (tn == "head" and getparam(ht, "1") == "la" and
        getparam(ht, "2") in ["adjective", "adjectives"]):
      adj_headwords.append(headword)
    elif tn == "la-proper noun" or (tn == "head" and getparam(ht, "1") == "la" and
        getparam(ht, "2") in ["proper noun", "proper nouns"]):
      pn_headwords.append(headword)
    elif tn == "la-noun" or (tn == "head" and getparam(ht, "1") == "la" and
        getparam(ht, "2") in ["noun", "nouns"]):
      noun_headwords.append(headword)
  headwords_to_do = None
  if part_headwords and not adj_headwords:
    pos = "part"
    headwords_to_do = part_headwords
    expected_inflt = "la-adecl"
  elif pn_headwords and not noun_headwords:
    pos = "pn"
    headwords_to_do = pn_headwords
    expected_inflt = "la-ndecl"
  if not headwords_to_do:
    return None, None
  for headword in headwords_to_do:
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn != expected_inflt:
        pagemsg("WARNING: Saw bad declension template for %s, expected {{%s}}: %s" % (
          pos, expected_inflt, unicode(inflt)))
        continue
      inflargs = lalib.generate_infl_forms(pos, unicode(inflt), errandpagemsg, expand_text)
      forms_seen = set()
      slots_and_forms_to_process = []
      for slot, formarg in inflargs.iteritems():
        forms = formarg.split(",")
        for form in forms:
          if "[" in form or "|" in form:
            continue
          form_no_macrons = lalib.remove_macrons(form)
          if form_no_macrons == pagetitle:
            continue
          if form_no_macrons in forms_seen:
            continue
          forms_seen.add(form_no_macrons)
          slots_and_forms_to_process.append((slot, form))
      for formindex, (slot, form) in blib.iter_items(
          sorted(slots_and_forms_to_process, key=lambda x: lalib.remove_macrons(x[1]))):
        def handler(page, formindex, parsed):
          return process_form(page, formindex, slot, form, pos, pagemsg)
        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
            "%s.%s" % (index, formindex), handler, save=args.save,
            verbose=args.verbose, diff=args.diff)

def process_page(index, lemma, pos, infl, slots, pages_to_delete, preserve_diaeresis,
    save, verbose, diff):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma, preserve_diaeresis),
        pagemsg, verbose)
  pagemsg("Processing")
  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text,
      add_sync_verb_forms=True)
  if args is None:
    return
  forms_to_delete = []
  tag_sets_to_delete = []
  lemma_no_macrons = remove_macrons(lemma)
  def add_bad_forms(bad_slot_fun):
    for slot, formspec in args.iteritems():
      if bad_slot_fun(slot):
        tag_sets_to_delete.append(lalib.slot_to_tag_set(slot))
        forms_to_delete.append((slot, formspec))
  # Slot specs: "@FORM:SLOT" or "@FORM" name a literal form to delete;
  # "allbutlemma" selects every generated form that differs from the lemma;
  # anything else is matched as a slot name or slot spec against the generated slots.
  for slot in slots.split(","):
    if slot.startswith("@"):
      if ":" in slot:
        real_form, real_slot = slot[1:].split(":")
        tag_sets_to_delete.append(lalib.slot_to_tag_set(real_slot))
        forms_to_delete.append((real_slot, real_form))
      else:
        forms_to_delete.append((None, slot[1:]))
    elif slot in args:
      tag_sets_to_delete.append(lalib.slot_to_tag_set(slot))
      forms_to_delete.append((slot, args[slot]))
    elif slot == "allbutlemma":
      for sl, formspec in args.iteritems():
        forms = formspec.split(",")
        forms = [form for form in forms if lemma_no_macrons != remove_macrons(form)]
        if forms:
          tag_sets_to_delete.append(lalib.slot_to_tag_set(sl))
          forms_to_delete.append((sl, ",".join(forms)))
    else:
      add_bad_forms(lambda sl: lalib.slot_matches_spec(sl, slot))
  single_forms_to_delete = []
  for slot, formspec in forms_to_delete:
    for single_form in formspec.split(","):
      single_forms_to_delete.append((slot, single_form))
  for formind, (slot, formval) in blib.iter_items(single_forms_to_delete,
      get_name=lambda x: x[1]):
    partpos = None
    if slot == "pres_actv_ptc":
      partpos = "presactpart"
    elif slot in ["perf_actv_ptc", "perf_pasv_ptc"]:
      partpos = "perfpasspart"
    elif slot == "futr_actv_ptc":
      partpos = "futactpart"
    elif slot == "futr_pasv_ptc":
      partpos = "futpasspart"
    if partpos:
      delete_participle(index, lemma, formind, formval, partpos, preserve_diaeresis,
          save, verbose, diff)
    else:
      if pos == "noun":
        posform = "nounform"
      elif pos == "verb":
        posform = "verbform"
      elif pos == "adj":
        posform = "adjform"
      elif pos == "nounadj": # noun that uses an adjective declension
        posform = "nounform"
      elif pos == "numadj":
        posform = "numform"
      elif pos == "part":
        posform = "partform"
      else:
        raise ValueError("Invalid part of speech %s" % pos)
      delete_form(index, lemma, formind, formval, posform,
          True if slot is None else tag_sets_to_delete,
          preserve_diaeresis, save, verbose, diff)