def process_page_section(index, page, section, verbose):
    """Convert an old-style Russian noun headword in one section to ru-noun+.

    The section text is parsed, its ``ru-noun-table`` / ``ru-noun-old``
    declension templates and its ``ru-noun`` / ``ru-proper noun`` headword
    template are located, manual transliteration and ``a=bi`` animacy are
    reconciled between headword and declension, and finally the headword
    template is renamed to ``ru-noun+`` or ``ru-proper noun+``.

    Parameters:
      index: page index, only used as a prefix in log messages.
      page: page object; ``page.title()`` and ``page.exists()`` are called.
      section: wikitext of the section, handed to ``blib.parse_text``.
      verbose: when true, log the templates found; also forwarded to
        ``blib.expand_text``.

    Returns:
      ``None`` when the section is skipped (missing page, unsupported or
      conflicting templates, template-expansion failure, or a helper
      reporting failure by returning ``None``).  Otherwise a 5-tuple
      ``(text, ru_noun_changed, ru_proper_noun_changed, bian_replaced,
      frobbed_manual_translit)`` where the three middle items are 0 or 1 and
      the last is a list holding the transferred manual translit, if any.
      When there is no ``ru-noun-table`` template or no headword template the
      text is returned with all counters 0 and an empty list.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # nesting below is reconstructed from the control flow -- confirm the
    # spots marked with NOTE(review) against the upstream source.
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if len(noun_table_templates) < 1:
        if noun_old_templates:
            pagemsg(
                "WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s"
                % ", ".join(unicode(x) for x in noun_old_templates))
        # Nothing to convert in this section; hand the text back untouched.
        return unicode(parsed), 0, 0, 0, []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, []

    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
    headword_template = headword_templates[0]
    frobbed_manual_translit = []
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    # Retrieve headword translit and maybe transfer to decl
    headword_tr = getparam(headword_template, "tr")
    if headword_tr:
        if verbose:
            pagemsg("Found headword manual translit tr=%s" % headword_tr)
        if "," in headword_tr:
            pagemsg("WARNING: Comma in headword manual translit, skipping: %s"
                    % headword_tr)
            return None
        # Punt if multi-arg-set, can't handle yet
        for decl_template in decl_templates:
            for param in decl_template.params:
                # Only positional (unnamed) params can be arg-set separators.
                if not param.showkey:
                    val = unicode(param.value)
                    if val == "or":
                        pagemsg(
                            "WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
                    if val == "-" or val == "_" or val.startswith("join:"):
                        pagemsg(
                            "WARNING: Manual translit and multi-word templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
            for i in xrange(2, 10):
                if getparam(headword_template, "tr%s" % i):
                    pagemsg(
                        "WARNING: Headword template has translit param tr%s, can't handle, skipping: %s"
                        % (i, unicode(headword_template)))
                    return None
            # The lemma sits in param 2 when param 1 is a stress pattern.
            if runoun.arg1_is_stress(getparam(decl_template, "1")):
                lemma_arg = "2"
            else:
                lemma_arg = "1"
            lemmaval = getparam(decl_template, lemma_arg)
            if not lemmaval:
                lemmaval = subpagetitle
            if "//" in lemmaval:
                m = re.search("^(.*?)//(.*)$", lemmaval)
                if m.group(2) != headword_tr:
                    pagemsg(
                        "WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping"
                        % (lemmaval, headword_tr))
                    return None
                else:
                    pagemsg("Already found manual translit in decl template %s"
                            % lemmaval)
            else:
                # NOTE(review): the four statements after the append are
                # assumed to belong to this else branch -- confirm.
                lemmaval += "//" + headword_tr
                orig_decl_template = unicode(decl_template)
                decl_template.add(lemma_arg, lemmaval)
                pagemsg("Replacing decl %s with %s"
                        % (orig_decl_template, unicode(decl_template)))
                frobbed_manual_translit = [headword_tr]

    genders = blib.fetch_param_chain(headword_template, "2", "g")

    bian_replaced = 0

    # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
    # headword template
    for decl_template in decl_templates:
        if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
            # Index of the first gender spec mentioning "in" / "an"; -1 if none.
            saw_in = -1
            saw_an = -1
            for i, g in enumerate(genders):
                if re.search(r"\bin\b", g) and saw_in < 0:
                    saw_in = i
                if re.search(r"\ban\b", g) and saw_an < 0:
                    saw_an = i
            if saw_in >= 0 and saw_an >= 0:
                orig_decl_template = unicode(decl_template)
                if saw_in < saw_an:
                    pagemsg("Replacing a=bi with a=ia in decl template")
                    decl_template.add("a", "ia")
                    bian_replaced = 1
                else:
                    pagemsg("Replacing a=bi with a=ai in decl template")
                    decl_template.add("a", "ai")
                    bian_replaced = 1
                pagemsg("Replacing decl %s with %s"
                        % (orig_decl_template, unicode(decl_template)))

    # Re-invoke the declension as ru-generate-noun-args to obtain its forms.
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
                               unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
    args = ru.split_generate_args(generate_result)

    genders = runoun.check_old_noun_headword_forms(headword_template, args,
                                                   subpagetitle, pagemsg)
    if genders is None:
        return None

    new_params = [(param.name, param.value)
                  for param in noun_table_template.params]

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runoun.fix_old_headword_params(
        headword_template, new_params, genders, pagemsg)
    if params_to_preserve is None:
        return None

    if unicode(headword_template.name) == "ru-proper noun":
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(headword_template, "n"):
            pagemsg("Adding n=both to headword tempate")
            headword_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            generate_template_with_ndef = generate_template.replace(
                "}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(
                r"\|n=s[^=|{}]*", "", generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = ru.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular"
                            % existing_n)
                else:
                    pagemsg("Removing n=sg from headword tempate")
                    rmparam(headword_template, "n")
            else:
                pagemsg(
                    "WARNING: Unable to remove n= from headword template because n=%s"
                    % ndef_args["n"])

    headword_template.params.extend(params_to_preserve)
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        ru_noun_changed = 1
    else:
        headword_template.name = "ru-proper noun+"
        ru_proper_noun_changed = 1

    pagemsg("Replacing headword %s with %s"
            % (orig_headword_template, unicode(headword_template)))

    return (unicode(parsed), ru_noun_changed, ru_proper_noun_changed,
            bian_replaced, frobbed_manual_translit)
def process_page(page, index, parsed):
    """Start processing one page and define the declension-lookup helper.

    Logs "Processing", returns early (``None``) for titles containing a
    colon, re-parses ``page.text`` and defines the nested helper
    ``find_decl_args``.

    Parameters:
      page: page object; ``page.title()`` and ``page.text`` are read.
      index: page index, only used as a prefix in log messages.
      parsed: overwritten by a fresh parse of ``page.text`` before being read.

    NOTE(review): the visible body ends right after ``find_decl_args`` is
    defined, without calling it -- the remainder of the function appears to
    be missing from this file; confirm against the full source.
    """
    global args
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    origtext = page.text
    parsed = blib.parse_text(origtext)

    # Find the declension arguments for LEMMA and inflected form INFL,
    # the WORDINDth word in the expression. Return value is a tuple of
    # four items: a list of (NAME, VALUE) tuples for the arguments, whether
    # the word is an adjective, the value of n= (if given), and the value
    # of a= (if given).
    def find_decl_args(lemma, infl, wordind):
        """Look up the declension of ``lemma`` on its own page.

        Returns ``None`` (after logging a warning) whenever the declension
        cannot be determined; the adjectival and indeclinable paths return
        the four-item tuple described in the comment above.
        """
        declpage = pywikibot.Page(site, lemma)
        # Link text shows the inflected form; pipe only when it differs from
        # the lemma by more than accents.
        if rulib.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s"
                            % unicode(decl_z_template))
                    # A headword template is passed on only when it is unique.
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            % (wordind, lemma, infl, unicode(headword_template),
                               unicode(decl_z_template)))
                    decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                        decl_z_template, subpagetitle, pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]
            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"]:
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                # No adjectival candidate: fall back to the override tables.
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            # Pattern spelled with escaped backslashes (same value as the raw
            # form) so the literal is valid under both Python 2 and 3.
            if re.search(u"\\bь\\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None

        # ru-noun-table
        assert unicode(decl_template.name) == "ru-noun-table"

        # Split out the arg sets in the declension and check the
        # lemma of each one, taking care to handle cases where there is no lemma
        # (it would default to the page name).

        highest_numbered_param = 0
        for p in decl_template.params:
            pname = unicode(p.name)
            if re.search("^[0-9]+$", pname):
                highest_numbered_param = max(highest_numbered_param, int(pname))

        # Now gather the numbered arguments into arg sets. Code taken from
        # ru-noun.lua.
        arg_sets = []
        arg_set = []
        # One extra iteration past the last param flushes the final arg set.
        for i in xrange(1, highest_numbered_param + 2):
            end_arg_set = False
            val = getparam(decl_template, str(i))
            if i == highest_numbered_param + 1:
                end_arg_set = True
            elif val == "_" or val == "-" or re.search("^join:", val):
                pagemsg(
                    "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
            elif val == "or":
                end_arg_set = True

            if end_arg_set:
                arg_sets.append(arg_set)
                arg_set = []
            else:
                arg_set.append(val)

        canon_infl = rulib.remove_accents(infl).lower()
        canon_lemma = lemma.lower()
        ispl = False
        need_sc1 = False
        found_gender = None
        if canon_infl != canon_lemma:
            # Try each (singular ending, plural ending) pair to see whether
            # the inflection is the plural of the lemma.
            for sgend, plend, gender, is_sc1 in pl_data:
                if sgend:
                    check_sgend = sgend
                else:
                    check_sgend = consonant_re
                if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(
                        sgend + "$", plend, canon_lemma):
                    ispl = True
                    found_gender = gender
                    need_sc1 = is_sc1
                    break
            else:
                pagemsg(
                    "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        # Substitute the wordlink for any lemmas in the declension.
        # If plural, also add gender and verify special case (1) as necessary.
        # Concatenate all the numbered params, substituting the wordlink into
        # the lemma as necessary.
        numbered_params = []
        for arg_set in arg_sets:
            lemma_arg = 0
            if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]):
                lemma_arg = 1
            if len(arg_set) <= lemma_arg:
                arg_set.append("")
            arglemma = arg_set[lemma_arg]
            manualtr = ""
            if "//" in arglemma:
                arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
            if (not arglemma or arglemma.lower() == infl.lower() or
                    (rulib.is_monosyllabic(infl) and
                     rulib.remove_accents(arglemma).lower() ==
                     rulib.remove_accents(infl).lower()) or
                    (ispl and
                     rulib.remove_accents(arglemma).lower() == lemma.lower())):
                arg_set[lemma_arg] = wordlink + manualtr
            else:
                pagemsg(
                    "WARNING: Can't sub word link %s into decl lemma %s%s"
                    % (wordlink, arg_set[lemma_arg],
                       ", skipping" if ispl else ""))
                if ispl:
                    return None

            if ispl:
                # Add the gender
                if len(arg_set) <= lemma_arg + 1:
                    arg_set.append("")
                declarg = arg_set[lemma_arg + 1]

                # First, sub in gender
                m = re.search("(3f|[mfn])", declarg)
                if found_gender == "mf":
                    if not m:
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return None
                    decl_gender = m.group(1)
                    if decl_gender == "n":
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return None
                    elif decl_gender in ["m", "3f"]:
                        pagemsg(
                            u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                            % (decl_gender, wordind, lemma, infl))
                    else:
                        # The regex admits only 3f/m/f/n and the other three
                        # were handled above, so this must be plain "f".
                        assert decl_gender == "f"
                        pagemsg(
                            u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        declarg = re.sub("f", "3f", declarg, 1)
                else:
                    if m:
                        decl_gender = m.group(1)
                        if decl_gender == found_gender:
                            pagemsg(
                                "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                                % (found_gender, wordind, lemma, infl))
                        else:
                            pagemsg(
                                "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                                % (decl_gender, wordind, found_gender, lemma, infl))
                            declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
                    else:
                        pagemsg(
                            "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                            % (wordind, found_gender, lemma, infl))
                        declarg = found_gender + declarg

                # Now check special case 1
                if need_sc1 != ("(1)" in declarg):
                    if need_sc1:
                        pagemsg(
                            "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None
                    else:
                        pagemsg(
                            "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None

                arg_set[lemma_arg + 1] = declarg

            # Arg sets are re-joined with the "or" separator they were split on.
            if numbered_params:
                numbered_params.append("or")
            numbered_params.extend(arg_set)

        # Now gather all params, including named ones.
        params = [(str(i + 1), val) for i, val in enumerate(numbered_params)]
        num = None
        anim = None
        for p in decl_template.params:
            pname = unicode(p.name)
            val = unicode(p.value)
            if pname == "a":
                anim = val
            elif pname == "n":
                num = val
            elif pname == "notes":
                params.append((pname, val))
            elif pname == "title":
                pagemsg(
                    "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
                    % (wordind, lemma, infl, val))
            elif re.search("^[0-9]+$", pname):
                # Numbered params were already collected above.
                pass
            else:
                keepparam = True
                if pname == "loc":
                    if pagetitle in keep_locative:
                        pagemsg(
                            "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    else:
                        pagemsg(
                            "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                        keepparam = False
                if pname == "par":
                    pagemsg(
                        "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if pname == "voc":
                    pagemsg(
                        "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if keepparam:
                    if pname == "loc" and re.search(u"^(на|в)\\b", val, re.U):
                        pagemsg(
                            u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    # Kept named params get the word index appended to the name.
                    pname += str(wordind)
                    params.append((pname, val))
        # NOTE(review): the visible code ends here without returning params,
        # num and anim, so the noun path falls through to an implicit None
        # even though the comment above documents a four-item tuple. This
        # looks truncated -- confirm against the full source.
def process_page(index, page, save, verbose):
    """Start processing one page and define the declension-lookup helper.

    Logs "Processing", returns early (``None``) for titles containing a
    colon, parses ``page.text`` and defines the nested helper
    ``find_decl_args``.

    Parameters:
      index: page index, only used as a prefix in log messages.
      page: page object; ``page.title()`` and ``page.text`` are read.
      save: not read anywhere in the visible body.
      verbose: forwarded to ``blib.expand_text``.

    NOTE(review): the visible body ends right after ``find_decl_args`` is
    defined, without calling it -- the remainder of the function appears to
    be missing from this file; confirm against the full source.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    origtext = page.text
    parsed = blib.parse_text(origtext)

    # Find the declension arguments for LEMMA and inflected form INFL,
    # the WORDINDth word in the expression. Return value is a tuple of
    # four items: a list of (NAME, VALUE) tuples for the arguments, whether
    # the word is an adjective, the value of n= (if given), and the value
    # of a= (if given).
    def find_decl_args(lemma, infl, wordind):
        """Look up the declension of ``lemma`` on its own page.

        Returns ``None`` (after logging a warning) whenever the declension
        cannot be determined; the adjectival and indeclinable paths return
        the four-item tuple described in the comment above.
        """
        declpage = pywikibot.Page(site, lemma)
        # Link text shows the inflected form; pipe only when it differs from
        # the lemma by more than accents.
        if ru.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s"
                            % unicode(decl_z_template))
                    # A headword template is passed on only when it is unique.
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            % (wordind, lemma, infl, unicode(headword_template),
                               unicode(decl_z_template)))
                    decl_template = runoun.convert_zdecl_to_ru_noun_table(
                        decl_z_template, subpagetitle, pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]
            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"]:
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                # No adjectival candidate: fall back to the override tables.
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            # Pattern spelled with escaped backslashes (same value as the raw
            # form) so the literal is valid under both Python 2 and 3.
            if re.search(u"\\bь\\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None

        # ru-noun-table
        assert unicode(decl_template.name) == "ru-noun-table"

        # Split out the arg sets in the declension and check the
        # lemma of each one, taking care to handle cases where there is no lemma
        # (it would default to the page name).

        highest_numbered_param = 0
        for p in decl_template.params:
            pname = unicode(p.name)
            if re.search("^[0-9]+$", pname):
                highest_numbered_param = max(highest_numbered_param, int(pname))

        # Now gather the numbered arguments into arg sets. Code taken from
        # ru-noun.lua.
        arg_sets = []
        arg_set = []
        # One extra iteration past the last param flushes the final arg set.
        for i in xrange(1, highest_numbered_param + 2):
            end_arg_set = False
            val = getparam(decl_template, str(i))
            if i == highest_numbered_param + 1:
                end_arg_set = True
            elif val == "_" or val == "-" or re.search("^join:", val):
                pagemsg(
                    "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
            elif val == "or":
                end_arg_set = True

            if end_arg_set:
                arg_sets.append(arg_set)
                arg_set = []
            else:
                arg_set.append(val)

        canon_infl = ru.remove_accents(infl).lower()
        canon_lemma = lemma.lower()
        ispl = False
        need_sc1 = False
        found_gender = None
        if canon_infl != canon_lemma:
            # Try each (singular ending, plural ending) pair to see whether
            # the inflection is the plural of the lemma.
            for sgend, plend, gender, is_sc1 in pl_data:
                if sgend:
                    check_sgend = sgend
                else:
                    check_sgend = consonant_re
                if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(
                        sgend + "$", plend, canon_lemma):
                    ispl = True
                    found_gender = gender
                    need_sc1 = is_sc1
                    break
            else:
                pagemsg(
                    "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        # Substitute the wordlink for any lemmas in the declension.
        # If plural, also add gender and verify special case (1) as necessary.
        # Concatenate all the numbered params, substituting the wordlink into
        # the lemma as necessary.
        numbered_params = []
        for arg_set in arg_sets:
            lemma_arg = 0
            if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
                lemma_arg = 1
            if len(arg_set) <= lemma_arg:
                arg_set.append("")
            arglemma = arg_set[lemma_arg]
            manualtr = ""
            if "//" in arglemma:
                arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
            if (not arglemma or arglemma.lower() == infl.lower() or
                    (ru.is_monosyllabic(infl) and
                     ru.remove_accents(arglemma).lower() ==
                     ru.remove_accents(infl).lower()) or
                    (ispl and
                     ru.remove_accents(arglemma).lower() == lemma.lower())):
                arg_set[lemma_arg] = wordlink + manualtr
            else:
                pagemsg(
                    "WARNING: Can't sub word link %s into decl lemma %s%s"
                    % (wordlink, arg_set[lemma_arg],
                       ", skipping" if ispl else ""))
                if ispl:
                    return None

            if ispl:
                # Add the gender
                if len(arg_set) <= lemma_arg + 1:
                    arg_set.append("")
                declarg = arg_set[lemma_arg + 1]

                # First, sub in gender
                m = re.search("(3f|[mfn])", declarg)
                if found_gender == "mf":
                    if not m:
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return None
                    decl_gender = m.group(1)
                    if decl_gender == "n":
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return None
                    elif decl_gender in ["m", "3f"]:
                        pagemsg(
                            u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                            % (decl_gender, wordind, lemma, infl))
                    else:
                        # The regex admits only 3f/m/f/n and the other three
                        # were handled above, so this must be plain "f".
                        assert decl_gender == "f"
                        pagemsg(
                            u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        declarg = re.sub("f", "3f", declarg, 1)
                else:
                    if m:
                        decl_gender = m.group(1)
                        if decl_gender == found_gender:
                            pagemsg(
                                "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                                % (found_gender, wordind, lemma, infl))
                        else:
                            pagemsg(
                                "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                                % (decl_gender, wordind, found_gender, lemma, infl))
                            declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
                    else:
                        pagemsg(
                            "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                            % (wordind, found_gender, lemma, infl))
                        declarg = found_gender + declarg

                # Now check special case 1
                if need_sc1 != ("(1)" in declarg):
                    if need_sc1:
                        pagemsg(
                            "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None
                    else:
                        pagemsg(
                            "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None

                arg_set[lemma_arg + 1] = declarg

            # Arg sets are re-joined with the "or" separator they were split on.
            if numbered_params:
                numbered_params.append("or")
            numbered_params.extend(arg_set)

        # Now gather all params, including named ones.
        params = [(str(i + 1), val) for i, val in enumerate(numbered_params)]
        num = None
        anim = None
        for p in decl_template.params:
            pname = unicode(p.name)
            val = unicode(p.value)
            if pname == "a":
                anim = val
            elif pname == "n":
                num = val
            elif pname == "notes":
                params.append((pname, val))
            elif pname == "title":
                pagemsg(
                    "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
                    % (wordind, lemma, infl, val))
            elif re.search("^[0-9]+$", pname):
                # Numbered params were already collected above.
                pass
            else:
                keepparam = True
                if pname == "loc":
                    if pagetitle in keep_locative:
                        pagemsg(
                            "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    else:
                        pagemsg(
                            "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                        keepparam = False
                if pname == "par":
                    pagemsg(
                        "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if pname == "voc":
                    pagemsg(
                        "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if keepparam:
                    if pname == "loc" and re.search(u"^(на|в)\\b", val, re.U):
                        pagemsg(
                            u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    # Kept named params get the word index appended to the name.
                    pname += str(wordind)
                    params.append((pname, val))
        # NOTE(review): the visible code ends here without returning params,
        # num and anim, so the noun path falls through to an implicit None
        # even though the comment above documents a four-item tuple. This
        # looks truncated -- confirm against the full source.
def process_page_section(index, page, section, verbose):
    """Convert an old-style Russian noun headword in SECTION to its "+" form.

    The section must contain exactly one headword template (``ru-noun`` or
    ``ru-proper noun``) and one declension template (``ru-noun-table``
    and/or ``ru-noun-old``). The declension arguments are copied onto the
    headword template, which is renamed to ``ru-noun+`` or
    ``ru-proper noun+``.

    Parameters:
      index: Page index, used only as a prefix in log messages.
      page: Page object; ``title()`` and ``exists()`` are called on it.
      section: Wikitext of the section to process.
      verbose: If true, log extra detail and pass it on to template expansion.

    Returns:
      None if the section must be skipped (a message explaining why is
      logged first). Otherwise a 5-tuple
      ``(new_text, ru_noun_changed, ru_proper_noun_changed, bian_replaced,
      frobbed_manual_translit)`` where the three middle values are 0/1 flags
      and the last is a list holding the manual translit transferred to the
      declension template (empty if none was transferred). When there is
      nothing to convert, the text is returned with all flags 0.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    # Nothing is modified until all templates have been classified, so the
    # template list (and each template's name) is collected a single time
    # instead of re-filtering the parse tree for every check.
    named_templates = [(unicode(t.name), t)
                       for t in parsed.filter_templates()]

    if any(name == "ru-decl-noun-see" for name, _ in named_templates):
        pagemsg("Found ru-decl-noun-see, skipping")
        return None

    noun_table_templates = [t for name, t in named_templates
                            if name == "ru-noun-table"]
    noun_old_templates = [t for name, t in named_templates
                          if name == "ru-noun-old"]

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if not noun_table_templates and not noun_old_templates:
        return unicode(parsed), 0, 0, 0, []

    if any(name in ("ru-noun+", "ru-proper noun+")
           for name, _ in named_templates):
        pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
        return None

    headword_templates = [t for name, t in named_templates
                          if name in ("ru-noun", "ru-proper noun")]
    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if not headword_templates:
        return unicode(parsed), 0, 0, 0, []

    noun_table_template = (
        noun_table_templates[0] if noun_table_templates else None)
    noun_old_template = (
        noun_old_templates[0] if noun_old_templates else None)
    # With only an old-style declension present, treat it as the primary
    # declension template.
    if noun_old_template is not None and noun_table_template is None:
        noun_table_template = noun_old_template
        noun_old_template = None

    headword_template = headword_templates[0]
    frobbed_manual_translit = []
    decl_templates = [x for x in (noun_table_template, noun_old_template)
                      if x is not None]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template is not None:
            pagemsg("Found old decl template: %s"
                    % unicode(noun_old_template))

    # Retrieve headword translit and maybe transfer to decl
    headword_tr = getparam(headword_template, "tr")
    if headword_tr:
        if verbose:
            pagemsg("Found headword manual translit tr=%s" % headword_tr)
        if "," in headword_tr:
            pagemsg(
                "WARNING: Comma in headword manual translit, skipping: %s"
                % headword_tr)
            return None
        # NOTE(review): the nesting below was reconstructed from source whose
        # indentation was lost; everything is assumed to run once per decl
        # template, and the "Replacing decl" step only when translit is
        # actually appended -- confirm against the original file.
        for decl_template in decl_templates:
            # Punt if multi-arg-set, can't handle yet
            for param in decl_template.params:
                if not param.showkey:
                    val = unicode(param.value)
                    if val == "or":
                        pagemsg(
                            "WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
                    if val == "-" or val == "_" or val.startswith("join:"):
                        pagemsg(
                            "WARNING: Manual translit and multi-word templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
            for i in xrange(2, 10):
                if getparam(headword_template, "tr%s" % i):
                    pagemsg(
                        "WARNING: Headword template has translit param tr%s, can't handle, skipping: %s"
                        % (i, unicode(headword_template)))
                    return None
            # The lemma is in arg 2 when arg 1 holds the stress pattern.
            if runounlib.arg1_is_stress(getparam(decl_template, "1")):
                lemma_arg = "2"
            else:
                lemma_arg = "1"
            lemmaval = getparam(decl_template, lemma_arg)
            if not lemmaval:
                lemmaval = subpagetitle
            if "//" in lemmaval:
                m = re.search("^(.*?)//(.*)$", lemmaval)
                if m.group(2) != headword_tr:
                    pagemsg(
                        "WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping"
                        % (lemmaval, headword_tr))
                    return None
                else:
                    pagemsg(
                        "Already found manual translit in decl template %s"
                        % lemmaval)
            else:
                lemmaval += "//" + headword_tr
                orig_decl_template = unicode(decl_template)
                decl_template.add(lemma_arg, lemmaval)
                pagemsg("Replacing decl %s with %s"
                        % (orig_decl_template, unicode(decl_template)))
                frobbed_manual_translit = [headword_tr]

    genders = blib.fetch_param_chain(headword_template, "2", "g")

    bian_replaced = 0

    # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
    # headword template
    for decl_template in decl_templates:
        if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
            # Index of the first gender spec mentioning each animacy, or -1.
            saw_in = next((i for i, g in enumerate(genders)
                           if re.search(r"\bin\b", g)), -1)
            saw_an = next((i for i, g in enumerate(genders)
                           if re.search(r"\ban\b", g)), -1)
            if saw_in >= 0 and saw_an >= 0:
                orig_decl_template = unicode(decl_template)
                new_anim = "ia" if saw_in < saw_an else "ai"
                pagemsg("Replacing a=bi with a=%s in decl template"
                        % new_anim)
                decl_template.add("a", new_anim)
                bian_replaced = 1
                pagemsg("Replacing decl %s with %s"
                        % (orig_decl_template, unicode(decl_template)))

    generate_template = re.sub(
        r"^\{\{ru-noun-old", "{{ru-generate-noun-args|old=1",
        re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
               unicode(noun_table_template)))
    generate_result = expand_text(generate_template)
    if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
    args = blib.split_generate_args(generate_result)

    genders = runounlib.check_old_noun_headword_forms(
        headword_template, args, subpagetitle, pagemsg)
    if genders is None:
        return None

    new_params = [(param.name, param.value)
                  for param in noun_table_template.params]

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runounlib.fix_old_headword_params(
        headword_template, new_params, genders, pagemsg)
    if params_to_preserve is None:
        return None

    if unicode(headword_template.name) == "ru-proper noun":
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(headword_template, "n"):
            pagemsg("Adding n=both to headword tempate")
            headword_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            # Append ndef=sg to the outermost template only; a plain
            # str.replace("}}", ...) would also inject it into any template
            # nested inside the declension template's arguments.
            generate_template_with_ndef = re.sub(
                r"\}\}$", "|ndef=sg}}", generate_template)
            generate_template_with_ndef = re.sub(
                r"\|n=s[^=|{}]*", "", generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = blib.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg(
                        "WARNING: Something wrong: Found n=%s, not singular"
                        % existing_n)
                else:
                    pagemsg("Removing n=sg from headword tempate")
                    rmparam(headword_template, "n")
            else:
                pagemsg(
                    "WARNING: Unable to remove n= from headword template because n=%s"
                    % ndef_args["n"])

    headword_template.params.extend(params_to_preserve)

    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        ru_noun_changed = 1
    else:
        headword_template.name = "ru-proper noun+"
        ru_proper_noun_changed = 1
    if unicode(noun_table_template).startswith("{{ru-noun-old"):
        headword_template.add("old", "1")

    pagemsg("Replacing headword %s with %s"
            % (orig_headword_template, unicode(headword_template)))

    return (unicode(parsed), ru_noun_changed, ru_proper_noun_changed,
            bian_replaced, frobbed_manual_translit)