corverb for corverb in corverbs if corverb.startswith("*")) if isrefl: refverb = re.sub(u"с[ья]$", "", verb) + gender_arg correfverbs = [] for corverb in corverbs: correfverbs.append( "%s|g=%s" % (re.sub(u"с[ья]$", "", re.sub(r"^\*", "", corverb)), "impf" if headword_aspect == "pf" or corverb.startswith("*") else "pf")) else: refverb = (re.search(u"и́?$", verb) and verb + u"сь" or rulib.try_to_stress(verb) + u"ся") + gender_arg correfverbs = [] for corverb in corverbs: impf_override = corverb.startswith("*") corverb = re.sub(r"^\*", "", corverb) correfverbs.append("%s|g=%s" % ( (re.search(u"и́?$", corverb) and corverb + u"сь" or rulib.try_to_stress(corverb) + u"ся"), "impf" if headword_aspect == "pf" or impf_override else "pf")) if headword_aspect == "pf" or corverb_impf_override: refverbs = correfverbs + [refverb] else: refverbs = [refverb] + correfverbs
def try_to_stress(form):
    """Run ``rulib.try_to_stress`` on a form that may carry a ``//`` suffix.

    When ``form`` contains ``//``, only the text before the first ``//`` is
    passed to ``rulib.try_to_stress``; the text after it is re-attached
    unchanged. Otherwise the whole form is passed through.

    Args:
        form: The string to stress, optionally of the shape ``TEXT//SUFFIX``
            (the suffix is presumably a manual transliteration -- see the
            FIXME below).

    Returns:
        The result of ``rulib.try_to_stress`` on the leading part, followed
        by ``//`` and the untouched suffix when one was present.
    """
    # str.partition splits at the first "//" regardless of what else the
    # string contains. A regex using "." and "$" would fail to match (or drop
    # a trailing newline) if the form ever contained a line break.
    head, separator, tail = form.partition("//")
    if separator:
        # FIXME: This should stress the translit as well
        return rulib.try_to_stress(head) + "//" + tail
    return rulib.try_to_stress(form)
def convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg, headword_template=None):
    """Translate a ru-decl-noun-z template into a fresh ru-noun-table template.

    The gender, stress and lemma arguments of the z-decl are dropped when
    they match what this function computes as the default; animacy, number,
    case overrides, locative, partitive and note are carried over (some
    under a new name or in canonicalized form). Progress and warnings are
    reported through ``pagemsg``.

    Args:
        decl_z_template: The parsed z-decl template to convert. It is only
            read; consumed parameters are removed from a private copy.
        subpagetitle: Page title; when its stressed form equals the stressed
            lemma, the lemma argument is left out of the result.
        pagemsg: Callable taking a single message string.
        headword_template: Optional headword template; when given it is
            passed to ``extract_headword_anim_spec`` to cross-check animacy.

    Returns:
        The newly built ru-noun-table template, or ``None`` when the
        gender/animacy spec is not recognized or the lemma is empty.
    """
    zdecl = unicode(decl_z_template)
    # Scratch copy: each parameter we consume is removed from it, so whatever
    # remains at the end is a parameter this converter did not handle.
    unconsumed = blib.parse_text(zdecl).filter_templates()[0]
    decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]

    def consume(param):
        # Mark the parameter as handled, then read it from the original.
        rmparam(unconsumed, param)
        return getparam(decl_z_template, param).strip()

    # Positional arguments look like:
    # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
    # {{ru-decl-noun-z|ёж|m-inan|b}}
    zlemma = consume("1")
    gender_anim_spec = consume("2")
    zstress = consume("3")
    zspecial = re.sub(u"ё", u";ё", consume("4"))

    spec_match = re.search(r"^([mfn])-(an|in|inan)$", gender_anim_spec)
    if spec_match is None:
        pagemsg("WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s" %
                gender_anim_spec)
        return None
    zgender = spec_match.group(1)
    zanim = spec_match.group(2)

    if not zlemma:
        pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
        return None

    # --- Gender: keep it only when it cannot be inferred from the lemma ---
    a_ya_final = re.search(u"[яа]́?$", zlemma)
    keep_gender = bool(
        re.search(u"[иы]́?$", zlemma)
        or (zgender == "n" and a_ya_final)
        or (zgender == "m" and a_ya_final and "(1)" in zspecial)
        or zlemma.endswith(u"ь"))
    if not keep_gender:
        if re.search(u"[оеё]́?$", zlemma):
            expected_gender = "n"
        elif re.search(u"[ая]́?$", zlemma):
            expected_gender = "f"
        else:
            expected_gender = "m"
        if expected_gender != zgender:
            pagemsg("WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender" %
                    (expected_gender, zgender))
            keep_gender = True
    if keep_gender:
        pagemsg("Preserving gender in z-decl: %s" % zdecl)
        zspecial = zgender + zspecial
    else:
        pagemsg("Not preserving gender in z-decl: %s" % zdecl)

    # --- Stress: drop it when it equals the default for this lemma ---
    stressed_lemma = ru.try_to_stress(zlemma)
    if ru.is_nonsyllabic(stressed_lemma):
        default_stress, default_reason = "b", "nonsyllabic lemma"
    elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
        default_stress, default_reason = "b", "ending-accented lemma"
    else:
        # No need for special-casing for ёнок or а́нин, as they are considered
        # accent a by ru-decl-noun-z
        default_stress, default_reason = "a", "stem-accented lemma"
    if zstress == default_stress:
        pagemsg("Removing stress %s as default because %s: stressed_lemma=%s, template=%s" %
                (default_stress, default_reason, stressed_lemma, zdecl))
        zstress = ""
    else:
        pagemsg("Not removing stress %s: %s" % (zstress, zdecl))

    # --- Lemma: drop it when the page title already supplies it ---
    if ru.try_to_stress(subpagetitle) == stressed_lemma:
        pagemsg(u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s" %
                (zlemma, subpagetitle, zdecl))
        zlemma = ""

    # Emit [stress,] lemma, special as consecutive numbered arguments.
    positional_values = [zlemma, zspecial]
    if zstress:
        positional_values.insert(0, zstress)
    for position, value in enumerate(positional_values, 1):
        decl_template.add(str(position), value)
    # Strip empty numbered arguments, highest number first.
    for position_name in ("3", "2", "1"):
        if not getparam(decl_template, position_name):
            rmparam(decl_template, position_name)

    # --- Animacy ---
    if headword_template:
        headword_anim_spec = extract_headword_anim_spec(headword_template)
    else:
        headword_anim_spec = headword_template

    def warn_if_anim_conflicts(zdecl_an, allowed_headword_ans):
        if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
            pagemsg("WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s" %
                    (zdecl_an, headword_anim_spec,
                     ",".join(allowed_headword_ans), zdecl,
                     unicode(headword_template)))

    if zanim == "an":
        warn_if_anim_conflicts(zanim, ["an"])
        pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
        decl_template.add("a", "an")
    elif zanim == "inan":
        warn_if_anim_conflicts(zanim, ["ai", "ia"])
        if headword_anim_spec in ("ai", "ia"):
            pagemsg("Converting z-decl -inan to a=%s: %s" %
                    (headword_anim_spec, zdecl))
            decl_template.add("a", headword_anim_spec)
        else:
            pagemsg("WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s" %
                    (zdecl, unicode(headword_template or "(no headword)")))
            decl_template.add("a", "bi")
    else:
        assert zanim == "in"
        warn_if_anim_conflicts(zanim, ["in"])
        pagemsg("Dropping z-decl -in as default: %s" % zdecl)

    # --- Number ---
    znum = consume("n")
    if znum:
        if znum == "pl":
            pagemsg("WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s" %
                    zdecl)
        pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
        decl_template.add("n", znum)

    # --- Per-case overrides ---
    override_names = (
        'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg',
        'nom_pl', 'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl',
        'voc',
    )
    override_renames = {'prp_sg': 'pre_sg', 'prp_pl': 'pre_pl'}
    for old_name in override_names:
        old_value = consume(old_name)
        if not old_value:
            continue
        # Split on commas and unwrap any piece that is just a bare [[link]].
        pieces = re.split(r"\s*,\s*", fixup_link(old_value))
        new_value = ",".join(
            re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", piece) for piece in pieces)
        new_name = override_renames.get(old_name, old_name)
        if new_name == old_name:
            rename_note = ""
        else:
            rename_note = "; renamed from %s" % old_name
        if new_value == old_value:
            canon_note = ""
        else:
            canon_note = "; canonicalized from %s=%s" % (old_name, old_value)
        pagemsg("Preserving z-decl override %s=%s%s%s: %s" %
                (new_name, new_value, rename_note, canon_note, zdecl))
        decl_template.add(new_name, new_value)

    # --- Locative ---
    loc = consume("loc")
    if loc:
        # Any value other than the two listed prepositions maps to both.
        newloc = {u"в": u"в +", u"на": u"на +"}.get(loc, u"в/на +")
        pagemsg("Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s" %
                (newloc, loc, zdecl))
        decl_template.add("loc", newloc)

    # --- Partitive ---
    par = consume("par")
    if par:
        newpar = "+"
        pagemsg("Preserving z-decl partitive par=%s (canonicalized from par=%s): %s" %
                (newpar, par, zdecl))
        decl_template.add('par', newpar)

    # --- Notes ---
    notes = consume("note")
    if notes:
        pagemsg("WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s" %
                (notes, zdecl))
        decl_template.add('notes', notes)

    # Anything still present on the scratch copy was never consumed above.
    if unconsumed.params:
        pagemsg("WARNING: Extraneous params in z-decl: %s" % unicode(unconsumed))

    #pagemsg("Replacing z-decl %s with regular decl %s" %
    #    (zdecl, unicode(decl_template)))
    return decl_template
def convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg, headword_template=None):
    """Translate a ru-decl-noun-z template into a fresh ru-noun-table template.

    The gender, stress and lemma arguments of the z-decl are dropped when
    they match what this function computes as the default; animacy, number,
    case overrides, locative, partitive and note are carried over (some
    under a new name or in canonicalized form). Progress and warnings are
    reported through ``pagemsg``.

    Args:
        decl_z_template: The parsed z-decl template to convert. It is only
            read; consumed parameters are removed from a private copy.
        subpagetitle: Page title; when its stressed form equals the stressed
            lemma, the lemma argument is left out of the result.
        pagemsg: Callable taking a single message string.
        headword_template: Optional headword template; when given it is
            passed to ``extract_headword_anim_spec`` to cross-check animacy.

    Returns:
        The newly built ru-noun-table template, or ``None`` when the
        gender/animacy spec is not recognized or the lemma is empty.
    """
    zdecl = unicode(decl_z_template)
    # Scratch copy: each parameter we consume is removed from it, so whatever
    # remains at the end is a parameter this converter did not handle.
    unconsumed = blib.parse_text(zdecl).filter_templates()[0]
    decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]

    def consume(param):
        # Mark the parameter as handled, then read it from the original.
        rmparam(unconsumed, param)
        return getparam(decl_z_template, param).strip()

    # Positional arguments look like:
    # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
    # {{ru-decl-noun-z|ёж|m-inan|b}}
    zlemma = consume("1")
    gender_anim_spec = consume("2")
    zstress = consume("3")
    zspecial = re.sub(u"ё", u";ё", consume("4"))

    spec_match = re.search(r"^([mfn])-(an|in|inan)$", gender_anim_spec)
    if spec_match is None:
        pagemsg("WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s" %
                gender_anim_spec)
        return None
    zgender = spec_match.group(1)
    zanim = spec_match.group(2)

    if not zlemma:
        pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
        return None

    # --- Gender: keep it only when it cannot be inferred from the lemma ---
    a_ya_final = re.search(u"[яа]́?$", zlemma)
    keep_gender = bool(
        re.search(u"[иы]́?$", zlemma)
        or (zgender == "n" and a_ya_final)
        or (zgender == "m" and a_ya_final and "(1)" in zspecial)
        or zlemma.endswith(u"ь"))
    if not keep_gender:
        if re.search(u"[оеё]́?$", zlemma):
            expected_gender = "n"
        elif re.search(u"[ая]́?$", zlemma):
            expected_gender = "f"
        else:
            expected_gender = "m"
        if expected_gender != zgender:
            pagemsg("WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender" %
                    (expected_gender, zgender))
            keep_gender = True
    if keep_gender:
        pagemsg("Preserving gender in z-decl: %s" % zdecl)
        zspecial = zgender + zspecial
    else:
        pagemsg("Not preserving gender in z-decl: %s" % zdecl)

    # --- Stress: drop it when it equals the default for this lemma ---
    stressed_lemma = rulib.try_to_stress(zlemma)
    if rulib.is_nonsyllabic(stressed_lemma):
        default_stress, default_reason = "b", "nonsyllabic lemma"
    elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
        default_stress, default_reason = "b", "ending-accented lemma"
    else:
        # No need for special-casing for ёнок or а́нин, as they are considered
        # accent a by ru-decl-noun-z
        default_stress, default_reason = "a", "stem-accented lemma"
    if zstress == default_stress:
        pagemsg("Removing stress %s as default because %s: stressed_lemma=%s, template=%s" %
                (default_stress, default_reason, stressed_lemma, zdecl))
        zstress = ""
    else:
        pagemsg("Not removing stress %s: %s" % (zstress, zdecl))

    # --- Lemma: drop it when the page title already supplies it ---
    if rulib.try_to_stress(subpagetitle) == stressed_lemma:
        pagemsg(u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s" %
                (zlemma, subpagetitle, zdecl))
        zlemma = ""

    # Emit [stress,] lemma, special as consecutive numbered arguments.
    positional_values = [zlemma, zspecial]
    if zstress:
        positional_values.insert(0, zstress)
    for position, value in enumerate(positional_values, 1):
        decl_template.add(str(position), value)
    # Strip empty numbered arguments, highest number first.
    for position_name in ("3", "2", "1"):
        if not getparam(decl_template, position_name):
            rmparam(decl_template, position_name)

    # --- Animacy ---
    if headword_template:
        headword_anim_spec = extract_headword_anim_spec(headword_template)
    else:
        headword_anim_spec = headword_template

    def warn_if_anim_conflicts(zdecl_an, allowed_headword_ans):
        if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
            pagemsg("WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s" %
                    (zdecl_an, headword_anim_spec,
                     ",".join(allowed_headword_ans), zdecl,
                     unicode(headword_template)))

    if zanim == "an":
        warn_if_anim_conflicts(zanim, ["an"])
        pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
        decl_template.add("a", "an")
    elif zanim == "inan":
        warn_if_anim_conflicts(zanim, ["ai", "ia"])
        if headword_anim_spec in ("ai", "ia"):
            pagemsg("Converting z-decl -inan to a=%s: %s" %
                    (headword_anim_spec, zdecl))
            decl_template.add("a", headword_anim_spec)
        else:
            pagemsg("WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s" %
                    (zdecl, unicode(headword_template or "(no headword)")))
            decl_template.add("a", "bi")
    else:
        assert zanim == "in"
        warn_if_anim_conflicts(zanim, ["in"])
        pagemsg("Dropping z-decl -in as default: %s" % zdecl)

    # --- Number ---
    znum = consume("n")
    if znum:
        if znum == "pl":
            pagemsg("WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s" %
                    zdecl)
        pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
        decl_template.add("n", znum)

    # --- Per-case overrides ---
    override_names = (
        'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg',
        'nom_pl', 'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl',
        'voc',
    )
    override_renames = {'prp_sg': 'pre_sg', 'prp_pl': 'pre_pl'}
    for old_name in override_names:
        old_value = consume(old_name)
        if not old_value:
            continue
        # Split on commas and unwrap any piece that is just a bare [[link]].
        pieces = re.split(r"\s*,\s*", fixup_link(old_value))
        new_value = ",".join(
            re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", piece) for piece in pieces)
        new_name = override_renames.get(old_name, old_name)
        if new_name == old_name:
            rename_note = ""
        else:
            rename_note = "; renamed from %s" % old_name
        if new_value == old_value:
            canon_note = ""
        else:
            canon_note = "; canonicalized from %s=%s" % (old_name, old_value)
        pagemsg("Preserving z-decl override %s=%s%s%s: %s" %
                (new_name, new_value, rename_note, canon_note, zdecl))
        decl_template.add(new_name, new_value)

    # --- Locative ---
    loc = consume("loc")
    if loc:
        # Any value other than the two listed prepositions maps to both.
        newloc = {u"в": u"в +", u"на": u"на +"}.get(loc, u"в/на +")
        pagemsg("Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s" %
                (newloc, loc, zdecl))
        decl_template.add("loc", newloc)

    # --- Partitive ---
    par = consume("par")
    if par:
        newpar = "+"
        pagemsg("Preserving z-decl partitive par=%s (canonicalized from par=%s): %s" %
                (newpar, par, zdecl))
        decl_template.add('par', newpar)

    # --- Notes ---
    notes = consume("note")
    if notes:
        pagemsg("WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s" %
                (notes, zdecl))
        decl_template.add('notes', notes)

    # Anything still present on the scratch copy was never consumed above.
    if unconsumed.params:
        pagemsg("WARNING: Extraneous params in z-decl: %s" % unicode(unconsumed))

    #pagemsg("Replacing z-decl %s with regular decl %s" %
    #    (zdecl, unicode(decl_template)))
    return decl_template
def try_to_stress(form):
    """Run ``ru.try_to_stress`` on a form that may carry a ``//`` suffix.

    When ``form`` contains ``//``, only the text before the first ``//`` is
    passed to ``ru.try_to_stress``; the text after it is re-attached
    unchanged. Otherwise the whole form is passed through.

    Args:
        form: The string to stress, optionally of the shape ``TEXT//SUFFIX``
            (the suffix is presumably a manual transliteration -- see the
            FIXME below).

    Returns:
        The result of ``ru.try_to_stress`` on the leading part, followed by
        ``//`` and the untouched suffix when one was present.
    """
    # str.partition splits at the first "//" regardless of what else the
    # string contains. A regex using "." and "$" would fail to match (or drop
    # a trailing newline) if the form ever contained a line break.
    head, separator, tail = form.partition("//")
    if separator:
        # FIXME: This should stress the translit as well
        return ru.try_to_stress(head) + "//" + tail
    return ru.try_to_stress(form)