def process_page(page, index, parsed):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

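    # Scan every template on the page and rewrite {{#invoke:form of}} calls
    # to the newer entry points, recording a note for each change made.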
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "#invoke:form of/templates" and getparam(
                t, "1") == "template_tags":
            t.add("1", "tagged_form_of_t")
            notes.append(
                "Rewrite {{#invoke:form of/templates|template_tags}} with {{#invoke:form of/templates|tagged_form_of_t}}"
            )
        if tn == "#invoke:form of" and getparam(
                t, "1") in ["form_of_t", "alt_form_of_t"]:
            ignorelist = blib.fetch_param_chain(t, "ignorelist", "ignorelist")
            if ignorelist:
                ignore = blib.fetch_param_chain(t, "ignore", "ignore")
                for il in ignorelist:
                    ignore.append(il + ":list")
                blib.set_param_chain(t,
                                     ignore,
                                     "ignore",
                                     "ignore",
                                     before="ignorelist")
                blib.remove_param_chain(t, "ignorelist", "ignorelist")
            blib.set_template_name(t, "#invoke:form of/templates")
            notes.append(
                "Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}"
                % getparam(t, "1"))
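        # alt_form_of_t passes its text in |text= and uses nocap=/nodot=;
        # convert to the form_of_t convention (positional text, withcap=/withdot=).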
        if tn == "#invoke:form of" and getparam(t, "1") == "alt_form_of_t":
            t.add("2", getparam(t, "text"), before="text")
            rmparam(t, "text")
            if t.has("nocap"):
                rmparam(t, "nocap")
            else:
                t.add("withcap", "1")
            if t.has("nodot"):
                rmparam(t, "nodot")
            else:
                t.add("withdot", "1")
            t.add("1", "form_of_t")

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Example #2
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  head = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
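    # Add an alternative genitive singular in -и alongside an existing
    # genitive in -і, both in {{uk-noun}} and in the declension tables.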
    if tn == "uk-noun":
      gen = blib.fetch_param_chain(t, "3", "gen")
      if len(gen) == 1 and gen[0].endswith(u"і"):
        gen2 = gen[0][0:-1] + u"и"
        t.add("gen2", gen2, before="4")
    elif tn in ["uk-decl-noun", "uk-decl-noun-unc", "uk-decl-noun-pl"]:
      gensparam = 3 if tn == "uk-decl-noun" else 2
      gens = getparam(t, str(gensparam))
      if "," not in gens and gens.endswith(u"і"):
        gens += ", " + gens[0:-1] + u"и"
        t.add(str(gensparam), gens)
    if origt != unicode(t):
      notes.append(u"add alternative genitive singular to Ukrainian nouns ending in -сть")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  return unicode(parsed), notes
Example #3
def get_head_param(t, pagetitle):
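    # Return the head(s) of an Old English headword template ({{ang-*}} or
    # {{head|ang}}), falling back to the page title when no explicit head is
    # given; return None for any other template.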
    tn = tname(t)
    if tn in [
            "ang-adj", "ang-adj-comp", "ang-adj-sup", "ang-adv",
            "ang-adv-comp", "ang-adv-sup", "ang-verb"
    ]:
        retval = blib.fetch_param_chain(t, "1", "head")
    elif tn in [
            "ang-noun", "ang-noun-form", "ang-verb-form", "ang-adj-form",
            "ang-con", "ang-prep", "ang-prefix", "ang-proper noun",
            "ang-suffix"
    ]:
        retval = blib.fetch_param_chain(t, "head", "head")
    elif tn == "head" and getparam(t, "1") == "ang":
        retval = blib.fetch_param_chain(t, "head", "head")
    else:
        return None
    return retval or [pagetitle]
Example #4
  def process_verb_headt(t):
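    # Normalize a {{be-verb}} headword to the new convention: head in 1=,
    # aspect in 2=, keeping tr=/pf=/impf=.  Relies on pagemsg, pagetitle and
    # notes from the enclosing scope.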
    origt = unicode(t)
    def getp(param):
      return getparam(t, param)
    tr = getp("tr")
    if getp("2"):
      head = getp("1")
      g = getp("2")
    else:
      head = getp("head")
      g = getp("1") or getp("a")
    pf = blib.fetch_param_chain(t, "pf", "pf")
    impf = blib.fetch_param_chain(t, "impf", "impf")
    must_continue = False
    for param in t.params:
      pn = pname(param)
      if pn not in ["head", "tr", "1", "a", "2", "pf", "pf2", "pf3",
          "impf", "impf2", "impf3"]:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
            (pn, unicode(param.value), origt))
        must_continue = True
        break
    if must_continue:
      return False
    del t.params[:]
    if not head:
      head = pagetitle
    if belib.needs_accents(head):
      pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
    if not g:
      pagemsg("WARNING: No aspect in verb headword: %s" % origt)
      g = "?"
    t.add("1", head)
    if tr:
      t.add("tr", tr)
    t.add("2", g)
    blib.set_param_chain(t, pf, "pf", "pf")
    blib.set_param_chain(t, impf, "impf", "impf")

    if origt != unicode(t):
      notes.append("fix up {{be-verb}} to use new param convention")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return True
def process_page(index, page, save, verbose):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    parsed = blib.parse(page)
    found_page_head = False
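    # Report any Russian headword or declension form that carries secondary stress.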
    for t in parsed.filter_templates():
        found_this_head = False
        if tname(t) in ru_normal_head_templates:
            heads = blib.fetch_param_chain(t, "1", "head")
            for head in heads:
                if has_secondary_stress(head):
                    pagemsg("Found secondarily stressed head %s in %s" %
                            (head, unicode(t)))
        elif tname(t) == "head" and getparam(t, "1") == "ru":
            heads = blib.fetch_param_chain(t, "head", "head")
            for head in heads:
                if has_secondary_stress(head):
                    pagemsg("Found secondarily stressed head %s in %s" %
                            (head, unicode(t)))
        elif tname(t) in [
                "ru-noun+", "ru-proper noun+", "ru-noun-table", "ru-noun-old"
        ]:
            per_word_objs = runounlib.split_noun_decl_arg_sets(t, pagemsg)
            for per_word in per_word_objs:
                for arg_set in per_word:
                    if has_secondary_stress(arg_set[1]):
                        pagemsg("Found secondarily stressed head %s in %s" %
                                (arg_set[1], unicode(t)))
        elif tname(t) == "ru-decl-adj":
            head = getparam(t, "1")
            if has_secondary_stress(head):
                pagemsg("Found secondarily stressed head %s in %s" %
                        (head, unicode(t)))
Example #6
def process_page(index, page, save, verbose, nouns):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

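    # Only adjectives in -ый/-ий with a corresponding noun in -ость are of
    # interest; for each such noun lacking an etymology, emit a directive line.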
    if not re.search(u"[иы]й$", pagetitle):
        pagemsg(u"Skipping adjective not in -ый or -ий")
        return

    noun = re.sub(u"[иы]й$", u"ость", pagetitle)
    if noun not in nouns:
        return

    text = unicode(page.text)
    parsed = blib.parse(page)

    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname == u"ru-adj-alt-ё":
            pagemsg(u"Skipping alt-ё adjective")
            return

    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname == "ru-adj":
            heads = blib.fetch_param_chain(t, "1", "head", pagetitle)
            if len(heads) > 1:
                pagemsg("Skipping adjective with multiple heads: %s" %
                        ",".join(heads))
                return
            tr = getparam(t, "tr")

            nounsection = blib.find_lang_section(noun, "Russian", pagemsg,
                                                 errandpagemsg)
            if not nounsection:
                pagemsg("Couldn't find Russian section for %s" % noun)
                continue
            if "==Etymology" in nounsection:
                pagemsg("Noun %s already has etymology" % noun)
                continue
            if tr:
                msg(u"%s %s+tr1=%s+-ость no-etym" % (noun, heads[0], tr))
            else:
                msg(u"%s %s+-ость no-etym" % (noun, heads[0]))
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  parsed = blib.parse(page)
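  # Check that every page referenced by {{R:vep:UVVV}} actually exists.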
  for t in parsed.filter_templates():
    if unicode(t.name) == "R:vep:UVVV":
      refpages = blib.fetch_param_chain(t, "1", "")
      for refpage in refpages:
        if not pywikibot.Page(site, refpage).exists():
          pagemsg("Page [[%s]] does not exist" % refpage)
def process_page(page, index):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    if unicode(t.name) == "R:vep:UVVV":
      refpages = blib.fetch_param_chain(t, "1", "")
      for refpage in refpages:
        if not pywikibot.Page(site, refpage).exists():
          pagemsg("Page [[%s]] does not exist" % refpage)
def find_noun_lemmas(parsed, pagetitle, errandpagemsg, expand_text):
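    # Collect all noun lemmas on the page: use rulib.fetch_noun_lemma() to get
    # the generated lemmas of {{ru-noun+}}/{{ru-proper noun+}}, or read
    # 1=/head= from {{ru-noun}}/{{ru-proper noun}}.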
    noun_lemmas = []
    for t in parsed.filter_templates():
        if tname(t) in ["ru-noun+", "ru-proper noun+"]:
            lemmaarg = rulib.fetch_noun_lemma(t, expand_text)
            if lemmaarg is None:
                errandpagemsg("WARNING: Error generating noun forms: %s" %
                              unicode(t))
                return
            else:
                for lemma in re.split(",", lemmaarg):
                    add_if_not(noun_lemmas, lemma)
        elif tname(t) in ["ru-noun", "ru-proper noun"]:
            for lemma in blib.fetch_param_chain(t, "1", "head", pagetitle):
                add_if_not(noun_lemmas, lemma)
    return noun_lemmas
Example #10
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
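  # Drop informal comparatives in -ей from {{ru-adj}} when the corresponding
  # regular comparative in -ее is also listed.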
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ru-adj":
      comps = blib.fetch_param_chain(t, "2", "comp")
      newcomps = []
      for comp in comps:
        if re.search(u"е́?й$", comp):
          regcomp = re.sub(u"(е́?)й$", ur"\1е", comp)
          if regcomp in newcomps:
            pagemsg("Skipping informal form %s" % comp)
            notes.append("remove informal comparative %s" % comp)
          else:
            pagemsg("WARNING: Found informal form %s without corresponding regular form")
            newcomps.append(comp)
        else:
          newcomps.append(comp)
      if comps != newcomps:
        blib.set_param_chain(t, newcomps, "2", "comp")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Example #11
def extract_headword_anim_spec(headword_template):
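  # Derive the animacy spec from the headword genders in 2=/g=: "in", "an",
  # "ia" (inanimate listed first) or "ai" (animate listed first), else None.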
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  saw_in = -1
  saw_an = -1
  for i,g in enumerate(genders):
    if re.search(r"\bin\b", g) and saw_in < 0:
      saw_in = i
    if re.search(r"\ban\b", g) and saw_an < 0:
      saw_an = i
  if saw_in >= 0 and saw_an >= 0 and saw_in < saw_an:
    return "ia"
  elif saw_in >= 0 and saw_an >= 0:
    return "ai"
  elif saw_an >= 0:
    return "an"
  elif saw_in >= 0:
    return "in"
  else:
    return None
Example #12
def extract_headword_anim_spec(headword_template):
    genders = blib.fetch_param_chain(headword_template, "2", "g")
    saw_in = -1
    saw_an = -1
    for i, g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
            saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
            saw_an = i
    if saw_in >= 0 and saw_an >= 0 and saw_in < saw_an:
        return "ia"
    elif saw_in >= 0 and saw_an >= 0:
        return "ai"
    elif saw_an >= 0:
        return "an"
    elif saw_in >= 0:
        return "in"
    else:
        return None
Example #13
def find_adj(pagename, pagemsg, errandpagemsg, expand_text):
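  # Locate the {{ru-adj}} lemma in the Russian section of PAGENAME; return -1
  # if that section already has an etymology, None if the section is missing
  # or a single template lists multiple heads.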
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  if "==Etymology" in section:
    return -1
  parsed = blib.parse_text(section)
  adjs = []
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-adj":
      heads = blib.fetch_param_chain(t, "1", "head", pagename)
      if len(heads) > 1:
        pagemsg("WARNING: Multiple lemmas for adjective: %s" % ",".join(heads))
        return None
      if heads[0] not in adjs:
        adjs.append(heads[0])
  if len(adjs) > 1:
    pagemsg("WARNING: Multiple lemmas for adjective: %s" % ",".join(adjs))
  if not adjs:
    return None
  return adjs[0]
Example #14
def process_page(page, index, parsed):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) == "ru-adj":
            comps = blib.fetch_param_chain(t, "2", "comp")
            newcomps = []
            for comp in comps:
                if re.search(u"е́?й$", comp):
                    regcomp = re.sub(u"(е́?)й$", ur"\1е", comp)
                    if regcomp in newcomps:
                        pagemsg("Skipping informal form %s" % comp)
                        notes.append("remove informal comparative %s" % comp)
                    else:
                        pagemsg(
                            "WARNING: Found informal form %s without corresponding regular form"
                            % comp)
                        newcomps.append(comp)
                else:
                    newcomps.append(comp)
            if comps != newcomps:
                blib.set_param_chain(t, newcomps, "2", "comp")
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Example #15
def check_old_noun_headword_forms(headword_template, args, subpagetitle, pagemsg, laxer_comparison=False):
  # FORM1 is the forms from ru-noun (or ru-proper noun); FORM2 is the combined
  # set of forms from ru-noun-table, and needs to be split on commas.
  # FORM1_LEMMA is true if the FORM1 values come from the ru-noun lemma.
  def compare_forms(case, form1, form2, form1_lemma=False):
    # Split on individual words and allow monosyllabic accent differences.
    # FIXME: Will still have problems with [[X|Y]].
    def compare_single_form(f1, f2):
      words1 = re.split("[ -]", f1)
      words2 = re.split("[ -]", f2)
      if len(words1) != len(words2):
        return None
      for i in xrange(len(words1)):
        if words1[i] != words2[i]:
          w1 = fixup_link(words1[i])
          w2 = words2[i]
          # Allow case where existing is monosyllabic and missing a stress
          # compared with proposed
          w1 = {w1, try_to_stress(w1)}
          # Allow case where existing is missing a link as compared to
          # proposed (but not other way around; we don't want a link
          # disappearing)
          w2 = {w2, blib.remove_links(w2)}
          if not (w1 & w2):
            return None
      return True
    form1 = [fixup_link(re.sub(u"ё́", u"ё", x)) for x in form1]
    form2 = re.split(",", form2)
    if laxer_comparison or not form1_lemma:
      # Ignore manual translit in decl forms when comparing non-lemma forms;
      # not available from ru-noun (and not displayed anyway); also when
      # laxer_comparison is set, which happens in add_noun_decl
      form2 = [re.sub("//.*$", "", x) for x in form2]
    # If existing value missing, OK; also allow for unstressed monosyllabic
    # existing form matching stressed monosyllabic new form
    if form1:
      if (set(form1) == set(form2) or
          set(try_to_stress(x) for x in form1) == set(form2) or
          len(form1) == 1 and len(form2) == 1 and compare_single_form(form1[0], form2[0])):
        pass
      else:
        pagemsg("WARNING: case %s, existing forms %s not same as proposed %s" %(
            case, ",".join(form1), ",".join(form2)))
        return None
    return True

  def compare_genders(g1, g2):
    if set(g1) == set(g2):
      return True
    if len(g1) == 1 and len(g2) == 1:
      # If genders don't match exactly, check if existing gender is missing
      # animacy and allow that, so it gets overwritten with new gender
      if g1[0] == re.sub("-(an|in)", "", g2[0]):
        pagemsg("Existing gender %s missing animacy spec compared with proposed %s, allowed" % (
          ",".join(g1), ",".join(g2)))
        return True
    return None

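  # Attach manual translits to their headwords, then compare the existing
  # headword forms and genders against the proposed declension output for the
  # cases relevant to the number spec.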
  headwords = blib.fetch_param_chain(headword_template, "1", "head", subpagetitle)
  translits = blib.fetch_param_chain(headword_template, "tr", "tr")
  for i in xrange(len(translits)):
    if len(headwords) <= i:
      pagemsg("WARNING: Not enough headwords for translit tr%s=%s, skipping" % (
        "" if i == 0 else str(i+1), translits[i]))
      return None
    else:
      headwords[i] += "//" + translits[i]
  genitives = blib.fetch_param_chain(headword_template, "3", "gen")
  plurals = blib.fetch_param_chain(headword_template, "4", "pl")
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  cases_to_check = None
  if args["n"] == "s":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg"]
  elif args["n"] == "p":
    if (not compare_forms("nom_pl", headwords, args["nom_pl_linked"], True) or
        not compare_forms("gen_pl", genitives, args["gen_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_pl", "gen_pl"]
  elif args["n"] == "b":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"]) or
        not compare_forms("nom_pl", plurals, args["nom_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg", "nom_pl"]
  else:
    pagemsg("WARNING: Unrecognized number spec %s, skipping" % args["n"])
    return None

  for case in cases_to_check:
    raw_case = re.sub(u"△", "", blib.remove_links(args[case + "_raw"]))
    if args[case] != raw_case:
      pagemsg("WARNING: Raw case %s=%s contains footnote symbol" % (
        case, args[case + "_raw"]))

  proposed_genders = re.split(",", args["g"])
  if compare_genders(genders, proposed_genders):
    genders = []
  else:
    # Check for animacy mismatch, punt if so
    cur_in = [x for x in genders if re.search(r"\bin\b", x)]
    cur_an = [x for x in genders if re.search(r"\ban\b", x)]
    proposed_in = [x for x in proposed_genders if re.search(r"\bin\b", x)]
    proposed_an = [x for x in proposed_genders if re.search(r"\ban\b", x)]
    if (cur_in or not cur_an) and proposed_an or (cur_an or not cur_in) and proposed_in:
      pagemsg("WARNING: Animacy mismatch, skipping: cur=%s proposed=%s" % (
        ",".join(genders), ",".join(proposed_genders)))
      return None
    # Check for number mismatch, punt if so
    cur_pl = [x for x in genders if re.search(r"\bp\b", x)]
    if cur_pl and args["n"] != "p" or not cur_pl and args["n"] == "p":
      pagemsg("WARNING: Number mismatch, skipping: cur=%s, proposed=%s, n=%s" % (
        ",".join(genders), ",".join(proposed_genders), args["n"]))
      return None
    pagemsg("WARNING: Gender mismatch, existing=%s, new=%s" % (
      ",".join(genders), ",".join(proposed_genders)))

  return genders
Example #16
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    notes = []

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)

    saw_a_template = False

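    # For each subsection, pair the {{la-verb}} headword with its {{la-conj}}
    # template, check that lemma/perfect/supine and conjugation class agree,
    # then copy the conjugation params onto the headword template.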
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        la_verb_template = None
        la_conj_template = None
        must_continue = False
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "la-conj":
                if la_conj_template:
                    pagemsg(
                        "WARNING: Saw multiple verb conjugation templates in subsection, %s and %s, skipping"
                        % (unicode(la_conj_template), unicode(t)))
                    must_continue = True
                    break
                la_conj_template = t
                saw_a_template = True
            if tn == "la-verb":
                if la_verb_template:
                    pagemsg(
                        "WARNING: Saw multiple verb headword templates in subsection, %s and %s, skipping"
                        % (unicode(la_verb_template), unicode(t)))
                    must_continue = True
                    break
                la_verb_template = t
                saw_a_template = True
        if must_continue:
            continue
        if not la_verb_template and not la_conj_template:
            continue
        if la_verb_template and not la_conj_template:
            pagemsg(
                "WARNING: Saw verb headword template but no conjugation template: %s"
                % unicode(la_verb_template))
            continue
        if la_conj_template and not la_verb_template:
            pagemsg(
                "WARNING: Saw verb conjugation template but no headword template: %s"
                % unicode(la_conj_template))
            continue

        orig_la_verb_template = unicode(la_verb_template)
        if re.search(r"^(irreg|[0-9]\+*)(\..*)?$",
                     getparam(la_verb_template, "1")):
            pagemsg("Found new-style verb headword template, skipping: %s" %
                    orig_la_verb_template)
            continue

        def render_headword_and_conj():
            return "headword template <from> %s <to> %s <end>, conjugation template <from> %s <to> %s <end>" % (
                orig_la_verb_template, orig_la_verb_template,
                unicode(la_conj_template), unicode(la_conj_template))

        verb_props = new_generate_verb_forms(unicode(la_conj_template),
                                             errandpagemsg,
                                             expand_text,
                                             include_props=True)
        if verb_props is None:
            continue
        subtypes = [
            x.replace("-", "") for x in safe_split(verb_props["subtypes"], ".")
        ]
        conj_type = verb_props["conj_type"]
        conj_subtype = verb_props.get("conj_subtype", None)

        def compare_headword_conj_forms(id_slot,
                                        headword_forms,
                                        conj_slots,
                                        adjust_for_missing_perf_forms=False,
                                        remove_conj_links=False):
            conj_forms = ""
            for slot in conj_slots:
                if slot in verb_props:
                    conj_forms = verb_props[slot]
                    break
            conj_forms = safe_split(conj_forms, ",")
            if remove_conj_links:
                conj_forms = [blib.remove_links(x) for x in conj_forms]
            corrected_headword_forms = [
                lengthen_ns_nf(x) for x in headword_forms
            ]
            corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
            if adjust_for_missing_perf_forms:
                # There are several instances of 4++ verbs where only the -īvī variant,
                # not the -iī variant, is listed in the headword. Don't get tripped up
                # by that.
                ivi_conj_forms = [
                    x for x in corrected_conj_forms if x.endswith(u"īvī")
                ]
                for ivi_conj_form in ivi_conj_forms:
                    ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
                    if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
                        corrected_headword_forms.append(ii_conj_form)
            if set(corrected_headword_forms) != set(corrected_conj_forms):
                macronless_headword_forms = set(
                    lalib.remove_macrons(x) for x in corrected_headword_forms)
                macronless_conj_forms = set(
                    lalib.remove_macrons(x) for x in corrected_conj_forms)
                if macronless_headword_forms == macronless_conj_forms:
                    pagemsg(
                        "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
                        % (id_slot, ",".join(headword_forms), id_slot,
                           ",".join(conj_forms), render_headword_and_conj()))
                else:
                    pagemsg(
                        "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
                        % (id_slot, ",".join(headword_forms), id_slot,
                           ",".join(conj_forms), render_headword_and_conj()))
                return False
            return True

        verb_conj = getparam(la_verb_template, "conj") or getparam(
            la_verb_template, "c")
        pattern = getparam(la_verb_template, "pattern")
        lemma = blib.fetch_param_chain(la_verb_template,
                                       ["1", "head", "head1"], "head")
        inf = blib.fetch_param_chain(la_verb_template, ["2", "inf", "inf1"],
                                     "inf")
        perf = blib.fetch_param_chain(la_verb_template, ["3", "perf", "perf1"],
                                      "perf")
        sup = blib.fetch_param_chain(la_verb_template, ["4", "sup", "sup1"],
                                     "sup")
        # Hack to handle cases like abeō where the headword normally lists perfect
        # abiī but the conj lists abiī, abīvī.
        if verb_conj == "irreg" and len(lemma) > 0 and lemma[0].endswith(
                u"eō"):
            ivi = re.sub(u"eō$", u"īvī", lemma[0])
            if ivi not in perf:
                perf.append(ivi)
        if not compare_headword_conj_forms("lemma", lemma, [
                "1s_pres_actv_indc", "3s_pres_actv_indc", "1s_perf_actv_indc",
                "3s_perf_actv_indc"
        ]):
            continue
        if "depon" in subtypes or "semidepon" in subtypes:
            if sup:
                pagemsg(
                    "WARNING: Saw supine in conjunction with deponent verb, skipping: %s"
                    % render_headword_and_conj())
                continue
            sup = [re.sub("[sm]( (sum|est))?$", "m", x) for x in perf]
        else:
            if not compare_headword_conj_forms(
                    "perfect",
                    perf,
                ["1s_perf_actv_indc", "3s_perf_actv_indc"],
                    adjust_for_missing_perf_forms=True,
                    # Remove links from perfect to handle cases like adsoleō where the
                    # perfect is adsoluī,[[adsolitus]] [[sum]] and the headword says
                    # adsoluī,adsolitus sum.
                    remove_conj_links=True):
                continue
        if len(sup) > 0 and sup[0].endswith(u"ūrus"):
            if not compare_headword_conj_forms("future participle", sup,
                                               ["futr_actv_ptc"]):
                continue
            if "supfutractvonly" not in subtypes:
                if len(lemma) > 0 and lemma[0].endswith("sum"):
                    pass
                else:
                    pagemsg(
                        "WARNING: Expected supfutractvonly in subtypes=%s, skipping: %s"
                        % (".".join(
                            sorted(subtypes)), render_headword_and_conj()))
                    continue
        else:
            if not compare_headword_conj_forms("supine", sup, ["sup_acc"]):
                continue
        if not verb_conj:
            pagemsg("WARNING: No conj in headword template: %s" %
                    render_headword_and_conj())
        else:
            conj_type_to_verb_conj = {
                "1st": "1",
                "2nd": "2",
                "3rd": "3",
                "3rd-io": "io",
                "4th": "4",
                "irreg": "irreg",
            }
            if conj_type not in conj_type_to_verb_conj:
                pagemsg(
                    "WARNING: Something wrong, saw unrecognized conj_type=%s: %s"
                    % (conj_type, render_headword_and_conj()))
                continue
            conj_type = conj_type_to_verb_conj[conj_type]
            if conj_subtype:
                if conj_subtype not in conj_type_to_verb_conj:
                    pagemsg(
                        "WARNING: Something wrong, saw unrecognized conj_subtype=%s"
                        % (conj_subtype, render_headword_and_conj()))
                    continue
                conj_subtype = conj_type_to_verb_conj[conj_subtype]
            if verb_conj != conj_type and verb_conj != conj_subtype:
                pagemsg(
                    "WARNING: Conjugation template has conj=%s, subconj=%s but headword template has conj=%s, skipping: %s"
                    % (conj_type, conj_subtype, verb_conj,
                       render_headword_and_conj()))
                continue
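        # Normalize old-style pattern= values to the concatenated subtype
        # names returned by the conjugation module so the two sets can be
        # compared directly.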
        pattern = pattern.replace("opt-semi-depon", "optsemidepon")
        pattern = pattern.replace("semi-depon", "semidepon")
        pattern = pattern.replace("pass-3only", "pass3only")
        pattern = pattern.replace("pass-impers", "passimpers")
        pattern = pattern.replace("no-actv-perf", "noactvperf")
        pattern = pattern.replace("no-pasv-perf", "nopasvperf")
        pattern = pattern.replace("perf-as-pres", "perfaspres")
        pattern = pattern.replace("short-imp", "shortimp")
        pattern = pattern.replace("sup-futr-actv-only", "supfutractvonly")
        pattern = safe_split(pattern, "-")
        pattern = [
            x for x in pattern if x not in
            ["noperf", "nosup", "irreg", "def", "facio", "shortimp", "depon"]
        ]
        subtypes = [
            x for x in subtypes
            if x not in ["I", "noperf", "nosup", "irreg", "depon"]
        ]
        if len(lemma) > 0 and lemma[0].endswith("sum"):
            # This is added automatically by [[sum]]
            subtypes = [x for x in subtypes if x != "supfutractvonly"]
        if set(pattern) != set(subtypes):
            if set(subtypes) >= set(pattern) and (
                    set(subtypes) - set(pattern) <= {
                        "nopass", "p3inf", "poetsyncperf", "optsyncperf",
                        "alwayssyncperf"
                    }):
                pagemsg(
                    "Subtypes=%s of conjugation template have extra, ignorable subtypes %s compared with pattern=%s of headword template: %s"
                    % (".".join(sorted(subtypes)), ".".join(
                        sorted(list(set(subtypes) - set(pattern)))), ".".join(
                            sorted(pattern)), render_headword_and_conj()))
            else:
                pagemsg(
                    "WARNING: Conjugation template has subtypes=%s but headword template has pattern=%s, skipping: %s"
                    % (".".join(sorted(subtypes)), ".".join(
                        sorted(pattern)), render_headword_and_conj()))
                continue

        # Fetch remaining params from headword template
        headword_params = []
        for param in la_verb_template.params:
            pname = unicode(param.name)
            if pname.strip() in [
                    "1", "2", "3", "4", "44", "conj", "c", "pattern"
            ] or re.search("^(head|inf|perf|sup)[0-9]*$", pname.strip()):
                continue
            headword_params.append((pname, param.value, param.showkey))
        # Erase all params
        del la_verb_template.params[:]
        # Copy params from conj template
        for param in la_conj_template.params:
            pname = unicode(param.name)
            la_verb_template.add(pname,
                                 param.value,
                                 showkey=param.showkey,
                                 preserve_spacing=False)
        # Copy remaining params from headword template
        for name, value, showkey in headword_params:
            la_verb_template.add(name,
                                 value,
                                 showkey=showkey,
                                 preserve_spacing=False)
        pagemsg("Replaced %s with %s" %
                (orig_la_verb_template, unicode(la_verb_template)))
        notes.append("convert {{la-verb}} params to new style")
        subsections[k] = unicode(parsed)

    if not saw_a_template:
        pagemsg("WARNING: Saw no verb headword or conjugation templates")

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
Example #17
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Italian",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    need_ref_section = False

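    # Walk the Pronunciation subsections, converting raw {{IPA|it}} templates
    # to {{it-pr}} respellings and cross-checking any {{rhymes|it}} template
    # against the generated respellings.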
    for k in xrange(2, len(subsections), 2):
        if "==Pronunciation==" in subsections[k - 1]:
            parsed = blib.parse_text(subsections[k])

            all_pronun_templates = []
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
                    all_pronun_templates.append(t)

            saw_it_pr = False
            pronun_based_respellings = []
            for t in parsed.filter_templates():
                origt = unicode(t)

                def tmsg(txt):
                    other_templates = []
                    for other_t in all_pronun_templates:
                        thist = unicode(other_t)
                        if thist != origt:
                            other_templates.append(thist)
                    pagemsg("%s: %s%s" % (txt, origt, ", other templates %s" %
                                          ", ".join(other_templates)
                                          if len(other_templates) > 0 else ""))

                tn = tname(t)
                if tn == "it-pr":
                    saw_it_pr = True
                    respellings = blib.fetch_param_chain(t, "1")
                    # FIXME, need to split on comma
                    pronun_based_respellings.extend(respellings)
                    break
                if tn == "IPA" and getparam(t, "1") == "it":
                    saw_it_pr = True
                    pronuns = blib.fetch_param_chain(t, "2")
                    this_phonemic_pronun = None
                    this_phonemic_respelling = None
                    this_phonetic_pronun = None
                    this_phonetic_respelling = None
                    respellings = []
                    all_warnings = []
                    hack_respelling_warnings = []
                    main_warnings = []
                    unable = [False]
                    for pronun in pronuns:
                        respelling = ipa_to_respelling(pronun)
                        respelling, this_hack_respelling_warnings = hack_respelling(
                            pagetitle, respelling)
                        hack_respelling_warnings.extend(
                            this_hack_respelling_warnings)

                        def set_unable(msg):
                            main_warnings.append(msg)
                            unable[0] = True

                        tmsg("For pronun %s, generated respelling %s" %
                             (pronun, respelling))
                        respelling_words = respelling.split(" ")
                        for rw in respelling_words:
                            if rw.endswith("-"):  # prefix
                                continue
                            hacked_rw = re.sub(
                                u".[\u0323\u0331]", "e", rw
                            )  # pretend vowels with secondary or no stress are 'e'
                            if not re.search(
                                    u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw) and len(
                                        re.sub("[^aeiouAEIOU]", "",
                                               hacked_rw)) > 1:
                                set_unable(
                                    "WARNING: For respelling %s for pronun %s, word %s is missing stress"
                                    % (respelling, pronun, rw))
                        if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$",
                                         respelling):
                            set_unable(
                                "WARNING: Strange char in respelling %s for pronun %s"
                                % (respelling, pronun))
                        else:
                            putative_pagetitle = re.sub(
                                u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])",
                                lambda m: vowel_respelling_to_spelling[m.group(
                                    1)] + m.group(2), respelling)
                            pagetitle_words = pagetitle.split(" ")
                            putative_pagetitle_words = putative_pagetitle.split(
                                " ")
                            if len(pagetitle_words) != len(
                                    putative_pagetitle_words):
                                set_unable(
                                    "WARNING: Page title has %s words but putative page title %s has %s words"
                                    %
                                    (len(pagetitle_words), putative_pagetitle,
                                     len(putative_pagetitle_words)))
                            else:
                                hacked_putative_pagetitle_words = []
                                for ptw, puptw in zip(
                                        pagetitle_words,
                                        putative_pagetitle_words):
                                    split_ptw = re.split("([Zz]+)", ptw)
                                    split_puptw = re.split(
                                        "([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                                    if len(split_ptw) != len(split_puptw):
                                        set_unable(
                                            "WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s"
                                            % (ptw, puptw))
                                        hacked_putative_pagetitle_words.append(
                                            puptw)
                                    else:
                                        parts = []
                                        for i in xrange(len(split_puptw)):
                                            if i % 2 == 0:
                                                parts.append(split_puptw[i])
                                            else:
                                                parts.append(split_ptw[i])
                                        hacked_putative_pagetitle_words.append(
                                            "".join(parts))
                                putative_pagetitle = " ".join(
                                    hacked_putative_pagetitle_words)
                                if putative_pagetitle != pagetitle:
                                    # If respelling already seen, we already warned about it.
                                    if respelling in respellings:
                                        assert unable[0]
                                    else:
                                        set_unable(
                                            "WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)"
                                            % (respelling, putative_pagetitle,
                                               pronun))

                        def append_respelling(respelling):
                            if respelling not in respellings:
                                respellings.append(respelling)

                        def append_warnings(warning):
                            if warning:
                                all_warnings.append(warning)
                            for warning in hack_respelling_warnings:
                                all_warnings.append(warning)
                            del hack_respelling_warnings[:]
                            for warning in main_warnings:
                                all_warnings.append(warning)
                            del main_warnings[:]

                        append_respelling(respelling)
                        if pronun.startswith("/"):
                            if this_phonemic_pronun is not None:
                                append_warnings(
                                    "WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun"
                                    % (this_phonemic_pronun,
                                       this_phonemic_respelling, pronun,
                                       respelling))
                            this_phonemic_pronun = pronun
                            this_phonemic_respelling = respelling
                            this_phonetic_pronun = None
                            this_phonetic_respelling = None
                        elif pronun.startswith("["):
                            if this_phonemic_pronun is None:
                                if this_phonetic_pronun is not None:
                                    unable[0] = True
                                    append_warnings(
                                        "WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun"
                                        % (this_phonetic_pronun,
                                           this_phonetic_respelling, pronun,
                                           respelling))
                                else:
                                    append_warnings(
                                        "WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun"
                                        % (pronun, respelling))
                                this_phonetic_pronun = pronun
                                this_phonetic_respelling = respelling
                            elif this_phonemic_respelling != respelling:
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)"
                                    %
                                    (this_phonemic_respelling,
                                     this_phonemic_pronun, respelling, pronun))
                            else:
                                if unable[0] and len(main_warnings) > 0:
                                    # `unable` could be set from a previous pronunciation but no main warnings this time around
                                    # because the previously generated warnings have already been appended to all_warnings.
                                    mesg = main_warnings[0]
                                    del main_warnings[0]
                                    append_warnings(mesg)
                                else:
                                    append_warnings(None)
                            this_phonemic_pronun = None
                            this_phonemic_respelling = None
                        else:
                            unable[0] = True
                            append_warnings(
                                "WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic"
                                % (pronun, respelling))
                    if this_phonemic_pronun is not None:
                        append_warnings(
                            "WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun"
                            % (this_phonemic_pronun, this_phonemic_respelling))
                    if not unable[0]:
                        for param in t.params:
                            pn = pname(param)
                            if not re.search("^[0-9]+$",
                                             pn) and pn != "nocount":
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Saw unrecognized param %s=%s" %
                                    (pn, unicode(param.value)))
                    manual_assist = ""
                    if unable[0]:
                        if pagetitle in ipa_directives:
                            respellings = ipa_directives[pagetitle]
                            unable[0] = False
                            manual_assist = " (manually assisted)"
                            tmsg(
                                "%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s"
                                % ("[MULTIPLE PRONUN TEMPLATES] "
                                   if len(all_pronun_templates) > 1 else "",
                                   "s" if len(respellings) > 1 else "",
                                   ",".join(respellings),
                                   " ||| ".join(all_warnings)))
                        else:
                            tmsg("%s<respelling> %s <end> %s" %
                                 ("[MULTIPLE PRONUN TEMPLATES] "
                                  if len(all_pronun_templates) > 1 else "",
                                  " ".join(respellings),
                                  " ||| ".join(all_warnings)))
                    if not unable[0]:
                        del t.params[:]
                        nextparam = 0
                        for param in respellings:
                            if "=" in param:
                                paramname, paramval = param.split("=", 1)
                            else:
                                nextparam += 1
                                paramname = str(nextparam)
                                paramval = param
                            if re.search("^n[0-9]*$", paramname):
                                need_ref_section = True
                            t.add(paramname, paramval)
                        blib.set_template_name(t, "it-pr")
                        notes.append(
                            "replace raw {{IPA|it}} with {{it-pr|%s}}%s" %
                            ("|".join(respellings), manual_assist))
                    pronun_based_respellings.extend(respellings)
                if unicode(t) != origt:
                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            subsections[k] = unicode(parsed)

            rhymes_template = None
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
                    if rhymes_template:
                        pagemsg(
                            "WARNING: Saw two {{rhymes|it}} templates: %s and %s"
                            % (unicode(rhymes_template), unicode(t)))
                    rhymes_template = t
            if rhymes_template:
                rhyme_based_respellings = []
                all_warnings = []

                def append_respelling(respelling):
                    if respelling not in rhyme_based_respellings:
                        rhyme_based_respellings.append(respelling)

                def append_warnings(warning):
                    all_warnings.append(warning)

                rhymes = blib.fetch_param_chain(rhymes_template, "2")
                unable = False
                for rhy in rhymes:
                    spellings = rhyme_to_spelling(rhy)
                    matched = False
                    bad_rhyme_msgs = []
                    for ending, ending_respelling in spellings:
                        if pagetitle.endswith(ending):
                            prevpart = pagetitle[:-len(ending)]
                            respelling = prevpart + ending_respelling
                            saw_oso_ese = False
                            if ending_respelling == u"óso":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"ó[s]o")
                            elif ending_respelling == u"ése":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"é[s]e")
                            else:
                                if respelling.endswith(u"zióne"):
                                    new_respelling = re.sub(
                                        u"zióne$", u"tsióne", respelling)
                                    pagemsg(
                                        "Replaced respelling '%s' with '%s'" %
                                        (respelling, new_respelling))
                                    respelling = new_respelling
                                    prevpart = respelling[:-len(
                                        ending)] + ending_respelling
                                append_respelling(respelling)
                            if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)",
                                          prevpart.lower())
                                    or not saw_oso_ese and re.search(
                                        u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]",
                                        ending_respelling.lower())):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s"
                                    % rhy)
                                unable = True
                                break
                            if "z" in prevpart:
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to z in part before rhyme: %s"
                                    % rhy)
                                unable = True
                                break
                            hacked_prevpart = re.sub("([gq])u", r"\1w",
                                                     prevpart)
                            hacked_prevpart = hacked_prevpart.replace(
                                "gli", "gl")
                            hacked_prevpart = re.sub("([cg])i", r"\1",
                                                     hacked_prevpart)
                            if re.search("[^aeiou][iu]([aeiou]|$)",
                                         hacked_prevpart.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s"
                                    % rhy)
                                unable = True
                                break
                            if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)",
                                         respelling.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to falling diphthong in -i: %s"
                                    % rhy)
                                unable = True
                                break
                            matched = True
                            break
                        else:
                            bad_rhyme_msgs.append(
                                "WARNING: Unable to match rhyme %s, spelling %s, respelling %s"
                                % (rhy, ending, ending_respelling))
                    if not matched and not unable and bad_rhyme_msgs:
                        for bad_rhyme_msg in bad_rhyme_msgs:
                            pagemsg(bad_rhyme_msg)
                if rhyme_based_respellings:
                    if not saw_it_pr:
                        manual_assist = ""
                        if pagetitle in rhyme_directives:
                            rhyme_based_respellings = rhyme_directives[
                                pagetitle]
                            manual_assist = " (manually assisted)"
                            pagemsg(
                                "Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s"
                                % ("s" if len(rhyme_based_respellings) > 1 else
                                   "", ",".join(rhyme_based_respellings),
                                   " ||| ".join(all_warnings),
                                   unicode(rhymes_template)))
                            subsections[k] = "* {{it-pr|%s}}\n" % ",".join(
                                rhyme_based_respellings) + subsections[k]
                            notes.append(
                                "add Italian rhyme-based respelling%s %s%s" %
                                ("s" if len(rhyme_based_respellings) > 1 else
                                 "", ",".join(rhyme_based_respellings),
                                 manual_assist))
                        else:
                            different_headers = []
                            for pos in [
                                    "Noun", "Verb", "Adjective", "Adverb",
                                    "Participle"
                            ]:
                                if "==%s==" % pos in secbody:
                                    different_headers.append(pos)
                            if len(different_headers) > 1:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple headers %s seen" %
                                    ",".join(different_headers)
                                ]
                            if "Etymology 1" in secbody:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple etymologies seen"
                                ]

                            pagemsg(
                                "<respelling> all: %s <end>%s: <from> %s <to> %s <end>"
                                % (" ".join(rhyme_based_respellings), " " +
                                   " ||| ".join(all_warnings) if all_warnings
                                   else "", unicode(rhymes_template),
                                   unicode(rhymes_template)))
                    else:
                        for respelling in rhyme_based_respellings:
                            if (not re.search("^qual[0-9]*=", respelling)
                                    and pronun_based_respellings and respelling
                                    not in pronun_based_respellings):
                                pagemsg(
                                    "WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s"
                                    %
                                    (" (with problems)" if
                                     len(all_warnings) > 0 else "", respelling,
                                     ",".join(pronun_based_respellings),
                                     ": %s" % " ||| ".join(all_warnings)
                                     if len(all_warnings) > 0 else ""))

    if need_ref_section:
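        # Prefer adding <references /> to an existing ===References=== section;
        # otherwise create a new one before any trailing Anagrams or
        # Further reading section.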
        for k in xrange(len(subsections) - 1, 2, -2):
            if re.search(r"^===\s*References\s*===$",
                         subsections[k - 1].strip()):
                if not re.search(r"<references\s*/?\s*>", subsections[k]):
                    subsections[k] = subsections[k].rstrip(
                        "\n") + "\n<references />\n\n"
                    notes.append(
                        "add <references /> to existing ===References=== section for pronunciation refs"
                    )
                break
        else:  # no break
            for k in xrange(len(subsections) - 1, 2, -2):
                if not re.search(r"==\s*(Anagrams|Further reading)\s*==",
                                 subsections[k - 1]):
                    subsections[k + 1:k + 1] = [
                        "===References===\n", "<references />\n\n"
                    ]
                    notes.append(
                        "add new ===References=== section for pronunciation refs"
                    )
                    break
            else:  # no break
                pagemsg(
                    "WARNING: Something wrong, couldn't find location to insert ===References=== section"
                )

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Example No. 18
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head, subsection_with_declts, pagemsg):
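  # Convert an old-style German noun headword template ({{de-noun}} or
  # {{de-proper noun}} with numbered gender/genitive/plural/diminutive params),
  # together with any accompanying declension templates, into the new
  # spec-based format ({{de-noun|SPEC}} / {{de-ndecl|SPEC}}).  Returns a list
  # of change notes; bails out early on anything it can't handle automatically.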
  notes = []

  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
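    # Pull gender, genitive and plural information out of each
    # {{de-decl-noun-*}} template and merge it into a single declension spec:
    # the genders joined with ":", then optional ",genitive" and ",plural"
    # parts (left empty when they match the defaults), plus ".weak"/".sg"/".ss"
    # markers.  Returns (declspec, genders, genitives, plurals), or None if
    # anything is unrecognized or inconsistent.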
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s" % (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s"
              % (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s"
              % (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))

    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
      len(all_decl_gens) != len(first_decl_gens) or len(all_decl_pls) != len(first_decl_pls)
    ):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s"
          % declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s"
          % declts_to_unicode(declts))
      return None

    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)

    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
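      # Append one part (genitive or plural) to declspec: just a comma when the
      # declension forms equal the default forms for the page title, otherwise
      # the analyzed forms, preferring the headword's analysis when both sets
      # of forms agree but were analyzed differently (e.g. different ordering).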
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s"
                % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts), headword_part_forms, all_decl_part_forms,
                  declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing"
              % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec

    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls

  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3", "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" % (unicode(headt),
      declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes

  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s" % declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
  if ss:
    if not pagetitle.endswith(u"ß"):
      pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s" % declts_to_unicode(declts))
      return
    # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we add .ss to the
    # headword and declension specs.
    pagetitle = re.sub(u"ß$", "ss", pagetitle)

  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    if pn not in ["1", "2", "3", "4", "m", "f", "old"] and not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and (
        not adjectival or pn != "head"):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return

  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s"
        % declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (gender, "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full, "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]
        def analyze_headword_parts_for_noun(parts, desc):
          noun_headword_parts = []
          for part in parts:
            m = re.search("^([^ ]+) ([^ ]+)$", part.strip())
            if not m:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s"
                  % (desc, part, unicode(headt), unicode(declt)))
              return []
            part_adj, part_noun = m.groups()
            noun_headword_parts.append(part_noun)
          return noun_headword_parts
        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")

        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen) for expected_gen in expected_gens for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s"
            % (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
              % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent", "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than the above. Often head= is wrong.
        # Try to update adjective and noun links from head= if given.
        #head = getparam(headt, "head")
        #if head:
        #  m = re.search("^([^ ]*) ([^ ]*)$", head)
        #  if not m:
        #    pagemsg("WARNING: Can't parse head=%s for adjective-noun combination, continuing: head=%s, decl=%s"
        #        % (head, unicode(headt), unicode(declt)))
        #  else:
        #    head_adj_link, head_noun_link = m.groups()
        #    m = re.search(r"\[\[([^][]*)\|([^][]*)\]\]$", head_adj_link)
        #    if m:
        #      adj_link_lemma, adj_link_form = m.groups()
        #      if adj_link_form.startswith(adj_link_lemma):
        #        head_adj_link = "[[%s]]%s" % (adj_link_lemma, adj_link_form[len(adj_link_lemma):])
        #    if head_adj_link != adj_link:
        #      pagemsg("NOTE: Head-derived adjective link %s not same as decl-template-derived adjective link %s, using the former: head=%s, decl=%s"
        #          % (head_adj_link, adj_link, unicode(headt), unicode(declt)))
        #      adj_link = head_adj_link
        #    if head_noun_link != noun_link:
        #      pagemsg("NOTE: Head-derived noun link %s not same as decl-template-derived noun link %s, using the former: head=%s, decl=%s"
        #          % (head_noun_link, noun_link, unicode(headt), unicode(declt)))
        #      noun_link = head_noun_link
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return

  else: # not adjectival
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"])
          and len(pls) == 1 and (pls[0] == "-" or any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"

  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")

  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    if headspec != declspec:
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), unicode(headt), unicode(declt)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))

          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
              ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))
        return

  if is_proper:
    headspec = headspec.replace(".sg", "")
    if is_both:
      if ".ss" in headspec:
        headspec = headspec.replace(".ss", ".both.ss")
      else:
        headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)

  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head], unicode(headt), newheadt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts], declts_existing, newdeclt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext

  return notes
Example No. 19
def process_line(index, line, online):
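    # Parse a "WARNING: Saw noun headword template" line from an earlier run,
    # reconstruct the corresponding {{la-ndecl}} spec from the old-style
    # {{la-noun}}/{{la-proper noun}} headword, and (when online) generate the
    # declension forms and compare the headword genitives against them.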
    global args
    line = line.strip()
    m = re.search(
        r"^Page [0-9]+ (.*?): WARNING: Saw noun headword template.*: (\{\{la-(?:proper )?noun\|.*?\}\})$",
        line)
    if not m:
        msg("Unrecognized line, skipping: %s" % line)
        return
    pagetitle, noun_headword_template = m.groups()

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    t = blib.parse_text(noun_headword_template).filter_templates()[0]
    if getparam(t, "indecl"):
        pagemsg("Skipping indeclinable noun: %s" % unicode(t))
        return
    lemma = blib.fetch_param_chain(t, ["1", "head", "head1"],
                                   "head") or [pagetitle]
    genitive = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
    noun_gender = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
    noun_decl = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")
    if " " in lemma[0]:
        pagemsg("WARNING: Space in lemma %s, skipping: %s" %
                (lemma[0], unicode(t)))
        return
    if len(lemma) > 1:
        pagemsg("WARNING: Multiple lemmas %s, skipping: %s" %
                (",".join(lemma), unicode(t)))
        return
    lemma = lemma[0]
    noun_decl_to_decl_type = {
        "first": "1",
        "second": "2",
        "third": "3",
        "fourth": "4",
        "fifth": "5",
        "irregular": "irreg",
    }
    if len(noun_decl) == 0:
        pagemsg("WARNING: No declension, skipping: %s" % unicode(t))
        return
    if len(noun_decl) > 1:
        pagemsg("WARNING: Multiple decls %s, skipping: %s" %
                (",".join(noun_decl), unicode(t)))
        return
    noun_decl = noun_decl[0]
    if noun_decl not in noun_decl_to_decl_type:
        pagemsg("WARNING: Unrecognized declension %s, skipping: %s" %
                (noun_decl, unicode(t)))
        return
    decl_type = noun_decl_to_decl_type[noun_decl]
    if decl_type in ["1", "2", "4", "5"]:
        la_ndecl = "{{la-ndecl|%s<%s>}}" % (lemma, decl_type)
    elif decl_type == "3":
        if len(genitive) == 0:
            pagemsg(
                "WARNING: No genitives with decl 3 lemma %s, skipping: %s" %
                (lemma, unicode(t)))
            return
        elif len(genitive) > 1:
            pagemsg(
                "WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s"
                % (",".join(genitive), lemma, unicode(t)))
            return
        else:
            gen1 = genitive[0]
            if gen1.endswith("is"):
                stem = gen1[:-2]
                if lalib.infer_3rd_decl_stem(lemma) == stem:
                    la_ndecl = "{{la-ndecl|%s<3>}}" % lemma
                else:
                    la_ndecl = "{{la-ndecl|%s/%s<3>}}" % (lemma, stem)
            elif gen1.endswith("ium"):
                if lemma.endswith("ia"):
                    la_ndecl = "{{la-ndecl|%s<3.pl>}}" % lemma
                elif lemma.endswith(u"ēs"):
                    la_ndecl = "{{la-ndecl|%s<3.I.pl>}}" % lemma
                else:
                    pagemsg(
                        "WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s"
                        % (lemma, unicode(t)))
                    return
            elif gen1.endswith("um"):
                if lemma.endswith("a") or lemma.endswith(u"ēs"):
                    la_ndecl = "{{la-ndecl|%s<3.pl>}}" % lemma
                else:
                    pagemsg(
                        "WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s"
                        % (lemma, unicode(t)))
                    return
            else:
                pagemsg(
                    "WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s"
                    % (gen1, lemma, unicode(t)))
                return
    elif decl_type == "irreg":
        pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" %
                unicode(t))
        return
    else:
        pagemsg(
            "WARNING: Something wrong, unrecognized decl_type %s, skipping: %s"
            % (decl_type, unicode(t)))
        return
    pagemsg("For noun %s, declension %s" % (unicode(t), la_ndecl))
    if online:
        noun_props = convert_la_headword_noun.new_generate_noun_forms(
            la_ndecl, errandpagemsg, expand_text)
        if noun_props is None:
            return
        convert_la_headword_noun.compare_headword_decl_forms(
            "genitive",
            genitive, ["gen_sg", "gen_pl"],
            noun_props,
            "headword=%s, decl=%s" % (unicode(t), la_ndecl),
            pagemsg,
            adjust_for_missing_gen_forms=True,
            remove_headword_links=True)
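
# Illustration (hypothetical log line, not taken from an actual run) of the
# input process_line() expects and what it derives from it:
#   Page 1234 aqua: WARNING: Saw noun headword template on page: {{la-noun|aqua|aquae|f|first}}
# would be parsed into pagetitle "aqua" and converted to {{la-ndecl|aqua<1>}}.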
def process_page_section(index, page, section, verbose):
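  # Strip g=/m=/f= params from {{ru-noun-table}}, then rebuild the
  # {{ru-noun+}}/{{ru-proper noun+}} headword from the cleaned declension
  # template (adjusting n= for proper nouns via {{ru-generate-noun-args}}).
  # Returns (new section text, cleaned, link_copied, noun_changed,
  # proper_noun_changed), or None to skip the section.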
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = blib.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = blib.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword:
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
    
  #genders = runounlib.check_old_noun_headword_forms(headword_template, args,
  #    subpagetitle, pagemsg)
  #if genders == None:
  #  return None

  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))

  #params_to_preserve = runounlib.fix_old_headword_params(headword_template,
  #    new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
Example No. 21
def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
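    # Return the headword(s) of a Latin headword template as a list of strings.
    # For {{la-adj}}, {{la-noun}}, {{la-verb}} and related templates whose first
    # argument is a declension/conjugation spec, expand the corresponding
    # {{la-generate-*-forms}} call and extract the linked lemma slot from the
    # result; otherwise fall back to explicit lemma=/head= params or the page
    # name.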
    if not expand_text:

        def expand_text(tempcall):
            return blib.expand_text(tempcall, pagename, pagemsg, False)

    tn = tname(t)
    if tn in [
            "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det",
            "la-pronoun"
    ]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            retval = getparam(t, "1")
            if "<" in retval or "((" in retval or " " in retval or "-" in retval:
                generate_template = blib.parse_text(
                    unicode(t)).filter_templates()[0]
                blib.set_template_name(generate_template,
                                       "la-generate-adj-forms")
                blib.remove_param_chain(generate_template, "comp", "comp")
                blib.remove_param_chain(generate_template, "sup", "sup")
                blib.remove_param_chain(generate_template, "adv", "adv")
                blib.remove_param_chain(generate_template, "lemma", "lemma")
                rmparam(generate_template, "type")
                # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
                rmparam(generate_template, "indecl")
                rmparam(generate_template, "id")
                rmparam(generate_template, "pos")
                result = expand_text(unicode(generate_template))
                if not result:
                    pagemsg("WARNING: Error generating forms, skipping")
                    retval = ""
                else:
                    args = blib.split_generate_args(result)
                    if "linked_nom_sg_m" in args:
                        retval = args["linked_nom_sg_m"]
                    elif "linked_nom_pl_m" in args:
                        retval = args["linked_nom_pl_m"]
                    else:
                        pagemsg(
                            "WARNING: Can't locate lemma in {{la-generate-adj-forms}} result: generate_template=%s, result=%s"
                            % (unicode(generate_template), result))
                        retval = ""
                    retval = retval.split(",")
            else:
                retval = re.sub("/.*", "", retval)
    elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-noun-forms")
            blib.remove_param_chain(generate_template, "lemma", "lemma")
            blib.remove_param_chain(generate_template, "m", "m")
            blib.remove_param_chain(generate_template, "f", "f")
            blib.remove_param_chain(generate_template, "g", "g")
            rmparam(generate_template, "type")
            # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
            rmparam(generate_template, "indecl")
            rmparam(generate_template, "id")
            rmparam(generate_template, "pos")
            result = expand_text(unicode(generate_template))
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                retval = ""
            else:
                args = blib.split_generate_args(result)
                if "linked_nom_sg" in args:
                    retval = args["linked_nom_sg"]
                elif "linked_nom_pl" in args:
                    retval = args["linked_nom_pl"]
                else:
                    pagemsg(
                        "WARNING: Can't locate lemma in {{la-generate-noun-forms}} result: generate_template=%s, result=%s"
                        % (unicode(generate_template), result))
                    retval = ""
                retval = retval.split(",")
    elif tn in ["la-verb", "la-suffix-verb"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-verb-forms")
            rmparam(generate_template, "id")
            result = expand_text(unicode(generate_template))
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                retval = ""
            else:
                args = blib.split_generate_args(result)
                for slot in [
                        "linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
                        "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"
                ]:
                    if slot in args:
                        retval = args[slot]
                        break
                else:
                    # no break
                    pagemsg(
                        "WARNING: Can't locate lemma in {{la-generate-verb-forms}} result: generate_template=%s, result=%s"
                        % (unicode(generate_template), result))
                    retval = ""
                retval = retval.split(",")
    elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
            tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
        retval = getparam(t, "1")
    elif tn == "la-letter":
        retval = pagename
    elif tn in ["head", "la-prep"]:
        retval = blib.fetch_param_chain(t, "head", "head")
    elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
        retval = blib.fetch_param_chain(t, "1", "head")
    else:
        pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
        retval = ""
    retval = retval or pagename
    if type(retval) is not list:
        retval = [retval]
    return retval
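# (Typical call, sketched: la_get_headword_from_template(t, pagetitle, pagemsg)
# returns a list of headwords, falling back to the page name when the template
# carries no usable head.)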
Example No. 22
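 # handle_mf: for {{es-noun}}, replace explicit masculine/feminine equivalents
 # (m=/f= and their plurals) with '+' or a '+special' indicator whenever they
 # match the forms the template would generate by default, and drop plural
 # params made redundant by the replacement.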
 def handle_mf(mf, mf_full, make_mf):
     mfs = blib.fetch_param_chain(t, mf, mf)
     mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
     if mfs and not any(x.startswith("+") for x in mfs):
         defmf = make_mf(lemma)
         if set(mfs) == {defmf}:
             defpls = make_plural(defmf)
             ok = False
             if not mfpls or set(mfpls) == set(defpls):
                 ok = True
             elif set(mfpls) < set(defpls):
                 pagemsg(
                     "WARNING: %pl=%s subset of default=%s, allowing"
                     % (mf, ",".join(mfpls), ",".join(defpls)))
                 ok = True
             if ok:
                 notes.append(
                     "replace %s=%s with '+' in {{es-noun}}" %
                     (mf, ",".join(mfs)))
                 blib.set_param_chain(t, ["+"], mf, mf)
                 blib.remove_param_chain(t, mf + "pl", mf + "pl")
                 return
         actual_special = None
         for special in all_specials:
             special_mf = make_mf(lemma, special)
             if special_mf is None:
                 continue
             if mfs == [special_mf]:
                 pagemsg("Found special=%s with special_mf=%s" %
                         (special, special_mf))
                 actual_special = special
                 break
         if actual_special:
             if not mfpls:
                 pagemsg(
                     "WARNING: Explicit %s=%s matches special=%s but no %s plural"
                     % (mf, ",".join(mfs), actual_special, mf_full))
             else:
                 special_mfpl = make_plural(special_mf,
                                            actual_special)
                 if special_mfpl:
                     if len(special_mfpl) > 1 and set(mfpls) < set(
                             special_mfpl):
                         pagemsg(
                             "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf,
                                ",".join(special_mfpl)))
                     elif set(mfpls) == set(special_mfpl):
                         pagemsg(
                             "Found %s=%s and special=%s, %spls=%s matches special_%spl"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf))
                     else:
                         pagemsg(
                             "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf,
                                ",".join(special_mfpl)))
                         actual_special = None
             if actual_special:
                 notes.append(
                     "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural"
                     % (mf_full, ",".join(mfs), actual_special,
                        mf_full))
                 blib.set_param_chain(t, ["+%s" % actual_special],
                                      mf, mf)
                 blib.remove_param_chain(t, mf + "pl", mf + "pl")
         if not actual_special:
             defmf = make_mf(lemma)
             mfs_with_def = ["+" if x == defmf else x for x in mfs]
             if mfs_with_def != mfs:
                 notes.append(
                     "replace default %s %s with '+' in {{es-noun}}"
                     % (mf_full, defmf))
                 blib.set_param_chain(t, mfs_with_def, mf, mf)
             if mfpls:
                 defpl = [
                     x for y in mfs for x in (make_plural(y) or [])
                 ]
                 ok = False
                 if set(defpl) == set(mfpls):
                     ok = True
                 elif len(defpl) > 1 and set(mfpls) < set(defpl):
                     pagemsg(
                         "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing"
                         % (mf, ",".join(mfs), mf, ",".join(mfpls),
                            ",".join(defpl)))
                     ok = True
                 if ok:
                     pagemsg(
                         "Found %s=%s, %spl=%s matches default pl" %
                         (mf, ",".join(mfs), mf, ",".join(mfpls)))
                     notes.append(
                         "remove redundant explicit %s plural %s in {{es-noun}}"
                         % (mf_full, ",".join(mfpls)))
                     blib.remove_param_chain(
                         t, mf + "pl", mf + "pl")
                 else:
                     for special in all_specials:
                         defpl = [
                             x for y in mfs for x in (
                                 make_plural(y, special) or [])
                         ]
                         if set(defpl) == set(mfpls):
                             pagemsg(
                                 "Found %s=%s, %spl=%s matches special=%s"
                                 % (mf, ",".join(mfs), mf,
                                    ",".join(mfpls), special))
                             notes.append(
                                 "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}"
                                 % (mf_full, ",".join(mfpls),
                                    special))
                             blib.set_param_chain(
                                 t, ["+%s" % special], mf + "pl",
                                 mf + "pl")
Example No. 23
        return True

    def compare_genders(g1, g2):
        if set(g1) == set(g2):
            return True
        if len(g1) == 1 and len(g2) == 1:
            # If genders don't match exactly, check if existing gender is missing
            # animacy and allow that, so it gets overwritten with new gender
            if g1[0] == re.sub("-(an|in)", "", g2[0]):
                pagemsg(
                    "Existing gender %s missing animacy spec compared with proposed %s, allowed"
                    % (",".join(g1), ",".join(g2)))
                return True
        return None

    headwords = blib.fetch_param_chain(headword_template, "1", "head",
                                       subpagetitle)
    translits = blib.fetch_param_chain(headword_template, "tr", "tr")
    for i in xrange(len(translits)):
        if len(headwords) <= i:
            pagemsg(
                "WARNING: Not enough headwords for translit tr%s=%s, skipping"
                % ("" if i == 0 else str(i + 1), translits[i]))
            return None
        else:
            headwords[i] += "//" + translits[i]
    genitives = blib.fetch_param_chain(headword_template, "3", "gen")
    plurals = blib.fetch_param_chain(headword_template, "4", "pl")
    genders = blib.fetch_param_chain(headword_template, "2", "g")
    cases_to_check = None
    if args["n"] == "s":
        if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True)
Example No. 24
def process_page(page, index, fixdirecs):
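    # Look for imperfective Russian verbs conjugated with {{ru-conj}} that lack
    # a past passive participle and have no aspectual partner starting with the
    # same two letters, then apply the per-page directive from fixdirecs:
    # append "+p" to the conjugation type, set ppp=-, or just log the state.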
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    saw_paired_verb = False
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-verb":
            saw_paired_verb = False
            if getparam(t, "2") in ["impf", "both"]:
                verb = getparam(t, "1") or pagetitle
                pfs = blib.fetch_param_chain(t, "pf", "pf")
                impfs = blib.fetch_param_chain(t, "impf", "impf")
                for otheraspect in pfs + impfs:
                    if verb[0:2] == otheraspect[0:2]:
                        saw_paired_verb = True
        if (unicode(t.name) in ["ru-conj", "ru-conj-old"]
                and getparam(t, "1") == "impf" and not saw_paired_verb):
            if getparam(t, "ppp") or getparam(t, "past_pasv_part"):
                pass
            elif [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
                        unicode(t))
                pass
            elif re.search(r"\+p|\[?\([78]\)\]?", getparam(t, "2")):
                pass
            else:
                pagemsg(
                    "Apparent unpaired transitive imperfective without PPP")
                if pagetitle in fixdirecs:
                    direc = fixdirecs[pagetitle]
                    assert direc in [
                        "fixed", "paired", "intrans", "+p", "|ppp=-"
                    ]
                    origt = unicode(t)
                    if direc == "+p":
                        t.add("2", getparam(t, "2") + "+p")
                        notes.append(
                            "add missing past passive participle to transitive unpaired imperfective verb"
                        )
                        pagemsg("Add missing PPP, replace %s with %s" %
                                (origt, unicode(t)))
                    elif direc == "|ppp=-":
                        t.add("ppp", "-")
                        notes.append(
                            "note transitive unpaired imperfective verb as lacking past passive participle"
                        )
                        pagemsg("Note no PPP, replace %s with %s" %
                                (origt, unicode(t)))
                    elif direc == "paired":
                        pagemsg("Verb actually is paired")
                    elif direc == "fixed":
                        pagemsg("WARNING: Unfixed verb marked as fixed")
                    elif direc == "intrans":
                        pagemsg("WARNING: Transitive verb marked as intrans")

    return unicode(parsed), notes
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
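  # Add the given related adjectives or diminutives to the `param` chain of the
  # Russian noun headword on nounpage, and remove those same terms from any
  # Derived terms/Related terms subsections so they aren't duplicated.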
  notes = []
  pagetitle = unicode(nounpage.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(nounpage.text)
  retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  head = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
      if head:
        pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" %
            (unicode(head), unicode(t), desc, ",".join(new_adj_or_dims)))
        return
      head = t
  if not head:
    pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims)))
    return
  orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
  adjs_or_dims = blib.fetch_param_chain(head, param, param)
  added_adjs_or_dims = []
  for adj_or_dim in new_adj_or_dims:
    if adj_or_dim in adjs_or_dims:
      pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head)))
    else:
      adjs_or_dims.append(adj_or_dim)
      added_adjs_or_dims.append(adj_or_dim)
  if adjs_or_dims != orig_adjs_or_dims:
    orighead = unicode(head)
    blib.set_param_chain(head, adjs_or_dims, param, param)
    pagemsg("Replaced %s with %s" % (orighead, unicode(head)))
    notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
    secbody = unicode(parsed)
  subsecs = re.split("(^==.*==\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsecs), 2):
    if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
      header = re.sub("=", "", subsecs[k - 1]).strip()
      for adj_or_dim in adjs_or_dims:
        def note_removed_text(m):
          if m.group(1):
            pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" %
                (header, desc, adj_or_dim, m.group(0)))
          return ""
        newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % adj_or_dim, note_removed_text, subsecs[k])
        if newsubsecsk != subsecs[k]:
          notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
        subsecs[k] = newsubsecsk
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        # Repeat in case adjacent terms removed (unlikely though).
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M)
      if re.search(r"^\s*$", subsecs[k]):
        subsecs[k] = ""
        subsecs[k - 1] = ""
  secbody = "".join(subsecs)
  secj = secbody + sectail
  newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
  if newsecj != secj and not notes:
    notes.append("eliminate sequences of 3 or more newlines")
  secj = newsecj
  sections[j] = secj
  return "".join(sections), notes
Example No. 26
def process_page(index, page, save, verbose, adverbs, all_derived_lemmas):
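    # For a Russian base lemma, build candidate derived terms by stripping one
    # of the known endings and attaching adjective/noun (and, when adverbs is
    # set, adverb) suffixes, keep the candidates that actually occur in
    # all_derived_lemmas, and then look up their Russian sections.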
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    # ending and whether final consonant is palatal
    endings = [
        (u"ывать", False),
        (u"ивать", False),
        (u"ать", False),
        (u"ять", True),
        (u"еть", True),
        (u"ить", True),
        (u"нуть", False),
        (u"ия", True),
        (u"ие", True),
        (u"я", True),
        (u"е", True),
        (u"ь", True),
        (u"и", True),
        (u"а", False),
        (u"о", False),
        (u"ы", False),
        (u"ый", False),
        (u"ий", True),
        (u"ой", False),
    ]
    stems = []
    for ending, is_palatal in endings:
        if pagetitle.endswith(ending):
            stem = re.sub(ending + "$", "", pagetitle)
            stems.append((stem, is_palatal))
    if not stems:
        stems.append((pagetitle, False))
    possible = []

    def append_possible(stem_to_try, suffix):
        possible.append((stem_to_try.lower() + suffix, suffix))

    # Try -ный/-ной, -ка, -ко
    for stem, palatal in stems:
        stems_to_try = []

        def frob(stem):
            stem = first_palatalization(stem)
            if stem.endswith(u"л"):
                stem += u"ь"
            if re.search("[" + rulib.vowel + "]$", stem):
                stem += u"й"
            return stem

        to_try_1 = frob(stem)
        to_try_2 = rulib.dereduce_stem(stem, False)
        if to_try_2:
            to_try_2 = frob(rulib.remove_accents(to_try_2))
        to_try_3 = rulib.dereduce_stem(stem, True)
        if to_try_3:
            to_try_3 = frob(rulib.remove_accents(to_try_3))
        stems_to_try.append(to_try_1)
        if to_try_2:
            stems_to_try.append(to_try_2)
        if to_try_3 and to_try_3 != to_try_2:
            stems_to_try.append(to_try_3)
        for stem_to_try in stems_to_try:
            append_possible(stem_to_try, u"ный")
            append_possible(stem_to_try, u"ной")
            append_possible(stem_to_try, u"ский")
            append_possible(stem_to_try, u"ской")
            append_possible(stem_to_try, u"ник")
            append_possible(stem_to_try, u"чик")
            append_possible(stem_to_try, u"щик")
            append_possible(stem_to_try, u"ка")
            append_possible(stem_to_try, u"ко")
            append_possible(stem_to_try, u"ство")
    # Try -овый/-евый/-ёвый/-овой/-евой, -ик, -ок/-ек/-ёк
    for stem, palatal in stems:
        stems_to_try = []
        stems_to_try.append(stem)
        reduced = rulib.reduce_stem(stem)
        if reduced:
            stems_to_try.append(reduced)
        for stem_to_try in stems_to_try:
            if stem_to_try.endswith(u"й"):
                stem_to_try = stem_to_try[:-1]
            append_possible(stem_to_try, u"овый")
            append_possible(stem_to_try, u"евый")
            append_possible(stem_to_try, u"ёвый")
            append_possible(stem_to_try, u"овой")
            append_possible(stem_to_try, u"евой")
            stem_to_try = first_palatalization(stem_to_try)
            append_possible(stem_to_try, u"еский")
            append_possible(stem_to_try, u"ический")
            append_possible(stem_to_try, u"ество")
            append_possible(stem_to_try, u"ик")
            append_possible(stem_to_try, u"ок")
            append_possible(stem_to_try, u"ек")
            append_possible(stem_to_try, u"ёк")
            append_possible(stem_to_try, u"ец")
    # If derived adverbs, try -о, -е, -и
    if adverbs:
        for stem, palatal in stems:
            stems_to_try = []
            stems_to_try.append(stem)
            for stem_to_try in stems_to_try:
                append_possible(stem_to_try, u"о")
                append_possible(stem_to_try, u"е")
                append_possible(stem_to_try, u"и")

    would_output = False
    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            would_output = True
    if not would_output:
        return

    text = unicode(page.text)

    if rulib.check_for_alt_yo_terms(text, pagemsg):
        return

    base_lemmas = []

    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            derived_section = blib.find_lang_section(possible_derived,
                                                     "Russian", pagemsg,
                                                     errandpagemsg)
            if not derived_section:
                errandpagemsg(
                    "WARNING: Couldn't find Russian section for derived term %s"
                    % possible_derived)
                continue
            if "==Etymology" in derived_section:
                pagemsg(
                    "Skipping derived term %s because it already has an etymology"
                    % possible_derived)
                continue
            derived_defns = rulib.find_defns(derived_section)
            if not derived_defns:
                errandpagemsg(
                    "WARNING: Couldn't find definitions for derived term %s" %
                    possible_derived)
                continue

            derived_parsed = blib.parse_text(derived_section)
            derived_lemmas = find_noun_lemmas(
                derived_parsed, possible_derived,
                errandpagemsg, lambda tempcall: blib.expand_text(
                    tempcall, possible_derived, pagemsg, verbose))
            for t in derived_parsed.filter_templates():
                if tname(t) in ["ru-adj", "ru-adv"]:
                    lemmas = blib.fetch_param_chain(t, "1", "head",
                                                    possible_derived)
                    trs = blib.fetch_param_chain(t, "tr", "tr")
                    if trs:
                        lemmas = [
                            "%s//%s" % (lemma, tr)
                            for lemma, tr in zip(lemmas, trs)
                        ]
                    for lemma in lemmas:
                        add_if_not(derived_lemmas, lemma)

            if not derived_lemmas:
                errandpagemsg("WARNING: No derived term lemmas for %s" %
                              possible_derived)
                return

            if not base_lemmas:
                base_parsed = blib.parse_text(text)
                base_lemmas = find_noun_lemmas(base_parsed, pagetitle,
                                               errandpagemsg, expand_text)

                for t in base_parsed.filter_templates():
                    if tname(t) in ["ru-verb", "ru-adj"]:
                        lemmas = blib.fetch_param_chain(
                            t, "1", "head", pagetitle)
                        trs = blib.fetch_param_chain(t, "tr", "tr")
                        if trs:
                            lemmas = [
                                "%s//%s" % (lemma, tr)
                                for lemma, tr in zip(lemmas, trs)
                            ]
                        for lemma in lemmas:
                            add_if_not(base_lemmas, lemma)

                if not base_lemmas:
                    errandpagemsg("WARNING: No base lemmas")
                    return

                base_lemmas = [
                    rulib.remove_monosyllabic_accents(x) for x in base_lemmas
                ]

                warnings = []
                if len(base_lemmas) > 1:
                    warnings.append("multiple-lemmas")
                if any("//" in lemma for lemma in base_lemmas):
                    warnings.append("translit-in-lemma")

                base_section = blib.find_lang_section_from_text(
                    text, "Russian", pagemsg)
                if not base_section:
                    errandpagemsg(
                        "WARNING: Couldn't find Russian section for base")
                    return

                base_defns = rulib.find_defns(base_section)
                if not base_defns:
                    errandpagemsg(
                        "WARNING: Couldn't find definitions for base")
                    return

            def concat_defns(defns):
                return ";".join(defns).replace("_", r"\u").replace(" ", "_")

            suffixes_with_stress = []
            for suf in [
                    suffix,
                    rulib.make_beginning_stressed_ru(suffix),
                    rulib.make_ending_stressed_ru(suffix)
            ]:
                for derived_lemma in derived_lemmas:
                    if derived_lemma.endswith(suf):
                        add_if_not(suffixes_with_stress, suf)
            msg("%s %s+-%s%s no-etym possible-suffixed %s //// %s" %
                (",".join(derived_lemmas), ",".join(base_lemmas),
                 ",".join(suffixes_with_stress),
                 " WARNING:%s" % ",".join(warnings) if warnings else "",
                 concat_defns(base_defns), concat_defns(derived_defns)))
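
# A minimal standalone sketch (separate from the bot) of the candidate-generation
# idea above: strip a recognized ending from the page title and attach
# adjective/noun-forming suffixes to build spellings that can then be looked up in
# the list of known derived lemmas. The ending and suffix lists are shortened, and
# simple_palatalize() is a simplified stand-in for the first_palatalization()
# helper used by the real script.
import re

sample_endings = [u"ать", u"ить", u"а", u"о"]
sample_suffixes = [u"ный", u"ский", u"ка", u"ник"]


def simple_palatalize(stem):
    # к/г/х -> ч/ж/ш before these suffixes (simplified assumption).
    mapping = {u"к": u"ч", u"г": u"ж", u"х": u"ш"}
    if stem and stem[-1] in mapping:
        return stem[:-1] + mapping[stem[-1]]
    return stem


def candidate_derived_terms(pagetitle):
    stems = [re.sub(ending + "$", "", pagetitle)
             for ending in sample_endings if pagetitle.endswith(ending)]
    stems = stems or [pagetitle]
    return [simple_palatalize(stem) + suffix
            for stem in stems for suffix in sample_suffixes]

# candidate_derived_terms(u"река") includes u"речка" (a real derived term) along
# with guesses that may not exist; the bot only keeps candidates found in
# all_derived_lemmas.
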
def process_page_section(index, page, section, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # Proper nouns need special handling of n=, because {{ru-proper noun+}}
  # defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
    
  #genders = runoun.check_old_noun_headword_forms(headword_template, args,
  #    subpagetitle, pagemsg)
  #if genders == None:
  #  return None

  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))

  #params_to_preserve = runoun.fix_old_headword_params(headword_template,
  #    new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
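
# A minimal standalone sketch of the parameter-filtering step above, done with
# mwparserfromhell directly (the objects returned by blib.parse_text() expose the
# same .params/.add() interface, so it presumably wraps this library; that is an
# assumption here, not something the snippet states). The template text is invented.
import re
import mwparserfromhell

headword = mwparserfromhell.parse(u"{{ru-noun+|приме́р|g=m-in|notrcat=1}}").filter_templates()[0]
filtered = mwparserfromhell.parse(u"{{ru-noun+}}").filter_templates()[0]
for param in headword.params:
  name = unicode(param.name)
  # Skip gender params (g, g2, ...), m=/f= counterparts and notrcat; keep the rest.
  if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
    continue
  filtered.add(param.name, param.value)
# unicode(filtered) should now render the template without the g=/notrcat= params.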
Exemplo n.º 28
def process_text_on_page(index, pagetitle, text):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    heads = None
    plurale_tantum = False
    animacy = "unknown"
    gender = "unknown"
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["be-noun", "be-proper noun"]:
            heads = blib.fetch_param_chain(t, "1", "head")
            gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
            plurale_tantum = False
            animacy = []
            gender = []
            if gender_and_animacy:
                for ga in gender_and_animacy:
                    gender_and_animacy_parts = ga.split("-")
                    g = gender_and_animacy_parts[0]
                    if g not in gender:
                        gender.append(g)
                    if len(gender_and_animacy_parts) > 1:
                        a = gender_and_animacy_parts[1]
                        if a not in animacy:
                            animacy.append(a)
                    if (len(gender_and_animacy_parts) > 2
                            and gender_and_animacy_parts[2] == "p"):
                        plurale_tantum = True
            if not animacy:
                animacy = "unknown"
            elif len(animacy) > 1:
                pagemsg("WARNING: Multiple animacies: %s" % ",".join(animacy))
            animacy = animacy[0]
            if not gender:
                gender = "unknown"
            elif set(gender) == {"m", "f"}:
                gender = "MF"
            else:
                if len(gender) > 1:
                    pagemsg("WARNING: Multiple genders: %s" % ",".join(gender))
                gender = gender[0]
                if gender in ["m", "f", "n"]:
                    gender = gender.upper()
                else:
                    pagemsg("WARNING: Unknown gender: %s" % gender)
                    gender = "unknown"

        def fetch(param):
            val = getparam(t, param).strip()
            val = blib.remove_links(val)
            vals = re.split(r",\s*", val)
            retval = []
            for v in vals:
                # Remove final footnote symbols, as per [[Module:table tools]]
                v = re.sub(
                    ur"[*~@#$%^&+0-9_\u00A1-\u00BF\u00D7\u00F7\u2010-\u2027\u2030-\u205E\u2070-\u20CF\u2100-\u2B5F\u2E00-\u2E3F]*$",
                    "", v)
                v = be.mark_stressed_vowels_in_unstressed_syllables(v, pagemsg)
                retval.append(be.add_monosyllabic_accent(v))
            return ", ".join(retval)

        def matches(is_end_stressed, should_be_end_stressed):
            return (is_end_stressed == "mixed"
                    or should_be_end_stressed is None
                    or is_end_stressed == should_be_end_stressed)

        def fetch_endings(param, endings):
            paramval = fetch(param)
            values = re.split(", *", paramval)
            found_endings = []
            for v in values:
                v = v.replace(be.AC, "")
                for ending in endings:
                    if v.endswith(ending):
                        found_endings.append(ending)
                        break
                else:  # no break
                    pagemsg(
                        "WARNING: Couldn't recognize ending for %s=%s: %s" %
                        (param, paramval, unicode(t)))
            return ":".join(found_endings)

        def canon(val):
            values = re.split(", *", val)
            return "/".join(
                be.undo_mark_stressed_vowels_in_unstressed_syllables(v)
                for v in values)

        def stress(endstressed):
            return ("endstressed" if endstressed == True else
                    "stemstressed" if endstressed == False else "mixed")

        def check_multi_stressed(maxparam):
            for i in xrange(1, maxparam + 1):
                val = getparam(t, str(i))
                vals = re.split(r",\s*", val)
                for v in vals:
                    if be.is_multi_stressed(v):
                        pagemsg(
                            "WARNING: Param %s=%s has multiple stresses: %s" %
                            (str(i), val, unicode(t)))
                    if be.needs_accents(v):
                        pagemsg("WARNING: Param %s=%s has missing stress: %s" %
                                (str(i), val, unicode(t)))

        def ins_sg_note(ins_sg):
            if re.search(u"[чшжрць]$", heads[0]) and gender == "F":
                return "ins_sg=%s " % canon(ins_sg)
            else:
                return ""

        def truncate_extra_forms(form):
            return re.sub(",.*", "", form)

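        # Infer animacy from the plural forms: for animate nouns the accusative
        # plural matches the genitive plural, for inanimate ones the nominative plural.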
        def infer_animacy(nom_pl, gen_pl, acc_pl):
            nom_pl_vals = set(nom_pl.split(", "))
            gen_pl_vals = set(gen_pl.split(", "))
            acc_pl_vals = set(acc_pl.split(", "))
            if acc_pl_vals == nom_pl_vals:
                return "in"
            elif acc_pl_vals == gen_pl_vals:
                return "an"
            else:
                pagemsg(
                    "WARNING: Can't infer animacy: nom_pl=%s, gen_pl=%s, acc_pl=%s"
                    % (nom_pl, gen_pl, acc_pl))
                return "unknown"

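        # Guess the gender from the shape of the lemma; return None when the
        # ending is ambiguous (e.g. -ь can be masculine or feminine).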
        def infer_gender(lemma):
            if re.search(u"[оеё]́?$", lemma) or re.search(u"мя́?$", lemma):
                return "N"
            elif re.search(u"[цс]тва$", lemma):
                return "N"
            elif re.search(u"[ая]́?$", lemma) or re.search(u"асць$", lemma):
                return "F"
            elif re.search(u"ь$", lemma):
                return None
            elif re.search(be.cons_c + "$", lemma):
                return "M"
            else:
                pagemsg("WARNING: Unrecognized lemma ending: %s" % lemma)
                return None

        def default_stress(lemma, gender, reducible):
            if re.search(u"я́$", lemma) and gender == "N":
                return "b"
            elif re.search(AC + "$", lemma):
                return "d"
            elif "*" in reducible and re.search(
                    u"[еоэаё]́" + be.cons_c + u"ь?$", lemma):
                return "b"
            else:
                return "a"

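        # Try to detect a vowel alternation between the singular and plural stems
        # (the "ae"/"ao"/"yo" codes used below); returns the code or None.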
        def infer_alternations(nom_sg, nom_pl):
            nom_sg = truncate_extra_forms(nom_sg)
            nom_pl = truncate_extra_forms(nom_pl)
            if re.search(u"^.*[аяеёо]́$", nom_sg):
                m = re.search(u"^(.*)[ыіая]$", nom_pl)
                if m:
                    pl_stem = m.group(1)
                    for valt in possible_vowel_alternations:
                        valt_nom_sg = be.apply_vowel_alternation(nom_sg, valt)
                        if valt_nom_sg:
                            valt_nom_sg = re.sub(u"[аяеёо]́$", "", valt_nom_sg)
                            valt_nom_sg = be.maybe_accent_final_syllable(
                                valt_nom_sg)
                            valt_nom_sg = be.destress_vowels_after_stress_movement(
                                valt_nom_sg)
                            if valt_nom_sg == be.undo_mark_stressed_vowels_in_unstressed_syllables(
                                    pl_stem):
                                return valt
            m = re.search(u"^(.*" + be.cons_c + u")ь?$", nom_sg)
            if m:
                nom_sg = m.group(1)
                nom_sg = re.sub(u"й$", "", nom_sg)
                if re.search(
                        u"я" + be.cons_c + "*" + be.vowel_c + AC + be.cons_c +
                        "*$", nom_sg):
                    nom_sg = be.apply_vowel_alternation(nom_sg, "ae")
                    m = re.search(u"^.*([ыіая]́)$", nom_pl)
                    if m:
                        nom_sg = be.remove_accents(nom_sg) + m.group(1)
                        nom_sg = be.destress_vowels_after_stress_movement(
                            nom_sg)
                        if nom_sg == be.undo_mark_stressed_vowels_in_unstressed_syllables(
                                nom_pl):
                            return "ae"
            return None

        def vowel_stem_from_vowel_ending_nom_sg(nom_sg):
            m = re.search(u"^(.*)[аяеоё]́?$", nom_sg)
            assert m
            vowel_stem = m.group(1)
            if re.search(be.vowel_c + AC + "?$", vowel_stem):
                vowel_stem += u"й"
            return vowel_stem

        def compare_stems(marked_stem, unmarked_stem):
            return (be.destress_vowels_after_stress_movement(marked_stem) ==
                    be.undo_mark_stressed_vowels_in_unstressed_syllables(
                        unmarked_stem))

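        # Work out the reducible spec ("", "*", "*#", "(-)" variants, etc.) by
        # comparing the vowel stem with the non-vowel stem; returns None if the
        # relationship can't be determined.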
        def infer_reducible(nom_sg, gen_sg, gen_pl, gender, seen_patterns):
            if len(seen_patterns) > 1:
                pagemsg(
                    "WARNING: Multiple patterns %s, not inferring reducible" %
                    ",".join(seen_patterns))
                return None
            if len(seen_patterns) == 0:
                pagemsg("WARNING: No patterns, not inferring reducible")
                return None
            seen_pattern = seen_patterns[0]
            nom_sg = truncate_extra_forms(nom_sg)
            gen_sg = truncate_extra_forms(gen_sg)
            gen_pls = gen_pl and re.split(", *", gen_pl) or []
            if re.search(u"[аяеоё]́?$", nom_sg):
                epenthetic_stress = seen_pattern in ["b", "c", "e", "f"]
                vowel_stem = vowel_stem_from_vowel_ending_nom_sg(nom_sg)
                if seen_pattern in ["b", "d"]:
                    vowel_stem = be.maybe_accent_final_syllable(vowel_stem)
                else:
                    vowel_stem = be.maybe_accent_initial_syllable(vowel_stem)
                retvals = []
                for gen_pl in gen_pls:
                    nonvowel_stem = re.sub(u"ў$", u"в",
                                           re.sub(u"ь$", "", gen_pl))
                    if compare_stems(vowel_stem, nonvowel_stem):
                        retvals.append("(-)" if gender == "N" else "")
                        continue
                    if compare_stems(
                            be.dereduce(vowel_stem, epenthetic_stress) or "",
                            nonvowel_stem):
                        retvals.append("*(-)" if gender == "N" else "*")
                        continue
                    if compare_stems(
                            be.dereduce(vowel_stem, not epenthetic_stress)
                            or "", nonvowel_stem):
                        retvals.append("*#(-)" if gender == "N" else "*#")
                        continue
                    if (compare_stems(vowel_stem + u"ав", nonvowel_stem) or
                            compare_stems(vowel_stem + u"яв", nonvowel_stem)):
                        if epenthetic_stress:
                            retvals.append("#" if gender == "N" else u"#(ў)")
                        else:
                            retvals.append("" if gender == "N" else u"(ў)")
                        continue
                    if (compare_stems(
                            be.remove_accents(vowel_stem) + u"о́в",
                            nonvowel_stem) or compare_stems(
                                be.remove_accents(vowel_stem) + u"ё́в",
                                nonvowel_stem)):
                        if epenthetic_stress:
                            retvals.append("" if gender == "N" else u"(ў)")
                        else:
                            retvals.append("#" if gender == "N" else u"#(ў)")
                        continue
                    #for valt in possible_vowel_alternations:
                    #  valt_nom_sg = be.apply_vowel_alternation(nom_sg, valt)
                    #  if valt_nom_sg:
                    #    valt_vowel_stem = vowel_stem_from_vowel_ending_nom_sg(valt_nom_sg)
                    #    if be.remove_accents(valt_vowel_stem) == be.remove_accents(nonvowel_stem):
                    #      retvals.append("(-)" if gender == "N" else "")
                    #      break
                    #    if (be.remove_accents(valt_vowel_stem) + u"ав" == be.remove_accents(nonvowel_stem) or
                    #        be.remove_accents(valt_vowel_stem) + u"яв" == be.remove_accents(nonvowel_stem)):
                    #      retvals.append("" if gender == "N" else u"(ў)")
                    #      break
                    #else: # no break
                    pagemsg(
                        "WARNING: Unable to determine relationship between nom_sg %s and gen_pl %s"
                        % (nom_sg, gen_pl))
                return ",".join(retvals)
            else:
                orig_nom_sg = nom_sg
                nonvowel_stem = re.sub(u"ь$", "", nom_sg)
                vowel_stem = re.sub(u"в$", u"ў",
                                    re.sub(u"[аяуюыі]́?$", "", gen_sg))
                if re.search(be.vowel_c + AC + "?$", vowel_stem):
                    vowel_stem += u"й"
                if compare_stems(be.reduce(nonvowel_stem) or "", vowel_stem):
                    return "*"
                nom_sg = re.sub(u"[йь]$", "", nom_sg)
                nom_sg = re.sub(u"ў$", u"в", nom_sg)
                m = re.search(u"([аяуюыі]́?)$", gen_sg)
                if not m:
                    pagemsg(
                        "WARNING: Unrecognized genitive singular ending: %s" %
                        gen_sg)
                    return None
                ending = m.group(1)
                if be.is_accented(ending):
                    nom_sg = be.remove_accents(nom_sg)
                nom_sg += ending
                if (be.destress_vowels_after_stress_movement(nom_sg) ==
                        be.undo_mark_stressed_vowels_in_unstressed_syllables(
                            gen_sg)):
                    return ""
                pagemsg(
                    "WARNING: Unable to determine relationship between nom_sg %s and gen_sg %s"
                    % (orig_nom_sg, gen_sg))
                return None

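        # Combine the observed accent patterns with the reducible spec, leaving a
        # pattern implicit when it matches the default implied by the lemma shape.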
        def construct_defaulted_seen_patterns(seen_patterns, lemma, gender,
                                              reducible):
            defaulted_seen_patterns = []
            if seen_patterns == ["b", "c"]:
                seen_patterns = ["c", "b"]
            elif seen_patterns == ["b", "d"]:
                seen_patterns = ["d", "b"]
            if len(seen_patterns) > 1 and "," in reducible:
                pagemsg(
                    "WARNING: Multiple accent patterns %s and reducible specs %s, not taking Cartesian product"
                    % (",".join(seen_patterns), reducible))
                reducible = ""
            for pattern in seen_patterns:
                for red in reducible.split(","):
                    defstress = default_stress(lemma, gender, red)
                    if defstress == pattern:
                        if len(seen_patterns) > 1:
                            defaulted_seen_patterns.append(pattern + red)
                        else:
                            defaulted_seen_patterns.append(red)
                    else:
                        defaulted_seen_patterns.append(pattern + red)
            return ",".join(defaulted_seen_patterns)

        if tn == "be-decl-noun":
            check_multi_stressed(14)
            nom_sg = fetch("1")
            gen_sg = fetch("3")
            gen_sg_end_stressed = param_is_end_accented(gen_sg)
            dat_sg = fetch("5")
            dat_sg_end_stressed = param_is_end_accented(
                dat_sg, dative_singular_endings)
            acc_sg = fetch("7")
            acc_sg_end_stressed = param_is_end_accented(acc_sg)
            ins_sg = fetch("9")
            ins_sg_end_stressed = param_is_end_accented(
                ins_sg, instrumental_singular_endings)
            loc_sg = fetch("11")
            loc_sg_end_stressed = param_is_end_accented(
                loc_sg, locative_singular_endings)
            nom_pl = fetch("2")
            nom_pl_end_stressed = param_is_end_accented(nom_pl)
            gen_pl = fetch("4")
            gen_pl_end_stressed = param_is_end_accented(gen_pl)
            acc_pl = fetch("8")
            acc_pl_end_stressed = param_is_end_accented(acc_pl)
            ins_pl = fetch("10")
            ins_pl_end_stressed = param_is_end_accented(
                ins_pl, instrumental_plural_endings)
            loc_pl = fetch("12")
            loc_pl_end_stressed = param_is_end_accented(loc_pl)
            if (gen_sg_end_stressed == "unknown"
                    or acc_sg_end_stressed == "unknown"
                    or nom_pl_end_stressed == "unknown"
                    or loc_pl_end_stressed == "unknown"):
                pagemsg(
                    "WARNING: Missing stresses, can't determine accent pattern: %s"
                    % unicode(t))
                continue
            seen_patterns = []
            for pattern, accents in accent_patterns:
                if (matches(ins_sg_end_stressed, accents["inssg"])
                        and matches(acc_sg_end_stressed, accents["accsg"])
                        and matches(nom_pl_end_stressed, accents["nompl"])
                        and matches(loc_pl_end_stressed, accents["locpl"])):
                    seen_patterns.append(pattern)
            if "a" in seen_patterns and "b" in seen_patterns:
                # If a and b apply, most others can apply as well
                seen_patterns = ["a", "b"]
            elif "a" in seen_patterns and "c" in seen_patterns:
                # If a and c apply, e can apply as well
                seen_patterns = ["a", "c"]
            elif "a" in seen_patterns and "d" in seen_patterns:
                # If a and d apply, d' can apply as well
                seen_patterns = ["a", "d"]
            elif "b" in seen_patterns and "d" in seen_patterns:
                # If b and d apply, f can apply as well
                seen_patterns = ["b", "d"]
            gen_sg_endings = fetch_endings("3", genitive_singular_endings)
            dat_sg_endings = fetch_endings("5", dative_singular_endings)
            ins_sg_endings = fetch_endings("9", instrumental_singular_endings)
            loc_sg_endings = fetch_endings("11", locative_singular_endings)
            nom_pl_endings = fetch_endings("2", nominative_plural_endings)
            gen_pl_endings = fetch_endings("4", genitive_plural_endings)

            if not heads:
                pagemsg("WARNING: No head found")
                heads = [pagetitle]
            pagemsg(
                "%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tgen_pl:%s\tnumber:both\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tnom_pl:%s\tgen_pl:%s\t| %s || \"?\" || %s || %s || %s || %s || %s || %s|| "
                % ("/".join(heads), gender, animacy, ":".join(seen_patterns),
                   stress(gen_sg_end_stressed), stress(dat_sg_end_stressed),
                   stress(loc_sg_end_stressed), stress(gen_pl_end_stressed),
                   gen_sg_endings, dat_sg_endings, loc_sg_endings,
                   nom_pl_endings, gen_pl_endings, canon(nom_sg),
                   canon(gen_sg), canon(loc_sg), canon(nom_pl), canon(gen_pl),
                   canon(ins_pl), ins_sg_note(ins_sg)))
            if len(heads) > 1:
                pagemsg(
                    "WARNING: Multiple heads, not inferring declension: %s" %
                    ",".join(heads))
                continue
            if gender == "unknown" or animacy == "unknown":
                pagemsg(
                    "WARNING: Unknown gender or animacy, not inferring declension"
                )
                continue
            defan = infer_animacy(nom_pl, gen_pl, acc_pl)
            if not (defan == "in" and animacy == "in"
                    or defan == "an" and animacy in ["pr", "anml"]):
                pagemsg(
                    "WARNING: Inferred animacy %s != explicit animacy %s, not inferring declension"
                    % (defan, animacy))
                continue
            lemma = heads[0]
            parts = []
            defg = infer_gender(lemma)
            if gender != defg:
                parts.append(gender)
            alternation = infer_alternations(nom_sg, nom_pl)

            def apply_alternations(form):
                forms = re.split(", *", form)
                forms = [
                    be.apply_vowel_alternation(form, alternation) or form
                    for form in forms
                ]
                return ", ".join(forms)

            nom_sg = apply_alternations(nom_sg)
            reducible = infer_reducible(nom_sg, gen_sg, gen_pl, gender,
                                        seen_patterns) or ""
            defaulted_seen_patterns = construct_defaulted_seen_patterns(
                seen_patterns, lemma, gender, reducible)
            if defaulted_seen_patterns:
                parts.append(defaulted_seen_patterns)
            if animacy != "in":
                parts.append(animacy)
            if alternation in ["ae", "ao", "yo"]:
                parts.append(alternation)
            if gender == "M":
                if re.search(u"у́?$", gen_sg):
                    parts.append("genu")
                elif re.search(u"ю́?$", gen_sg):
                    parts.append("genju")
            pagemsg("Inferred declension %s<%s>" % (lemma, ".".join(parts)))

        elif tn == "be-decl-noun-unc":
            check_multi_stressed(7)
            nom_sg = fetch("1")
            gen_sg = fetch("2")
            gen_sg_end_stressed = param_is_end_accented(gen_sg)
            dat_sg = fetch("3")
            dat_sg_end_stressed = param_is_end_accented(
                dat_sg, dative_singular_endings)
            acc_sg = fetch("4")
            acc_sg_end_stressed = param_is_end_accented(acc_sg)
            ins_sg = fetch("5")
            ins_sg_end_stressed = param_is_end_accented(
                ins_sg, instrumental_singular_endings)
            loc_sg = fetch("6")
            loc_sg_end_stressed = param_is_end_accented(
                loc_sg, locative_singular_endings)
            if (gen_sg_end_stressed == "unknown"
                    or acc_sg_end_stressed == "unknown"):
                pagemsg(
                    "WARNING: Missing stresses, can't determine accent pattern: %s"
                    % unicode(t))
                continue
            if not heads:
                pagemsg("WARNING: No head found")
                heads = [pagetitle]
            lemma = heads[0]
            seen_patterns = []
            for pattern, accents in accent_patterns:
                if pattern not in [
                        "a", "d" if re.search(u"[аяеёо]́?$", lemma) else "b"
                ]:
                    continue
                if (matches(ins_sg_end_stressed, accents["inssg"])
                        and matches(acc_sg_end_stressed, accents["accsg"])):
                    seen_patterns.append(pattern)
            if "a" in seen_patterns and "b" in seen_patterns:
                seen_patterns = ["a", "b"]
            if "a" in seen_patterns and "d" in seen_patterns:
                seen_patterns = ["a", "d"]
            gen_sg_endings = fetch_endings("2", genitive_singular_endings)
            dat_sg_endings = fetch_endings("3", dative_singular_endings)
            ins_sg_endings = fetch_endings("5", instrumental_singular_endings)
            loc_sg_endings = fetch_endings("6", locative_singular_endings)

            pagemsg(
                "%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tgen_pl:-\tnumber:sg\tgen_sg:%s\tdat_sg:%s\tloc_sg:%s\tnom_pl:-\tgen_pl:-\t| %s || \"?\" || %s || %s || - || - || - || %s|| "
                % ("/".join(heads), gender, animacy, ":".join(seen_patterns),
                   stress(gen_sg_end_stressed), stress(dat_sg_end_stressed),
                   stress(loc_sg_end_stressed), gen_sg_endings, dat_sg_endings,
                   loc_sg_endings, canon(nom_sg), canon(gen_sg), canon(loc_sg),
                   ins_sg_note(ins_sg)))

            if len(heads) > 1:
                pagemsg(
                    "WARNING: Multiple heads, not inferring declension: %s" %
                    ",".join(heads))
                continue
            if gender == "unknown" or animacy == "unknown":
                pagemsg(
                    "WARNING: Unknown gender or animacy, not inferring declension"
                )
                continue
            parts = []
            defg = infer_gender(lemma)
            if gender != defg:
                parts.append(gender)
            reducible = infer_reducible(nom_sg, gen_sg, None, gender,
                                        seen_patterns) or ""
            defaulted_seen_patterns = construct_defaulted_seen_patterns(
                seen_patterns, lemma, gender, reducible)
            if defaulted_seen_patterns:
                parts.append(defaulted_seen_patterns)
            if animacy != "in":
                parts.append(animacy)
            parts.append("sg")
            if gender == "M" and re.search("^" + be.uppercase_c, lemma):
                if re.search(u"у́?$", gen_sg):
                    parts.append("genu")
                elif re.search(u"ю́?$", gen_sg):
                    parts.append("genju")
            pagemsg("Inferred declension %s<%s>" % (lemma, ".".join(parts)))

        elif tn == "be-decl-noun-pl":
            check_multi_stressed(7)
            nom_pl = fetch("1")
            nom_pl_end_stressed = param_is_end_accented(nom_pl)
            gen_pl = fetch("2")
            gen_pl_end_stressed = param_is_end_accented(gen_pl)
            ins_pl = fetch("5")
            ins_pl_end_stressed = param_is_end_accented(
                ins_pl, instrumental_plural_endings)
            loc_pl = fetch("6")
            loc_pl_end_stressed = param_is_end_accented(loc_pl)
            if (nom_pl_end_stressed == "unknown"
                    or loc_pl_end_stressed == "unknown"):
                pagemsg(
                    "WARNING: Missing stresses, can't determine accent pattern: %s"
                    % unicode(t))
                continue
            seen_patterns = []
            for pattern, accents in accent_patterns:
                if pattern not in ["a", "b", "e"]:
                    continue
                if (matches(nom_pl_end_stressed, accents["nompl"])
                        and matches(loc_pl_end_stressed, accents["locpl"])):
                    seen_patterns.append(pattern)
            if "a" in seen_patterns and "b" in seen_patterns:
                seen_patterns = ["a", "b"]
            nom_pl_endings = fetch_endings("1", nominative_plural_endings)
            gen_pl_endings = fetch_endings("2", genitive_plural_endings)

            if not heads:
                pagemsg("WARNING: No head found")
                heads = [pagetitle]
            pagemsg(
                "%s\tgender:%s\tanimacy:%s\taccent:%s\tgen_sg:-\tdat_sg:-\tloc_sg:-\tgen_pl:%s\tnumber:pl\tgen_sg:-\tdat_sg:-\tloc_sg:-\tnom_pl:%s\tgen_pl:%s\t| %s || \"?\" || - || - || %s || %s || %s || || "
                % ("/".join(heads), gender, animacy, ":".join(seen_patterns),
                   stress(gen_pl_end_stressed), nom_pl_endings, gen_pl_endings,
                   canon(nom_pl), canon(nom_pl), canon(gen_pl), canon(ins_pl)))
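
# A minimal sketch of the accent-pattern matching used above. The real script's
# accent_patterns table is defined elsewhere; the two entries below (stem-stressed
# "a" and end-stressed "b") are simplified assumptions for illustration. In each
# slot, True means the form must be end-stressed, False stem-stressed, and None
# "don't care"; a form reported as "mixed" is compatible with anything.
sample_accent_patterns = [
    ("a", {"inssg": False, "accsg": False, "nompl": False, "locpl": False}),
    ("b", {"inssg": True, "accsg": True, "nompl": True, "locpl": True}),
]


def sample_matches(is_end_stressed, should_be_end_stressed):
    return (is_end_stressed == "mixed"
            or should_be_end_stressed is None
            or is_end_stressed == should_be_end_stressed)


def candidate_patterns(ins_sg, acc_sg, nom_pl, loc_pl):
    # Each argument is True (end-stressed), False (stem-stressed) or "mixed".
    return [pattern for pattern, accents in sample_accent_patterns
            if sample_matches(ins_sg, accents["inssg"])
            and sample_matches(acc_sg, accents["accsg"])
            and sample_matches(nom_pl, accents["nompl"])
            and sample_matches(loc_pl, accents["locpl"])]

# candidate_patterns(False, False, False, False) -> ["a"]
# candidate_patterns(True, True, "mixed", True) -> ["b"]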
Exemplo n.º 29
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  if "sa-noun" not in text and "sa-decl-noun" not in text:
    return

  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return

  pagemsg("Processing")

  parsed = blib.parse_text(text)

  headt = None
  saw_decl = False

  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)

    if tn == "sa-noun":
      pagemsg("Saw headt=%s" % unicode(t))
      if headt and not saw_decl:
        pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" % (unicode(headt), unicode(t)))
      headt = t
      saw_decl = False
      continue

    if tn in ["sa-decl-noun", "sa-decl"]:
      pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" % (tn, unicode(t), headt and unicode(headt) or None))
      continue

    if tn.startswith("sa-decl-noun-"):
      pagemsg("Saw declt=%s" % unicode(t))
      if not headt:
        pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
        continue
      saw_decl = True

      tr = getparam(headt, "tr")
      accented_tr = False
      if not tr:
        tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
        pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" % (unicode(headt), tr, unicode(t)))
      else:
        if "-" in tr:
          pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
          tr = tr.replace("-", "")
        decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
        if AC not in decomptr and GR not in decomptr:
          pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
        else:
          accented_tr = True
      genders = blib.fetch_param_chain(headt, "g")
      genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
      genders = [g for gs in genders for g in (
        ["m", "f"] if gs in ["mf", "fm"] else ["m", "n"] if gs in ["mn", "nm"] else [gs]
      )]

      if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
        tg = tn[-1]
        if tg not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            tg, ",".join(genders), unicode(headt), unicode(t)))
          continue

        decltr = getparam(t, "1")
        if not decltr:
          if not accented_tr:
            pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
          else:
            pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add accented translit %s to {{%s}}" % (tr, tn))
        elif re.search(u"[\u0900-\u097F]", decltr): # translit is actually Devanagari
          if not accented_tr:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
          else:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
        else:
          decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
          subbed = False
          if AC not in decompdecltr and GR not in decompdecltr:
            if accented_tr:
              pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s" %
                  (decltr, tr, unicode(headt), unicode(t)))
              t.add("1", tr)
              notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (decltr, tr, tn))
              subbed = True
            else:
              pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s" %
                  (decltr, unicode(headt), unicode(t)))
          if not subbed and "-" in decltr:
            pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
            decltr = decltr.replace("-", "")
            t.add("1", decltr)
            subbed = True
          stripped_decltr = decltr.strip()
          if "\n" not in decltr and stripped_decltr != decltr:
            pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (decltr, tn))
            decltr = stripped_decltr
            t.add("1", decltr)
            subbed = True
        continue

      if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
        pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" % (unicode(headt), unicode(t)))
        continue

      if tn in old_template_to_gender:
        must_continue = False
        for param in t.params:
          pn = pname(param)
          if pn not in ["1", "2", "3", "4", "n"]:
            pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" % (pn, unicode(param.value), unicode(t),
              unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue

        g = old_template_to_gender[tn]
        if g not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            g, ",".join(genders), unicode(headt), unicode(t)))
          continue

        blib.set_template_name(t, "sa-decl-noun-%s" % g)
        rmparam(t, "n")
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        t.add("1", tr)
        notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
      else:
        pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))

    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  if headt:
    pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))

  return unicode(parsed), notes
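
# A minimal sketch of the accent check applied to transliterations above, assuming
# AC and GR are the combining acute and grave accents (their real definitions live
# elsewhere in the script). NFD decomposition separates base letters from combining
# accents, and s + acute is recomposed to ś first because that is a letter of IAST,
# not a pitch accent.
import unicodedata

AC = u"\u0301"  # combining acute accent (assumed)
GR = u"\u0300"  # combining grave accent (assumed)

def has_accented_translit(tr):
  decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"\u015b")  # ś
  return AC in decomptr or GR in decomptr

# has_accented_translit(u"devá") -> True, has_accented_translit(u"deva") -> False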
Exemplo n.º 30
    if not headword_template:
        pagemsg("WARNING: Can't find headword template, skipping")
        return

    pagemsg("Found headword template: %s" % unicode(headword_template))

    headword_is_proper = unicode(headword_template.name) == "ru-proper noun"

    if (getparam(headword_template, "3") == "-"
            or "[[Category:Russian indeclinable nouns]]" in page.text):
        pagemsg("WARNING: Indeclinable noun, skipping")
        return

    headword_trs = blib.fetch_param_chain(headword_template, "tr", "tr")
    if headword_trs:
        pagemsg("WARNING: Found headword manual translit, skipping: %s" %
                ",".join(headword_trs))
        return

    headword = getparam(headword_template, "1")
    for badparam in ["head2", "gen2", "pl2"]:
        val = getparam(headword_template, badparam)
        if val:
            pagemsg(
                "WARNING: Found extra param, can't handle, skipping: %s=%s" %
                (badparam, val))
            return

    # Here we use a capturing split, and treat what we want to capture as
Exemplo n.º 31
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if old_adj_template not in text and "es-noun" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "es-noun" and args.remove_redundant_noun_args:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            if not getparam(t, "2") and (getparam(t, "pl2")
                                         or getparam(t, "pl3")):
                pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" %
                        unicode(t))
                continue
            g = getparam(t, "1")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = False
            for m in ms:
                if " " in m:
                    space_in_m = True
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in m=%s and old default noun algorithm applying"
                    % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = False
            for f in fs:
                if " " in f:
                    space_in_f = True
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in f=%s and old default noun algorithm applying"
                    % ",".join(fs))
            pls = blib.fetch_param_chain(t, "2", "pl")
            if not pls and not g.endswith("-p"):
                if " " in lemma:
                    pagemsg(
                        "WARNING: Space in headword and old default noun algorithm applying"
                    )
                continue
            pls_with_def = []
            defpl = make_plural(lemma)
            if not defpl:
                continue
            if len(defpl) > 1:
                if set(pls) == set(defpl):
                    pls_with_def = ["+"]
                elif set(pls) < set(defpl):
                    pagemsg(
                        "WARNING: pls=%s subset of defpls=%s, replacing with default"
                        % (",".join(pls), ",".join(defpl)))
                    pls_with_def = ["+"]
                else:
                    pls_with_def = pls
            else:
                for pl in pls:
                    if pl == defpl[0]:
                        pls_with_def.append("+")
                    else:
                        pls_with_def.append(pl)

            actual_special = None
            for special in all_specials:
                special_pl = make_plural(lemma, special)
                if special_pl is None:
                    continue
                if len(special_pl) > 1 and set(pls) < set(special_pl):
                    pagemsg(
                        "WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing"
                        % (special, ",".join(pls), ",".join(special_pl)))
                    actual_special = special
                    break
                if set(pls) == set(special_pl):
                    pagemsg("Found special=%s with special_pl=%s" %
                            (special, ",".join(special_pl)))
                    actual_special = special
                    break

            if pls_with_def == ["+"]:
                notes.append("remove redundant plural%s %s from {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls)))
                blib.remove_param_chain(t, "2", "pl")
            elif actual_special:
                notes.append("replace plural%s %s with +%s in {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls),
                              actual_special))
                blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
            elif pls_with_def != pls:
                notes.append(
                    "replace default plural %s with '+' in {{es-noun}}" %
                    ",".join(defpl))
                blib.set_param_chain(t, pls_with_def, "2", "pl")

            def handle_mf(mf, mf_full, make_mf):
                mfs = blib.fetch_param_chain(t, mf, mf)
                mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
                if mfs and not any(x.startswith("+") for x in mfs):
                    defmf = make_mf(lemma)
                    if set(mfs) == {defmf}:
                        defpls = make_plural(defmf)
                        ok = False
                        if not mfpls or set(mfpls) == set(defpls):
                            ok = True
                        elif set(mfpls) < set(defpls):
                            pagemsg(
                                "WARNING: %spl=%s subset of default=%s, allowing"
                                % (mf, ",".join(mfpls), ",".join(defpls)))
                            ok = True
                        if ok:
                            notes.append(
                                "replace %s=%s with '+' in {{es-noun}}" %
                                (mf, ",".join(mfs)))
                            blib.set_param_chain(t, ["+"], mf, mf)
                            blib.remove_param_chain(t, mf + "pl", mf + "pl")
                            return
                    actual_special = None
                    for special in all_specials:
                        special_mf = make_mf(lemma, special)
                        if special_mf is None:
                            continue
                        if mfs == [special_mf]:
                            pagemsg("Found special=%s with special_mf=%s" %
                                    (special, special_mf))
                            actual_special = special
                            break
                    if actual_special:
                        if not mfpls:
                            pagemsg(
                                "WARNING: Explicit %s=%s matches special=%s but no %s plural"
                                % (mf, ",".join(mfs), actual_special, mf_full))
                        else:
                            special_mfpl = make_plural(special_mf,
                                                       actual_special)
                            if special_mfpl:
                                if len(special_mfpl) > 1 and set(mfpls) < set(
                                        special_mfpl):
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf,
                                           ",".join(special_mfpl)))
                                elif set(mfpls) == set(special_mfpl):
                                    pagemsg(
                                        "Found %s=%s and special=%s, %spls=%s matches special_%spl"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf))
                                else:
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf,
                                           ",".join(special_mfpl)))
                                    actual_special = None
                        if actual_special:
                            notes.append(
                                "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural"
                                % (mf_full, ",".join(mfs), actual_special,
                                   mf_full))
                            blib.set_param_chain(t, ["+%s" % actual_special],
                                                 mf, mf)
                            blib.remove_param_chain(t, mf + "pl", mf + "pl")
                    if not actual_special:
                        defmf = make_mf(lemma)
                        mfs_with_def = ["+" if x == defmf else x for x in mfs]
                        if mfs_with_def != mfs:
                            notes.append(
                                "replace default %s %s with '+' in {{es-noun}}"
                                % (mf_full, defmf))
                            blib.set_param_chain(t, mfs_with_def, mf, mf)
                        if mfpls:
                            defpl = [
                                x for y in mfs for x in (make_plural(y) or [])
                            ]
                            ok = False
                            if set(defpl) == set(mfpls):
                                ok = True
                            elif len(defpl) > 1 and set(mfpls) < set(defpl):
                                pagemsg(
                                    "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing"
                                    % (mf, ",".join(mfs), mf, ",".join(mfpls),
                                       ",".join(defpl)))
                                ok = True
                            if ok:
                                pagemsg(
                                    "Found %s=%s, %spl=%s matches default pl" %
                                    (mf, ",".join(mfs), mf, ",".join(mfpls)))
                                notes.append(
                                    "remove redundant explicit %s plural %s in {{es-noun}}"
                                    % (mf_full, ",".join(mfpls)))
                                blib.remove_param_chain(
                                    t, mf + "pl", mf + "pl")
                            else:
                                for special in all_specials:
                                    defpl = [
                                        x for y in mfs for x in (
                                            make_plural(y, special) or [])
                                    ]
                                    if set(defpl) == set(mfpls):
                                        pagemsg(
                                            "Found %s=%s, %spl=%s matches special=%s"
                                            % (mf, ",".join(mfs), mf,
                                               ",".join(mfpls), special))
                                        notes.append(
                                            "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}"
                                            % (mf_full, ",".join(mfpls),
                                               special))
                                        blib.set_param_chain(
                                            t, ["+%s" % special], mf + "pl",
                                            mf + "pl")

            handle_mf("f", "feminine", make_feminine)
            handle_mf("m", "masculine", make_masculine)

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

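        # When args.make_multiword_plural_explicit is set, add explicit plurals
        # (and explicit m=/f= plurals) to multiword {{es-noun}} headwords by
        # invoking make_plural_noun in the es-headword module directly.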
        if tn == "es-noun" and args.make_multiword_plural_explicit:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)

            def expand_text(tempcall):
                return blib.expand_text(tempcall, pagetitle, pagemsg,
                                        args.verbose)

            if " " in lemma and not getparam(t, "2"):
                g = getparam(t, "1")
                if not g.endswith("-p"):
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" %
                        (lemma, g))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string"
                        )
                        continue
                    plurals = explicit_pl.split(",")
                    blib.set_param_chain(t, plurals, "2", "pl")
                    notes.append("add explicit plural to multiword noun")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = False
            for m in ms:
                if " " in m:
                    space_in_m = True
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls:
                mpls = []
                for m in ms:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" %
                        (blib.remove_links(m)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string"
                            % m)
                        continue
                    this_mpls = explicit_pl.split(",")
                    mpls.extend(this_mpls)
                blib.set_param_chain(t, mpls, "mpl", "mpl")
                notes.append("add explicit plural to m=%s" % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = False
            for f in fs:
                if " " in f:
                    space_in_f = True
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls:
                fpls = []
                for f in fs:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" %
                        (blib.remove_links(f)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string"
                            % f)
                        continue
                    this_fpls = explicit_pl.split(",")
                    fpls.extend(this_fpls)
                blib.set_param_chain(t, fpls, "fpl", "fpl")
                notes.append("add explicit plural to f=%s" % ",".join(fs))
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

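        # Convert the old adjective headword template to {{es-adj}} (or
        # {{es-adj-inv}} for invariable adjectives), using '+' for default forms,
        # '#' for forms identical to the lemma and sp= for special patterns.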
        if tn == old_adj_template:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            deff = make_feminine(pagetitle)
            defmpl = make_plural(pagetitle)
            fs = []
            fullfs = []
            f = getparam(t, "f") or pagetitle
            fullfs.append(f)
            if f == deff:
                f = "+"
            elif f == lemma:
                f = "#"
            fs.append(f)
            f2 = getparam(t, "f2")
            if f2:
                fullfs.append(f2)
                if f2 == deff:
                    f2 = "+"
                fs.append(f2)
            mpls = []
            mpl = getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
            mpls.append(mpl)
            mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
            if mpl2:
                mpls.append(mpl2)
            fullmpls = mpls
            # should really check for subsequence but it never occurs
            if set(mpls) == set(defmpl):
                mpls = ["+"]
            elif set(mpls) < set(defmpl):
                pagemsg(
                    "WARNING: mpls=%s subset of defmpl=%s, replacing with default"
                    % (",".join(mpls), ",".join(defmpl)))
                mpls = ["+"]
            mpls = ["#" if x == lemma else x for x in mpls]
            deffpl = [x for f in fullfs for x in (make_plural(f) or [])]
            fpls = []
            fpl = getparam(t, "fpl") or getparam(
                t, "pl") or (getparam(t, "f") or pagetitle) + "s"
            fpls.append(fpl)
            fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
            if fpl2:
                fpls.append(fpl2)
            fullfpls = fpls
            # should really check for subsequence but it never occurs
            if set(fpls) == set(deffpl):
                fpls = ["+"]
            elif set(fpls) < set(deffpl):
                pagemsg(
                    "WARNING: fpls=%s subset of deffpl=%s, replacing with default"
                    % (",".join(fpls), ",".join(deffpl)))
                fpls = ["+"]
            fpls = ["#" if x == lemma else x for x in fpls]
            actual_special = None
            for special in all_specials:
                deff = make_feminine(pagetitle, special)
                if deff is None:
                    continue
                defmpl = make_plural(pagetitle, special)
                deffpl = make_plural(deff, special)
                deff = [deff]
                if fullfs == deff and fullmpls == defmpl and fullfpls == deffpl:
                    actual_special = special
                    break

            head = getparam(t, "head")

            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn == "1" and pv in ["m", "mf"]:
                    pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" %
                            (pn, pv, unicode(t)))
                    continue
                if pn not in [
                        "head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl",
                        "fpl2"
                ]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s in %s" %
                            (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue

            del t.params[:]
            if head:
                t.add("head", head)
            if fullfs == [pagetitle] and fullmpls == [
                    pagetitle
            ] and fullfpls == [pagetitle]:
                blib.set_template_name(t, "es-adj-inv")
            else:
                blib.set_template_name(t, "es-adj")
                if actual_special:
                    t.add("sp", actual_special)
                else:
                    if fs != ["+"]:
                        blib.set_param_chain(t, fs, "f", "f")

                    if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
                        # masc and fem pl the same
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "pl", "pl")
                    else:
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "mpl", "mpl")
                        if fpls != ["+"]:
                            blib.set_param_chain(t, fpls, "fpl", "fpl")

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("convert {{%s}} to new {{%s}} format" %
                             (old_adj_template, tname(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
Example No. 32
def process_text_on_page(index, pagetitle, text):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  def verify_template_is_full_line(tn, line):
    templates = list(blib.parse_text(line).filter_templates())
    if type(tn) is list:
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" %
          (tntext, tntext, line))
      return None
    if unicode(t) != line:
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  sect_for_wiki = 0
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]):
      sect_for_wiki = k + 1
    elif re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_it_IPA = 0
      saw_it_pr = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["it-pr", "it-pronunciation"]:
          saw_it_pr = True
          break
        if tn == "it-IPA":
          num_it_IPA += 1
      if saw_it_pr:
        pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t))
        continue
      if num_it_IPA == 0:
        pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping")
        continue
      if num_it_IPA > 1:
        pagemsg("WARNING: Saw multiple {{it-IPA}} in Pronunciation section, skipping")
        continue
      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rfap_lines = []
      rhyme_lines = []
      must_continue = False
      audioarg = ""
      args = []
      bare_args = []
      args_for_hyph = []
      lines_so_far = []
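      # Walk the Pronunciation section line by line, collecting the {{it-IPA}}
      # arguments plus any audio, rhyme, hyphenation and homophone lines to fold
      # into a single {{it-pr}}; {{wikipedia}} and {{rfap}} lines are kept separate.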
      for lineind, line in enumerate(lines):
        origline = line
        lines_so_far.append(line)
        # In case of "* {{it-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line)
        if line.startswith("{{it-IPA"):
          if args:
            pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline)
            must_continue = True
            break
          outer_ref_arg = None
          m = re.search("^(.*?) *<ref>(.*?)</ref>$", line)
          if m:
            line, outer_ref_arg = m.groups()
          ipat = verify_template_is_full_line("it-IPA", line)
          if ipat is None:
            must_continue = True
            break
          bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"]
          bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args]
          bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args]
          bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args]
          normalized_bare_args = [
            normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline)))
            for arg in bare_args
          ]
          if None in normalized_bare_args:
            must_continue = True
            break
          args = [x for x in bare_args]

          args_for_hyph = []
          for arg in normalized_bare_args:
            hypharg = (
              arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z")
              .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z")
            )
            hypharg = re.sub(pron_sign_c, "", hypharg)
            putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", ""))
            putative_pagetitle = remove_non_final_accents(putative_pagetitle)
            # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized
            # pronunciation for hyphenation. If a word in the page title is a single syllable, it may or may not
            # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation
            # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want
            # pronunciation rè to match page title ré or vice versa.)
            if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle:
              args_for_hyph.append(hypharg)

          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              continue
            m = re.search("^(ref|qual)([0-9]*)$", pn)
            if m:
              parampref, argnum = m.groups()
              argnum = int(argnum or "1") - 1
              if argnum >= len(args):
                pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % (
                  pn, pv, origline))
                must_continue = True
                break
              args[argnum] += "<%s:%s>" % (parampref, pv)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if outer_ref_arg:
            if "<ref:" in args[-1]:
              pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s"
                  % (outer_ref_arg, args[-1], origline))
              must_continue = True
              break
            else:
              args[-1] += "<ref:%s>" % outer_ref_arg
              extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}")
          continue
        if line.startswith("{{rfap"):
          line = "* " + line
        if line.startswith("{{wiki"):
          subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki]
          # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we
          # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate
          # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section.
          del lines_so_far[-1]
          subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:])
          notes.append("move {{wikipedia}} line to top of etym section")
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s"
              % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append("* " + line)
        elif line.startswith("{{homophone"):
          homophone_lines.append("* " + line)
        elif line.startswith("{{rfap"):
          rfap_lines.append(line)
        elif line.startswith("{{audio"):
          audiot = verify_template_is_full_line("audio", line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "it":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          if audiogloss:
            audiogloss = ";%s" % audiogloss
          audiopart = "<audio:%s%s>" % (audiofile, audiogloss)
          audioarg += audiopart
          pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart))
          extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot))
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        elif remove_accents(line) == remove_accents(pagetitle):
          pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if must_continue:
        continue

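      # Compare explicit {{rhyme}}/{{rhymes}} lines with the rhymes derived from the
      # pronunciation; redundant ones are dropped and non-matching rhymes are kept
      # via an inline <rhyme:...> modifier.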
      if rhyme_lines:
        rhyme_error = False
        rhyme_pronuns = []
        for bare_arg in normalized_bare_args:
          pronun = expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg))
          if not pronun:
            rhyme_error = True
            break
          rhyme_pronun = (
            re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "")
            .replace(".", ""))
          if rhyme_pronun not in rhyme_pronuns:
            rhyme_pronuns.append(rhyme_pronun)
        if not rhyme_error:
          saw_non_matching_rhyme = False
          normalized_rhymes = []
          rhyme_line_text = ", ".join(rhyme_lines)
          normalized_bare_arg_text = ",".join(normalized_bare_args)
          rhyme_pronun_text = ",".join(rhyme_pronuns)
          for rhyme_line in rhyme_lines:
            rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
            if not rhymet:
              break
            if getparam(rhymet, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
              break
            rhymes = []
            must_break = False
            num_syl = ""
            rhyme_specific_num_syl = []
            for param in rhymet.params:
              pn = pname(param)
              pv = unicode(param.value)
              if not re.search("^s?[0-9]*$", pn):
                pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                    (pn, pv, tname(rhymet), rhyme_line))
                must_break = True
                break
              if pn == "s":
                num_syl = "<s:%s>" % pv
              elif pn.startswith("s"):
                rhyme_no = int(pn[1:]) - 1
                rhyme_specific_num_syl.append((rhyme_no, pv))
              elif int(pn) > 1:
                if pv:
                  rhymes.append([pv, ""])
            if must_break:
              break
            for rhyme_no, this_num_syl in rhyme_specific_num_syl:
              if rhyme_no >= len(rhymes):
                pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s" % (
                  rhyme_no + 1, this_num_syl, rhyme_line))
                must_break = True
                break
              rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl
            if must_break:
              break
            for rhyme, this_num_syl in rhymes:
              normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm")
              normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme)
              this_num_syl = this_num_syl or num_syl
              if this_num_syl and not args_for_hyph and not hyph_lines:
                pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or explicit hyphenation: %s"
                    % (this_num_syl, rhyme, rhyme_line_text))
                saw_non_matching_rhyme = True
                normalized_rhymes.append(normalized_rhyme + this_num_syl)
              else:
                normalized_rhymes.append(normalized_rhyme)
                if rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif normalized_rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif rhyme != normalized_rhyme:
                  pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
                else:
                  pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
          else: # no break
            if saw_non_matching_rhyme:
              pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s"
                  % (",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
              args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes)
              extra_notes.append("incorporate non-default rhymes into {{it-pr}}")
            else:
              extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}")
            rhyme_lines = []

      if not args:
        pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?")
        continue
      args[-1] += audioarg

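      # Compare explicit {{hyph}}/{{hyphenation}} lines with the automatic
      # syllabification; redundant hyphenations are dropped and non-matching ones
      # are kept via an inline <hyph:...> modifier.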
      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
        else:
          assert hyph_lines[0].startswith("* ")
          hyph_line = hyph_lines[0][2:]
          hyph_templates = re.split(", *", hyph_line)
          hyphs = []
          for hyph_template in hyph_templates:
            hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template)
            if not hypht:
              break
            syls = []
            if getparam(hypht, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template))
              break
            else:
              must_break = False
              for param in hypht.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn) and pn != "nocaption":
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hypht), hyph_line))
                  must_break = True
                  break
                if pn != "nocaption" and int(pn) > 1:
                  if not pv:
                    hyphs.append(syls)
                    syls = []
                  else:
                    syls.append(pv)
              if must_break:
                break
              if syls:
                hyphs.append(syls)
          else: # no break
            if hyphs:
              specified_hyphenations = [".".join(syls) for syls in hyphs]
              specified_hyphenations = [
                re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [
                adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations]
              hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph]
              if set(specified_hyphenations) < set(hyphenations):
                pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
              elif set(specified_hyphenations) != set(hyphenations):
                hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations]
                rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations]
                def indices_of_syllable_markers(hyph):
                  # Get the character indices of the syllable markers, but not counting the syllable markers themselves
                  # (i.e. return the number of characters preceding the syllable marker).
                  raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."]
                  adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)]
                  return set(adjusted_indices)
                if set(specified_hyphenations) == set(hyphenations_without_accents):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif set(rehyphenated_specified_hyphenations) == set(hyphenations):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1
                    and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "")
                    and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                else:
                  if not hyphenations:
                    pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s" %
                        (",".join(specified_hyphenations), hyph_line))
                  else:
                    pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s" %
                        (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                  args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations)
                  extra_notes.append("incorporate non-default hyphenations into {{it-pr}}")
              else:
                pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line)
                extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}")
              hyph_lines = []

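      # Fold a single {{hmp}}/{{homophone}}/{{homophones}} line into an inline
      # <hmp:...> modifier, preserving any per-homophone qualifiers.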
      if homophone_lines:
        if len(homophone_lines) > 1:
          pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines))
        else:
          assert homophone_lines[0].startswith("* ")
          homophone_line = homophone_lines[0][2:]
          homophones = {}
          homophone_qualifiers = {}
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if hmpt:
            if getparam(hmpt, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            else:
              for param in hmpt.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^q?[0-9]+$", pn):
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hmpt), homophone_line))
                  break
                if pn.startswith("q"):
                  homophone_qualifiers[int(pn[1:])] = pv
                elif int(pn) > 1:
                  homophones[int(pn) - 1] = pv
              else: # no break
                hmp_args = []
                for pn, pv in sorted(homophones.items()):
                  hmp_args.append(pv)
                  if pn in homophone_qualifiers:
                    hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn]
                args[-1] += "<hmp:%s>" % ",".join(hmp_args)
                extra_notes.append("incorporate homophones into {{it-pr}}")
                homophone_lines = []

      if args == ["+"]:
        it_pr = "{{it-pr}}"
      else:
        it_pr = "{{it-pr|%s}}" % ",".join(args)
      pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr))

      all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines)
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes
        notes.extend(this_notes)
      subsections[k + 1] = newsubsec

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
Example No. 33
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" %
              unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

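  # Second pass: convert {{bg-noun form of}} to {{inflection of}} and, where the
  # inflection can be matched against the lemma's declension forms, add the
  # accented form(s) to head= of the preceding {{head|bg|noun form}} template.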
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    saw_infl = False
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)
      saw_infl = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infl:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      saw_infl = infls_to_slot(infls)
      if not saw_infl:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      if not already_fetched_forms:
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      if saw_infl == "def_sg":
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" %
            (saw_infl, format_forms(forms)))
        continue
      form = form.split(",")
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
              unicode(headt))
          continue
        if not any(needs_accents):
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))

  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

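  # Finally, convert any remaining recognized form-of templates to
  # {{inflection of}}, using the inflection codes in template_to_infl_codes.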
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)

  return unicode(parsed), notes
Example No. 34
      pagemsg("WARNING: Found ru-pre-reform template, skipping")
      return

  if not headword_template:
    pagemsg("WARNING: Can't find headword template, skipping")
    return

  pagemsg("Found headword template: %s" % unicode(headword_template))

  headword_is_proper = unicode(headword_template.name) == "ru-proper noun"

  if getparam(headword_template, "3") == "-" or "[[Category:Russian indeclinable nouns]]" in page.text:
    pagemsg("WARNING: Indeclinable noun, skipping")
    return

  headword_trs = blib.fetch_param_chain(headword_template, "tr", "tr")
  if headword_trs:
    pagemsg("WARNING: Found headword manual translit, skipping: %s" %
        ",".join(headword_trs))
    return

  headword = getparam(headword_template, "1")
  for badparam in ["head2", "gen2", "pl2"]:
    val = getparam(headword_template, badparam)
    if val:
      pagemsg("WARNING: Found extra param, can't handle, skipping: %s=%s" % (
        badparam, val))
      return

  # Here we use a capturing split, and treat what we want to capture as
  # the splitting text, backwards from what you'd expect. The separators
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    pagemsg("Processing")

    notes = []

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn not in ["la-noun", "la-proper noun"]:
            continue

        origt = unicode(t)

        def render_headword():
            return "headword template <from> %s <to> %s <end>" % (origt, origt)

        if getparam(t, "indecl"):
            pagemsg("Skipping indeclinable noun: %s" % render_headword())
            continue
        new_style_headword_template = (not getparam(t, "head2")
                                       and not getparam(t, "2")
                                       and not getparam(t, "3")
                                       and not getparam(t, "4")
                                       and not getparam(t, "decl"))
        if new_style_headword_template:
            pagemsg("Skipping new-style template: %s" % render_headword())
            continue
        lemma = blib.fetch_param_chain(t, ["1", "head", "head1"],
                                       "head") or [pagetitle]
        genitive = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
        noun_gender = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
        noun_decl = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")
        if " " in lemma[0]:
            pagemsg("WARNING: Space in lemma %s, skipping: %s" %
                    (lemma[0], render_headword()))
            continue
        if len(lemma) > 1:
            pagemsg("WARNING: Multiple lemmas %s, skipping: %s" %
                    (",".join(lemma), render_headword()))
            continue
        lemma = lemma[0]
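        # Map the spelled-out declension name from the old-style headword template
        # to the numeric declension code used by {{la-ndecl}}.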
        noun_decl_to_decl_type = {
            "first": "1",
            "second": "2",
            "third": "3",
            "fourth": "4",
            "fifth": "5",
            "irregular": "irreg",
        }
        if len(noun_decl) == 0:
            pagemsg("WARNING: No declension, skipping: %s" % render_headword())
            continue
        if len(noun_decl) > 1:
            pagemsg("WARNING: Multiple decls %s, skipping: %s" %
                    (",".join(noun_decl), render_headword()))
            continue
        noun_decl = noun_decl[0]
        if noun_decl not in noun_decl_to_decl_type:
            pagemsg("WARNING: Unrecognized declension %s, skipping: %s" %
                    (noun_decl, render_headword()))
            continue
        decl_type = noun_decl_to_decl_type[noun_decl]
        if decl_type in ["1", "2", "4", "5"]:
            param1 = "%s<%s>" % (lemma, decl_type)
        elif decl_type == "3":
            if len(genitive) == 0:
                pagemsg(
                    "WARNING: No genitives with decl 3 lemma %s, skipping: %s"
                    % (lemma, render_headword()))
                continue
            elif len(genitive) > 1:
                pagemsg(
                    "WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s"
                    % (",".join(genitive), lemma, render_headword()))
                continue
            else:
                gen1 = genitive[0]
                if gen1.endswith("is"):
                    stem = gen1[:-2]
                    if lalib.infer_3rd_decl_stem(lemma) == stem:
                        param1 = "%s<3>" % lemma
                    else:
                        param1 = "%s/%s<3>" % (lemma, stem)
                elif gen1.endswith("ium"):
                    if lemma.endswith("ia"):
                        param1 = "%s<3.pl>" % lemma
                    elif lemma.endswith(u"ēs"):
                        param1 = "%s<3.I.pl>" % lemma
                    else:
                        pagemsg(
                            "WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s"
                            % (lemma, render_headword()))
                        continue
                elif gen1.endswith("um"):
                    if lemma.endswith("a") or lemma.endswith(u"ēs"):
                        param1 = "%s<3.pl>" % lemma
                    else:
                        pagemsg(
                            "WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s"
                            % (lemma, render_headword()))
                        continue
                else:
                    pagemsg(
                        "WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s"
                        % (gen1, lemma, render_headword()))
                    continue
        elif decl_type == "irreg":
            pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" %
                    render_headword())
            continue
        else:
            pagemsg(
                "WARNING: Something wrong, unrecognized decl_type %s, skipping: %s"
                % (decl_type, render_headword()))
            continue
        la_ndecl = "{{la-ndecl|%s}}" % param1
        noun_props = convert_la_headword_noun.new_generate_noun_forms(
            la_ndecl, errandpagemsg, expand_text, include_props=True)
        if noun_props is None:
            continue
        decl_gender = noun_props.get("g", None)
        if not convert_la_headword_noun.compare_headword_decl_forms(
                "genitive",
                genitive, ["gen_sg", "gen_pl"],
                noun_props,
                render_headword(),
                pagemsg,
                adjust_for_missing_gen_forms=True,
                adjust_for_e_ae_gen=True,
                remove_headword_links=True):
            continue
        if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
            need_explicit_gender = False
        else:
            need_explicit_gender = True
            if len(noun_gender) > 1:
                pagemsg(
                    "WARNING: Saw multiple headword genders %s, please verify: %s"
                    % (",".join(noun_gender), render_headword()))
            elif (noun_gender and noun_gender[0].startswith("n") !=
                  (decl_gender == "n")):
                pagemsg(
                    "WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s"
                    % (noun_gender[0], decl_gender, render_headword()))
                continue

        # Fetch remaining params from headword template
        headword_params = []
        for param in t.params:
            pname = unicode(param.name)
            if pname.strip() in ["1", "2", "3", "4"] or re.search(
                    "^(head|gen|g|decl)[0-9]*$", pname.strip()):
                continue
            headword_params.append((pname, param.value, param.showkey))
        # Erase all params
        del t.params[:]
        # Add param1
        t.add("1", param1)
        # Add explicit gender if needed
        if need_explicit_gender:
            explicit_genders = []
            for ng in noun_gender:
                ng = ng[0]
                if ng not in explicit_genders:
                    explicit_genders.append(ng)
            blib.set_param_chain(t, explicit_genders, "g", "g")
        # Copy remaining params from headword template
        for name, value, showkey in headword_params:
            t.add(name, value, showkey=showkey, preserve_spacing=False)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append(
            "convert {{la-noun}}/{{la-proper noun}} params to new style")

    return unicode(parsed), notes
def process_page(index, page, save, verbose, fixdirecs):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  saw_paired_verb = False
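  # Mark an imperfective verb as "paired" if a listed perfective/imperfective
  # counterpart shares its first two characters; unpaired imperfectives whose
  # conjugation lacks a past passive participle are then fixed per fixdirecs.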
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-verb":
      saw_paired_verb = False
      if getparam(t, "2") in ["impf", "both"]:
        verb = getparam(t, "1")
        pfs = blib.fetch_param_chain(t, "pf", "pf")
        impfs = blib.fetch_param_chain(t, "impf", "impf")
        for otheraspect in pfs + impfs:
          if verb[0:2] == otheraspect[0:2]:
            saw_paired_verb = True
    if (unicode(t.name) in ["ru-conj", "ru-conj-old"] and
        getparam(t, "1") == "impf" and not saw_paired_verb):
      if getparam(t, "ppp") or getparam(t, "past_pasv_part"):
        pass
      elif [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        pass
      elif re.search(r"\+p|\[?\([78]\)\]?", getparam(t, "2")):
        pass
      else:
        pagemsg("Apparent unpaired transitive imperfective without PPP")
        if pagetitle in fixdirecs:
          direc = fixdirecs[pagetitle]
          assert direc in ["fixed", "paired", "intrans", "+p", "|ppp=-"]
          origt = unicode(t)
          if direc == "+p":
            t.add("2", getparam(t, "2") + "+p")
            notes.append("add missing past passive participle to transitive unpaired imperfective verb")
            pagemsg("Add missing PPP, replace %s with %s" % (origt, unicode(t)))
          elif direc == "|ppp=-":
            t.add("ppp", "-")
            notes.append("note transitive unpaired imperfective verb as lacking past passive participle")
            pagemsg("Note no PPP, replace %s with %s" % (origt, unicode(t)))
          elif direc == "paired":
            pagemsg("Verb actually is paired")
          elif direc == "fixed":
            pagemsg("WARNING: Unfixed verb marked as fixed")
          elif direc == "intrans":
            pagemsg("WARNING: Transitive verb marked as intrans")

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed):
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text

  notes = []

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None

  sections, j, secbody, sectail, has_non_latin = retval

  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)
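  # Because the header pattern is captured, the split yields alternating
  # chunks: index 0 is whatever precedes the first ===...=== header, odd
  # indices are the headers themselves, and even indices from 2 on are the
  # subsection bodies that the loop below walks.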

  saw_a_template = False

  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    la_noun_template = None
    la_ndecl_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-ndecl":
        if la_ndecl_template:
          pagemsg("WARNING: Saw multiple noun declension templates in subsection, %s and %s, skipping" % (
            unicode(la_ndecl_template), unicode(t)))
          must_continue = True
          break
        la_ndecl_template = t
        saw_a_template = True
      if tn in ["la-noun", "la-proper noun", "la-location"] or (
        tn == "head" and getparam(t, "1") == "la" and getparam(t, "2") in ["noun", "proper noun"]
      ):
        if la_noun_template:
          pagemsg("WARNING: Saw multiple noun headword templates in subsection, %s and %s, skipping" % (
            unicode(la_noun_template), unicode(t)))
          must_continue = True
          break
        la_noun_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_noun_template and not la_ndecl_template:
      continue
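    # A "new-style" {{la-noun}}/{{la-proper noun}} headword has no head2= and
    # no explicit genitive (2=), gender (3=), declension (4=) or decl= params;
    # such templates have already been converted and are skipped below.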
    new_style_headword_template = (
      la_noun_template and
      tname(la_noun_template) in ["la-noun", "la-proper noun"] and
      not getparam(la_noun_template, "head2") and
      not getparam(la_noun_template, "2") and
      not getparam(la_noun_template, "3") and
      not getparam(la_noun_template, "4") and
      not getparam(la_noun_template, "decl")
    )
    if la_noun_template and not la_ndecl_template:
      if (tname(la_noun_template) in ["la-noun", "la-proper noun"] and
          getparam(la_noun_template, "indecl")):
        if new_style_headword_template:
          pagemsg("Found new-style indeclinable noun headword template, skipping: %s" %
            unicode(la_noun_template))
          continue
        if (getparam(la_noun_template, "head2") or
            getparam(la_noun_template, "decl") or
            getparam(la_noun_template, "2") and
            getparam(la_noun_template, "2") != getparam(la_noun_template, "1") or
            not getparam(la_noun_template, "3")):
          pagemsg("WARNING: Found old-style indeclinable noun headword template and don't know how to convert: %s" %
              unicode(la_noun_template))
          continue
        gender = getparam(la_noun_template, "3")
        orig_la_noun_template = unicode(la_noun_template)
        la_noun_template.add("g", gender[0], before="3")
        rmparam(la_noun_template, "3")
        rmparam(la_noun_template, "2")
        pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
        notes.append("convert indeclinable {{la-noun}}/{{la-proper noun}} template to new style")
        subsections[k] = unicode(parsed)
        continue
      else:
        pagemsg("WARNING: Saw noun headword template but no declension template: %s" % unicode(la_noun_template))
        continue
    if la_ndecl_template and not la_noun_template:
      pagemsg("WARNING: Saw noun declension template but no headword template: %s" % unicode(la_ndecl_template))
      continue

    orig_la_noun_template = unicode(la_noun_template)
    if new_style_headword_template:
      pagemsg("Found new-style noun headword template, skipping: %s" %
        orig_la_noun_template)
      continue

    def render_headword_and_decl():
      return "headword template <from> %s <to> %s <end>, declension template <from> %s <to> %s <end>" % (
        orig_la_noun_template, orig_la_noun_template,
        unicode(la_ndecl_template), unicode(la_ndecl_template)
      )

    if tname(la_noun_template) == "head":
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["head", "head1"], "head")
    elif tname(la_noun_template) == "la-location":
      explicit_head_param_head = [getparam(la_noun_template, "1")]
    else:
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["1", "head", "head1"], "head")
    lemma = explicit_head_param_head or [pagetitle]
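    # If the headword lemma contains [[...]] links, try to carry them over into
    # the declension template's lemma by re-wrapping the corresponding plain
    # text; skip the page if the links can't be mapped over unambiguously.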
    if "[[" in lemma[0]:
      if len(lemma) > 1:
        pagemsg("WARNING: Multiple lemmas %s and lemmas with links in them, can't handle, skipping: %s" % (
          ",".join(lemma), render_headword_and_decl()
        ))
        continue
      ndecl_lemma = getparam(la_ndecl_template, "1")
      if "[[" not in ndecl_lemma:
        must_continue = False
        for m in re.finditer(r"(\[\[.*?\]\])", lemma[0]):
          link = m.group(1)
          plainlink = blib.remove_links(link)
          if plainlink not in ndecl_lemma:
            pagemsg("WARNING: Can't interpolate link %s into declension template, skipping: %s" % (
              link, render_headword_and_decl()))
            must_continue = True
            break
          ndecl_lemma = ndecl_lemma.replace(plainlink, link, 1)
        if must_continue:
          continue
        new_ndecl_template = blib.parse_text(unicode(la_ndecl_template)).filter_templates()[0]
        new_ndecl_template.add("1", ndecl_lemma)
        pagemsg("Adding links to decl template %s to produce %s" % (
          unicode(la_ndecl_template), unicode(new_ndecl_template)))
        la_ndecl_template = new_ndecl_template

    noun_props = new_generate_noun_forms(unicode(la_ndecl_template), errandpagemsg, expand_text, include_props=True)
    if noun_props is None:
      continue
    decl_gender = noun_props.get("g", None)

    if tname(la_noun_template) == "head":
      noun_gender = blib.fetch_param_chain(la_noun_template, ["g", "g1"], "g")
      if not noun_gender and not decl_gender:
        pagemsg("WARNING: No gender in {{head|la|...}} and no declension gender, can't proceed, skipping: %s" % render_headword_and_decl())
        continue
    elif tname(la_noun_template) == "la-location":
      noun_gender = [getparam(la_noun_template, "4")]
    else:
      noun_gender = blib.fetch_param_chain(la_noun_template, ["3", "g", "g1"], "g")
      if not noun_gender:
        pagemsg("WARNING: No gender in old-style headword, skipping: %s" % render_headword_and_decl())
        continue

    def do_compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        adjust_for_missing_gen_forms=False, remove_headword_links=False):
      return compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        noun_props, render_headword_and_decl(), pagemsg,
        adjust_for_missing_gen_forms=adjust_for_missing_gen_forms,
        remove_headword_links=remove_headword_links)

    def check_headword_vs_decl_decls(regularized_noun_decl):
      must_continue = False
      decl_lemma = getparam(la_ndecl_template, "1") 
      if "((" in decl_lemma:
        pagemsg("WARNING: (( in decl_lemma, can't handle, skipping: %s" %
            render_headword_and_decl())
        must_continue = True
        return must_continue
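      # The decl lemma has the form "stem<decl.subtypes> ..."; extract the
      # declension number from each <...> spec and compare the resulting set
      # against the declensions declared in the headword template.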
      segments = re.split(r"([^<> -]+<[^<>]*>)", decl_lemma)
      decl_decls = []
      for i in xrange(1, len(segments) - 1, 2):
        m = re.search("^([^<> -]+)<([^<>]*)>$", segments[i])
        stem_spec, decl_and_subtype_spec = m.groups()
        decl_and_subtypes = decl_and_subtype_spec.split(".")
        decl_decl = decl_and_subtypes[0]
        decl_decls.append(decl_decl)
      if set(regularized_noun_decl) != set(decl_decls):
        if set(regularized_noun_decl) <= set(decl_decls):
          pagemsg("headword decl %s subset of declension decl %s, allowing: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls),
            render_headword_and_decl()))
        else:
          pagemsg("WARNING: headword decl %s not same as or subset of declension decl %s, skipping: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls),
            render_headword_and_decl()))
          must_continue = True
      return must_continue

    def check_headword_vs_decl_gender():
      must_continue = False
      if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
        need_explicit_gender = False
      else:
        need_explicit_gender = True
        if len(noun_gender) > 1:
          pagemsg("WARNING: Saw multiple headword genders %s, please verify: %s" % (
            ",".join(noun_gender), render_headword_and_decl()))
        elif (noun_gender and noun_gender[0].startswith("n") != (decl_gender == "n")):
          pagemsg("WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" % (
          noun_gender[0], decl_gender, render_headword_and_decl()))
          must_continue = True
      return need_explicit_gender, must_continue

    def erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender):
      # Erase all params
      del la_noun_template.params[:]
      # Copy params from decl template
      for param in la_ndecl_template.params:
        pname = unicode(param.name)
        la_noun_template.add(pname, param.value, showkey=param.showkey, preserve_spacing=False)
      # Add explicit gender if needed
      if need_explicit_gender:
        explicit_genders = []
        for ng in noun_gender:
          ng = ng[0]
          if ng not in explicit_genders:
            explicit_genders.append(ng)
        blib.set_param_chain(la_noun_template, explicit_genders, "g", "g")

    if tname(la_noun_template) == "head":
      if explicit_head_param_head and not do_compare_headword_decl_forms("lemma", explicit_head_param_head, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Check for extraneous {{head|la|...}} parameters
      must_continue = False
      is_proper_noun = getparam(la_noun_template, "2") == "proper noun"
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2"] or re.search("^(head|g)[0-9]*$", pname.strip()):
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{head}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      # Copy params from decl template
      blib.set_template_name(la_noun_template,
        "la-proper noun" if is_proper_noun else "la-noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{head|la|...}} to new-style {{la-noun}}/{{la-proper noun}} template")

    elif tname(la_noun_template) == "la-location":
      noun_decl = [getparam(la_noun_template, "6")]
      if not noun_decl:
        pagemsg("WARNING: No noun decl in {{la-location}}, skipping: %s" % render_headword_and_decl())
        continue
      genitive = [getparam(la_noun_template, "2")]
      if not do_compare_headword_decl_forms("lemma", lemma, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Check for extraneous {{la-location}} parameters
      must_continue = False
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4", "5", "6"]:
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{la-location}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      blib.set_template_name(la_noun_template, "la-proper noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-location}} to new-style {{la-proper noun}} template")

    else:
      # old-style {{la-noun}} or {{la-proper noun}}
      noun_decl = blib.fetch_param_chain(la_noun_template, ["4", "decl", "decl1"], "decl")
      if not noun_decl:
        pagemsg("WARNING: No noun decl in old-style headword, skipping: %s" % render_headword_and_decl())
        continue
      genitive = blib.fetch_param_chain(la_noun_template, ["2", "gen", "gen1"], "gen")
      if not do_compare_headword_decl_forms("lemma", lemma, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue

      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Fetch remaining params from headword template
      headword_params = []
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4"] or re.search("^(head|gen|g|decl)[0-9]*$", pname.strip()):
          continue
        headword_params.append((pname, param.value, param.showkey))
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      # Copy remaining params from headword template
      for name, value, showkey in headword_params:
        la_noun_template.add(name, value, showkey=showkey, preserve_spacing=False)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")

    subsections[k] = unicode(parsed)

  if not saw_a_template:
    pagemsg("WARNING: Saw no noun headword or declension templates")

  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
Exemplo n.º 38
0
def process_page_section(index, page, section, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
      for i in xrange(2, 10):
        if getparam(headword_template, "tr%s" % i):
          pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
          return None
      if runoun.arg1_is_stress(getparam(decl_template, "1")):
        lemma_arg = "2"
      else:
        lemma_arg = "1"
      lemmaval = getparam(decl_template, lemma_arg)
      if not lemmaval:
        lemmaval = subpagetitle
      if "//" in lemmaval:
        m = re.search("^(.*?)//(.*)$", lemmaval)
        if m.group(2) != headword_tr:
          pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
          return None
        else:
          pagemsg("Already found manual translit in decl template %s" %
              lemmaval)
      else:
        lemmaval += "//" + headword_tr
        orig_decl_template = unicode(decl_template)
        decl_template.add(lemma_arg, lemmaval)
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
        frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0

  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))

  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
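  # The above rewrites the {{ru-noun-table}} invocation as
  # {{ru-generate-noun-args}}, whose expansion is a machine-readable set of
  # forms/properties used below to check the old headword and pick genders.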
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)

  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders is None:
    return None

  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
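  # fix_old_headword_params() is expected to return the old headword params
  # worth carrying over to the new-style template, or None to skip the page.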
  if params_to_preserve is None:
    return None

  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1

  pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
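
# --- Hypothetical usage sketch (not part of the original script) -------------
# process_page_section() takes the text of one section and returns either None
# (meaning "skip") or (new_text, ru_noun_changed, ru_proper_noun_changed,
# bian_replaced, frobbed_manual_translit). The real driver presumably extracts
# the Russian section first; passing the whole page text here just shows the
# call shape, assuming a configured pywikibot login.
if __name__ == "__main__":
  import pywikibot
  site = pywikibot.Site("en", "wiktionary")
  page = pywikibot.Page(site, u"собака")  # hypothetical example page
  result = process_page_section(1, page, unicode(page.text), verbose=True)
  if result:
    new_text = result[0]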