Exemplo n.º 1
0
 def fixup_one_link(m):
   """Collapse a two-part link to a bare link when only accents differ.

   Strips any "#Russian" suffix and accents from the link target; if the
   accent-stripped display form equals the cleaned target, return [[display]],
   otherwise keep the two-part [[target|display]] form.
   """
   target, display = m.groups()
   # Accents must come off the target, cf. [[десе́ртный|десе́ртное]]
   target = ru.remove_accents(re.sub("#Russian$", "", target))
   if target == ru.remove_accents(display):
     return "[[%s]]" % display
   else:
     return "[[%s|%s]]" % (target, display)
Exemplo n.º 2
0
 def fixup_one_link(m):
     """Rewrite [[target|display]] as [[display]] when the pair differs only in accents."""
     link_target, link_text = m.groups()
     # Accents have to be stripped off the target, cf. [[десе́ртный|десе́ртное]]
     link_target = re.sub("#Russian$", "", link_target)
     link_target = rulib.remove_accents(link_target)
     same_word = rulib.remove_accents(link_text) == link_target
     return "[[%s]]" % link_text if same_word else "[[%s|%s]]" % (link_target, link_text)
def process_template(pagetitle, index, template, ruparam, trparam, output_line,
    find_accents, verbose):
  """Optionally add accents to one Cyrillic template param, logging the outcome.

  pagetitle: title of the page containing the template.
  index: page index, used only in log messages.
  template: parsed template object; modified in place when accents are applied.
  ruparam: name of the Cyrillic param to read; the string "page title" means
    use the page title itself; a two-element list means [param-to-read,
    param-to-save].
  trparam: name of the transliteration param, or falsy if none exists.
  output_line: callback receiving a one-line status string.
  find_accents: when true, look up an accented equivalent and apply it.
  verbose: passed through to find_accented for extra logging.

  Returns a one-element changelog list when the template was changed,
  otherwise False.  Relies on module-level semi_verbose, msg, blib,
  getparam, addparam, find_accented, check_need_accent and ru.
  """
  origt = unicode(template)
  # By default we save back to the same param we read from.
  saveparam = ruparam
  def pagemsg(text):
    msg("Page %s %s: %s" % (index, pagetitle, text))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, semi_verbose)
  if semi_verbose:
    pagemsg("Processing template: %s" % unicode(template))
  if unicode(template.name) == "head":
    # Skip {{head}}. We don't want to mess with headwords.
    return False
  if isinstance(ruparam, list):
    # [param-to-read, param-to-save]: read one value, write another.
    ruparam, saveparam = ruparam
  if ruparam == "page title":
    val = pagetitle
  else:
    val = getparam(template, ruparam)
  valtr = getparam(template, trparam) if trparam else ""
  changed = False
  if find_accents:
    newval, newtr = find_accented(val, valtr, verbose, pagemsg, expand_text,
        origt)
    if newval != val or newtr != valtr:
      # Safety check: accept the lookup result only if it differs from the
      # original purely in accents; otherwise we'd substitute a different word.
      if ru.remove_accents(newval) != ru.remove_accents(val):
        pagemsg("WARNING: Accented page %s changed from %s in more than just accents, not changing" % (newval, val))
      else:
        changed = True
        addparam(template, saveparam, newval)
        if newtr:
          if not trparam:
            pagemsg("WARNING: Unable to change translit to %s because no translit param available (Cyrillic param %s): %s" %
                (newtr, saveparam, origt))
          elif unicode(template.name) in ["ru-ux"]:
            # {{ru-ux}} is deliberately left alone: no translit is added there.
            pagemsg("WARNING: Not changing or adding translit param %s=%s to ru-ux: origt=%s" % (
              trparam, newtr, origt))
          else:
            if valtr and valtr != newtr:
              pagemsg("WARNING: Changed translit param %s from %s to %s: origt=%s" %
                  (trparam, valtr, newtr, origt))
            if not valtr:
              pagemsg("NOTE: Added translit param %s=%s to template: origt=%s" %
                  (trparam, newtr, origt))
            addparam(template, trparam, newtr)
        elif valtr:
          pagemsg("WARNING: Template has translit %s but lookup result has none, leaving translit alone: origt=%s" %
              (valtr, origt))
        if check_need_accent(newval):
          output_line("Need accents (changed)")
        else:
          output_line("Found accents")
  if not changed and check_need_accent(val):
    output_line("Need accents")
  if changed:
    pagemsg("Replaced %s with %s" % (origt, unicode(template)))
  return ["auto-accent %s%s" % (newval, "//%s" % newtr if newtr else "")] if changed else False
 def sort_aspect_pair(x, y):
   """Comparator for (perfective, imperfective) verb pairs.

   Primary key is the pair with accents stripped (so влить sorts before
   вли́ться); ties are broken on the accented forms so pairs like рассы́пать
   and рассыпа́ть order deterministically.
   """
   pf1, impf1 = x
   pf2, impf2 = y
   unaccented = compare_aspect_pair(
     ru.remove_accents(pf1), ru.remove_accents(impf1),
     ru.remove_accents(pf2), ru.remove_accents(impf2))
   if unaccented != 0:
     return unaccented
   return compare_aspect_pair(pf1, impf1, pf2, impf2)
 def sort_aspect_pair(x, y):
     """Compare (pf, impf) pairs: accentless comparison first, accented tiebreak.

     Comparing without accents puts влить before вли́ться; the accented
     fallback keeps e.g. рассы́пать / рассыпа́ть in a consistent order.
     """
     pf_x, impf_x = x
     pf_y, impf_y = y
     stripped = compare_aspect_pair(
         rulib.remove_accents(pf_x), rulib.remove_accents(impf_x),
         rulib.remove_accents(pf_y), rulib.remove_accents(impf_y))
     return stripped if stripped != 0 else compare_aspect_pair(
         pf_x, impf_x, pf_y, impf_y)
 def process_arg_set(arg_set):
     """Pull the lemma out of a declension arg set and check_lemma() each word.

     Skips an optional leading stress-pattern spec (e.g. "a" or "b',c"),
     strips the non-stressed "*" marker and any "//translit" suffix, then
     splits the lemma into words/bracketed links and validates separators.
     """
     if not arg_set:
         return
     pos = 0
     # A leading stress-pattern spec like "a", "b'" or "a,c" is not the lemma.
     if re.search(r"^[a-f]'*(,[a-f]'*)*$", arg_set[pos]):
         pos = 1
     if len(arg_set) <= pos:
         return
     lemma = re.sub(r"^\*", "", arg_set[pos])  # "*" marks non-stressed
     lemma = re.sub("//.*$", "", lemma)  # drop manual translit
     if not lemma:
         return
     pieces = re.split(r"(\[\[.*?\]\]|[^ \-]+)", lemma)
     if pieces[0] != "" or pieces[-1] != "":
         pagemsg(
             "WARNING: Found junk at beginning or end of headword, skipping: %s"
             % lemma)
         return
     wordind = 0
     # Captured words sit at odd indices; the separators follow at even ones.
     for i in xrange(1, len(pieces), 2):
         hword = pieces[i]
         sep = pieces[i + 1]
         if i < len(pieces) - 2 and sep != " " and sep != "-":
             pagemsg(
                 "WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>"
                 % (wordind + 1, hword, sep))
             continue
         hword = hword.replace("#Russian", "")
         check_lemma(rulib.remove_accents(blib.remove_right_side_links(hword)))
         wordind += 1
 def process_arg_set(arg_set):
   """Extract the lemma from an arg set and run check_lemma on every word in it."""
   if not arg_set:
     return
   # An optional leading stress-pattern spec (e.g. "a", "b'", "a,c") is skipped.
   idx = 1 if re.search(r"^[a-f]'*(,[a-f]'*)*$", arg_set[0]) else 0
   if len(arg_set) <= idx:
     return
   lemma = re.sub(r"^\*", "", arg_set[idx])  # leading * means non-stressed
   lemma = re.sub("//.*$", "", lemma)  # remove manual translit
   if not lemma:
     return
   parts = re.split(r"(\[\[.*?\]\]|[^ \-]+)", lemma)
   if parts[0] != "" or parts[-1] != "":
     pagemsg("WARNING: Found junk at beginning or end of headword, skipping: %s" % lemma)
     return
   wordind = 0
   for i in xrange(1, len(parts), 2):
     hword, separator = parts[i], parts[i + 1]
     if i < len(parts) - 2 and separator != " " and separator != "-":
       pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" %
           (wordind + 1, hword, separator))
       continue
     hword = hword.replace("#Russian", "")
     check_lemma(rulib.remove_accents(blib.remove_right_side_links(hword)))
     wordind += 1
Exemplo n.º 8
0
 def add_links(m):
     """Turn "Xо-Y" into "[[unaccented-adj|Xо]]-[[Y]]" wiki links.

     The first half's adjective form ends in -ий after velars (г/к/х),
     otherwise -ый; its accent-stripped form becomes the link target.
     """
     prefix = m.group(1)
     # Velar stems take the -ий ending; everything else takes -ый.
     ending = u"ий" if re.search(u"[гкх]о$", prefix) else u"ый"
     first = prefix[:-1] + ending
     return u"[[%s|%s]]-[[%s]]" % (
         rulib.remove_accents(first), prefix, m.group(2))
Exemplo n.º 9
0
def process_decl(index, pagetitle, decl, forms, save, verbose):
  """Delete erroneously created inflected-form pages for a lemma.

  index, pagetitle: identify the dictionary (lemma) page, for logging.
  decl: the {{ru-conj|...}} or {{ru-noun-table...}} declension template text.
  forms: form codes whose generated pages should be deleted.
  save: actually delete when true; otherwise only log what would be done.
  verbose: passed to blib.expand_text for extra logging.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  # Rewrite the declension template into its form-generating counterpart so
  # expanding it yields the full set of inflected forms.
  if decl.startswith("{{ru-conj|"):
    tempcall = re.sub(r"^\{\{ru-conj", "{{ru-generate-verb-forms", decl)
  elif decl.startswith("{{ru-noun-table"):
    tempcall = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args", decl)
  else:
    pagemsg("WARNING: Unrecognized decl template, skipping: %s" % decl)
    return

  result = expand_text(tempcall)
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  args = blib.split_generate_args(result)

  for form in forms:
    if form in args:
      # A form value may hold several comma-separated variants, each possibly
      # carrying a "//translit" suffix and accents.
      for formpagename in re.split(",", args[form]):
        formpagename = re.sub("//.*$", "", formpagename)
        formpagename = rulib.remove_accents(formpagename)
        formpage = pywikibot.Page(site, formpagename)
        if not formpage.exists():
          pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
        elif formpagename == pagetitle:
          pagemsg("WARNING: Attempt to delete dictionary form, skipping")
        else:
          text = unicode(formpage.text)
          if "Etymology 1" in text:
            # Multiple etymology sections imply hand-written content; too
            # risky to delete automatically.
            pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
          else:
            # Only delete pages that contain nothing but a Russian entry.
            skip_form = False
            for m in re.finditer(r"^==([^=]*?)==$", text, re.M):
              if m.group(1) != "Russian":
                pagemsg("WARNING: Found entry for non-Russian language %s, skipping form %s" %
                    (m.group(1), formpagename))
                skip_form = True
            if not skip_form:
              comment = "Delete erroneously created form of %s" % pagetitle
              if save:
                formpage.delete(comment)
              else:
                pagemsg("Would delete page %s with comment=%s" %
                    (formpagename, comment))
 def process_verb_headword(htemp):
     """Run check_lemma over every word in a verb headword template's 1= param.

     The param is tokenized into space-delimited words or [[...]]-bracketed
     sections; words still containing stray brackets after link cleanup are
     reported instead of checked.
     """
     tokens = re.split(r"([^\s\[\]]+|\[\[.*?\]\])", getparam(htemp, "1"))
     # Captured words land at odd indices; separators occupy the even slots.
     for word in tokens[1::2]:
         word = word.replace("#Russian", "")
         word = rulib.remove_accents(blib.remove_right_side_links(word))
         if "[" in word or "]" in word:
             pagemsg("WARNING: Found stray bracket in word %s in %s" %
                     (word, unicode(htemp)))
         else:
             check_lemma(word)
Exemplo n.º 11
0
def process_page(index, page, save, verbose):
  """Strip accents from the 1= param of {{wikipedia|...}} templates on a page.

  Saves the page (with an auto-built comment) when save is true, otherwise
  only logs what would be saved.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for tmpl in parsed.filter_templates():
    before = unicode(tmpl)
    if unicode(tmpl.name) == "wikipedia":
      accented = getparam(tmpl, "1")
      plain = ru.remove_accents(accented)
      if plain != accented:
        pagemsg("Removing accents from 1= in {{wikipedia|...}}")
        notes.append("remove accents from 1= in {{wikipedia|...}}")
        tmpl.add("1", plain)
    after = unicode(tmpl)
    if before != after:
      pagemsg("Replaced %s with %s" % (before, after))

  new_text = unicode(parsed)

  if new_text == text:
    return
  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Any textual change must have been recorded in notes.
  assert notes
  comment = "; ".join(notes)
  if save:
    pagemsg("Saving with comment = %s" % comment)
    page.text = new_text
    page.save(comment=comment)
  else:
    pagemsg("Would save with comment = %s" % comment)
Exemplo n.º 12
0
def process_page(index, page, save, verbose, nouns, adjectives):
  """Derive verbal nouns/adjectives from a verb's past passive participle.

  For each {{ru-conj}} on a non-reflexive verb page, derive the expected
  verbal noun (-ние/-тие), agent noun (-тель) and verbal adjective
  (-тельный) from the PPP, and emit "no-etym" lines via msg() for those
  that appear in `nouns`/`adjectives` but lack an etymology.  The page
  itself is not modified; `save` is accepted for interface consistency.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  # Reflexive verbs (ending in -ся/-сь) don't form these derivatives.
  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")
    return

  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-conj":
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      # Expand the form-generating counterpart to get all conjugated forms.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      if "infinitive" not in args: # e.g. обнимать
        pagemsg("WARNING: No infinitive")
        continue
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
        continue
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
        continue
      ppp = form_ppp(conjtype, pagetitle, args)
      if not ppp:
        continue
      # Map the PPP ending onto the corresponding verbal-noun and
      # verbal-adjective suffixes.
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      else:
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        # Keep the а/я vowel before the suffix in the suffix forms.
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      # Agent noun (-тель) is the verbal adjective minus the final -ный.
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      # Stressed suffix variants carry an acute accent on the first vowel.
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
      # Class 7/8 verbs need a stem spec with an alt1= override.
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
        else:
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
      else:
        stem = rulib.remove_monosyllabic_accents(infinitive)

      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          # find_noun returns -1 when an etymology section already exists.
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
        else:
          # Use the stressed suffix only if the attested noun actually has it.
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
          else:
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))

      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
        else:
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))

      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
        else:
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))
Exemplo n.º 13
0
def process_line(index, line, add_passive_of, override_etym, save, verbose):
  """Parse one "term etymology" directive line and add/replace the etymology
  section of the corresponding Russian Wiktionary entry.

  index: line index, used in log messages.
  line: whitespace-separated "accented_term etym" pair; "#"-lines are
    skipped; a leading "!" forces override_etym; "_" stands for a space and
    r"\\u" for a literal underscore.  The etym field supports several mini
    formats: "-" (request-for-etymology), "--" (none), "acr:", "deverb:",
    "back:", "raw:", "lang:borrowed", part/adj inflection specs, and
    "+"-joined affix specs.
  add_passive_of: when true, append a {{passive of|...}} sense line.
  override_etym: replace an existing ==Etymology== section instead of
    refusing to touch pages that already have one.
  save: actually save the page; otherwise log the would-be save.
  verbose: log full before/after page text.
  """
  def error(text):
    errmsg("ERROR: Processing line: %s" % line)
    errmsg("ERROR: %s" % text)
    assert False

  def check_stress(word):
    # Strip any "|..." suffix before checking.  BUGFIX: the previous pattern
    # was r"|.*" -- with the pipe unescaped, the first (empty) alternative
    # always matched and the substitution was a no-op.
    word = re.sub(r"\|.*", "", word)
    if word.startswith("-") or word.endswith("-"):
      # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
      return
    if rulib.needs_accents(word, split_dash=True):
      error("Word %s missing an accent" % word)

  # Skip lines consisting entirely of comments
  if line.startswith("#"):
    return
  if line.startswith("!"):
    override_etym = True
    line = line[1:]
  # If the second element (the etymology) begins with raw:, allow spaces in the remainder to be
  # included as part of the second element.
  els = do_split(r"\s+", line, 1)
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  if not els[1].startswith("raw:"):
    els = do_split(r"\s+", line)
  # Replace _ with space and \u
  els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  accented_term = els[0]
  term = rulib.remove_accents(accented_term)
  etym = els[1]

  # The page lives at the accentless form of the term.
  pagetitle = term

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  # Handle etymology
  # NOTE(review): adjformtext is built below but never used in this function;
  # presumably consumed in an earlier version or elsewhere -- confirm.
  adjformtext = ""
  if etym == "?":
    error("Etymology consists of bare question mark")
  elif etym == "-":
    etymtext = "===Etymology===\n{{rfe|lang=ru}}\n\n"
  elif etym == "--":
    etymtext = ""
  elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
    # Inflection-of spec: the "etymology" is really a participle/adjective
    # form definition for the given gender/number.
    m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
    forms = {"f":["nom|f|s"], "n":["nom|n|s", "acc|n|s"], "p":["nom|p", "in|acc|p"]}
    infleclines = ["# {{inflection of|lang=ru|%s||%s}}" %
        (m.group(3), form) for form in forms[m.group(2)]]
    # NOTE(review): headterm and trtext are not defined in this function;
    # they appear to be module-level globals -- confirm before relying on
    # this branch.
    if m.group(1) in ["adj", "partadj"]:
      adjinfltext = """===Adjective===
{{head|ru|adjective form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      adjinfltext = ""
    if m.group(1) in ["part", "partadj"]:
      partinfltext = """===Participle===
{{head|ru|participle form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      partinfltext = ""
    adjformtext = partinfltext + adjinfltext
    etymtext = ""
  else:
    if etym.startswith("acr:"):
      _, fullexpr, meaning = do_split(":", etym)
      etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
    elif etym.startswith("deverb:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
    elif etym.startswith("back:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
    elif etym.startswith("raw:"):
      etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
    elif ":" in etym and "+" not in etym:
      # Borrowing spec: "lang:source", optionally prefixed with "?" or "<<".
      if etym.startswith("?"):
        prefix = "Perhaps borrowed from "
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately borrowed from "
        etym = re.sub(r"^<<", "", etym)
      else:
        prefix = "Borrowed from "
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if not m:
        error("Bad etymology form: %s" % etym)
      etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
    else:
      # Affix spec: "+"-joined morphemes, optionally "?"/"<<"-prefixed and
      # optionally carrying a "lang:" prefix on the first morpheme.
      prefix = ""
      suffix = ""
      if etym.startswith("?"):
        prefix = "Perhaps from "
        suffix = "."
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately from "
        suffix = "."
        etym = re.sub(r"^<<", "", etym)
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if m:
        langtext = "|lang1=%s" % m.group(1)
        etym = m.group(2)
      else:
        langtext = ""
      etymtext = "%s{{affix|ru|%s%s}}%s" % (prefix,
          "|".join(do_split(r"\+", re.sub(", *", ", ", etym))), langtext,
          suffix)
    # NOTE(review): etymbody is only defined on this branch but is used below
    # when override_etym is set; etym values "-"/"--"/inflection specs would
    # hit a NameError there -- confirm intended usage.
    etymbody = etymtext + "\n\n"
    etymtext = "===Etymology===\n" + etymbody

  if not etymtext:
    # Deliberate fall-through: an empty etymology ("--"/inflection spec)
    # still proceeds, inserting nothing.
    pagemsg("No etymology text, skipping")

  # Load page
  page = pywikibot.Page(site, pagetitle)

  if not blib.try_repeatedly(lambda: page.exists(), pagemsg,
      "check page existence"):
    pagemsg("Page doesn't exist, can't add etymology")
    return

  pagemsg("Adding etymology")
  notes = []
  pagetext = unicode(page.text)

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Russian section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Russian":
      if override_etym:
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)

        # Replace the body of the first Etymology(/Etymology 1) subsection.
        replaced_etym = False
        for j in xrange(2, len(subsections), 2):
          if "==Etymology==" in subsections[j - 1] or "==Etymology 1==" in subsections[j - 1]:
            subsections[j] = etymbody
            replaced_etym = True
            break

        if replaced_etym:
          sections[i] = "".join(subsections)
          newtext = "".join(sections)
          notes.append("replace Etymology section in Russian lemma with manually specified etymology")
          break

      if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
        errandpagemsg("WARNING: Already found etymology, skipping")
        return

      subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)

      # Insert before the first subsection, but after any Alternative forms.
      insert_before = 1
      if "===Alternative forms===" in subsections[insert_before]:
        insert_before += 2

      subsections[insert_before] = etymtext + subsections[insert_before]
      sections[i] = "".join(subsections)
      if add_passive_of:
        # Append a {{passive of}} sense line after the existing definitions.
        active_term = rulib.remove_monosyllabic_accents(
          re.sub(u"с[яь]$", "", accented_term))
        sections[i] = re.sub(r"(^(#.*\n)+)",
          r"\1# {{passive of|lang=ru|%s}}\n" % active_term,
          sections[i], 1, re.M)

      newtext = pagehead + "".join(sections)
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return

  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    assert notes
    comment = "; ".join(group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
Exemplo n.º 14
0
def process_page(page, index, parsed):
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    override_pos = pages_pos.get(pagetitle, None)
    if override_pos:
        del pages_pos[pagetitle]

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    titlewords = split_words(pagetitle, True)
    saw_e = False
    for word in titlewords:
        if word.endswith(u"е") and not rulib.is_monosyllabic(word):
            saw_e = True
            break
    if not saw_e:
        pagemsg(u"No possible final unstressed -е in page title, skipping")
        return

    #if (" " in pagetitle or "-" in pagetitle) and not override_pos:
    #  pagemsg(u"WARNING: Space or hyphen in page title and probable final unstressed -е, not sure how to handle yet")
    #  return

    text = unicode(page.text)
    notes = []

    foundrussian = False
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for j in xrange(2, len(sections), 2):
        if sections[j - 1] == "==Russian==\n":
            if foundrussian:
                pagemsg(
                    "WARNING: Found multiple Russian sections, skipping page")
                return
            foundrussian = True

            subsections = re.split(
                "(^===(?:Etymology|Pronunciation) [0-9]+===\n)", sections[j],
                0, re.M)
            # If no separate etymology sections, add extra stuff at the beginning
            # to fit the pattern
            if len(subsections) == 1:
                subsections = ["", ""] + subsections

            subsections_with_ru_ipa_to_fix = set()
            subsections_with_ru_ipa = set()
            for k in xrange(0, len(subsections), 2):
                for t in blib.parse_text(subsections[k]).filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        subsections_with_ru_ipa.add(k)
                        if getparam(t, "pos"):
                            pagemsg(
                                "Already has pos=, skipping template in section %s: %s"
                                % (k // 2, unicode(t)))
                        else:
                            phon = (getparam(t, "phon") or getparam(t, "1")
                                    or pagetitle).lower()
                            phonwords = split_words(phon, True)
                            if len(phonwords) != len(titlewords):
                                pagemsg(
                                    "WARNING: #Words (%s) in phon=%s not same as #words (%s) in title"
                                    % ((len(phonwords) + 1) // 2, phon,
                                       (len(titlewords) + 1) // 2))
                                for i in xrange(0, len(phonwords), 2):
                                    phonword = phonwords[i]
                                    wordno = i // 2 + 1
                                    if rulib.is_monosyllabic(phonword):
                                        pagemsg(
                                            "Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                    elif not phonword.endswith(u"е"):
                                        pagemsg(
                                            u"Skipping pronun word %s (#%s) in section %s because doesn't end in -е"
                                            % (phonword, wordno, k // 2))
                                    else:
                                        pagemsg(
                                            "Found template that will be modified due to phonword %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                        subsections_with_ru_ipa_to_fix.add(k)
                            else:
                                for i in xrange(0, len(phonwords), 2):
                                    titleword = titlewords[i]
                                    phonword = phonwords[i]
                                    wordno = i // 2 + 1
                                    if rulib.is_monosyllabic(phonword):
                                        pagemsg(
                                            "Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                    elif not titleword.endswith(u"е"):
                                        pagemsg(
                                            u"Skipping title word %s (#%s) in section %s because doesn't end in -е"
                                            % (titleword, wordno, k // 2))
                                    elif re.search(
                                            u"([еия]|цы|е̂|[кгхцшжщч]а)" +
                                            rulib.DOTABOVE + "?$", phonword):
                                        pagemsg(
                                            "Found template that will be modified due to phonword %s, titleword %s (#%s) in section %s: %s"
                                            % (phonword, titleword, wordno,
                                               k // 2, unicode(t)))
                                        subsections_with_ru_ipa_to_fix.add(k)
                                    elif not re.search(
                                            u"[еэѐ][" + rulib.AC + rulib.GR +
                                            rulib.CFLEX + rulib.DUBGR + "]?$",
                                            phonword):
                                        pagemsg(
                                            u"WARNING: ru-IPA pronunciation word %s (#%s) doesn't end in [еэия] or е̂ or hard sibilant + [ыа] when corresponding titleword %s ends in -е, something wrong in section %s: %s"
                                            % (phonword, wordno, titleword,
                                               k // 2, unicode(t)))
                                    else:
                                        pagemsg(
                                            u"Pronun word %s (#%s) with final -э or stressed vowel, ignoring in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))

            if not subsections_with_ru_ipa:
                pagemsg("No ru-IPA on page, skipping page")
                return
            if not subsections_with_ru_ipa_to_fix:
                pagemsg("No fixable ru-IPA on page, skipping page")
                return

            # If saw ru-IPA covering multiple etym sections, make sure we don't
            # also have pronuns inside the etym sections, and then treat as one
            # single section for the purposes of finding POS's
            if 0 in subsections_with_ru_ipa:
                if len(subsections_with_ru_ipa) > 1:
                    pagemsg(
                        "WARNING: Saw ru-IPA in section 0 (covering multiple etym or pronun sections) and also inside etym/pronun section(s) %s; skipping page"
                        %
                        (",".join(k // 2
                                  for k in subsections_with_ru_ipa if k > 0)))
                    return
                subsections = ["", "", "".join(subsections)]
                subsections_with_ru_ipa_to_fix = {2}

            for k in subsections_with_ru_ipa_to_fix:
                pagemsg("Fixing section %s" % (k // 2))
                parsed = blib.parse_text(subsections[k])

                if override_pos:
                    pos = override_pos
                else:
                    pos = set()
                    is_lemma = set()
                    lemma = set()
                    saw_acc = False
                    saw_noun_form = False
                    for t in parsed.filter_templates():

                        def getp(param):
                            return getparam(t, param)

                        tname = unicode(t.name)
                        if tname in ["ru-noun", "ru-proper noun"]:
                            if getparam(t, "2") == "-":
                                pagemsg("Found invariable noun: %s" %
                                        unicode(t))
                                pos.add("inv")
                            else:
                                pagemsg("Found declined noun: %s" % unicode(t))
                                pos.add("n")
                            is_lemma.add(True)
                        elif tname in ["ru-noun+", "ru-proper noun+"]:
                            for param in t.params:
                                if re.search("^[0-9]+$", unicode(
                                        param.name)) and "+" in unicode(
                                            param.value):
                                    pagemsg(
                                        "Found declined adjectival noun, treating as adjective: %s"
                                        % unicode(t))
                                    pos.add("a")
                                    break
                            else:
                                pagemsg("Found declined noun: %s" % unicode(t))
                                pos.add("n")
                            is_lemma.add(True)
                        elif tname == "comparative of" and getp(
                                "lang") == "ru":
                            pagemsg("Found comparative: %s" % unicode(t))
                            pos.add("com")
                            is_lemma.add(False)
                        elif tname == "ru-adv":
                            pagemsg("Found adverb: %s" % unicode(t))
                            pos.add("adv")
                            is_lemma.add(True)
                        elif tname == "ru-adj":
                            pagemsg("Found adjective: %s" % unicode(t))
                            pos.add("a")
                            is_lemma.add(True)
                        elif tname == "ru-noun form":
                            pagemsg("Found noun form: %s" % unicode(t))
                            saw_noun_form = True
                            is_lemma.add(False)
                        elif tname == "head" and getp("1") == "ru":
                            if getp("2") == "verb form":
                                pagemsg("Found verb form: %s" % unicode(t))
                                pos.add("v")
                                is_lemma.add(False)
                            elif getp("2") in [
                                    "adjective form", "participle form"
                            ]:
                                pagemsg("Found adjective form: %s" %
                                        unicode(t))
                                pos.add("a")
                                is_lemma.add(False)
                            elif getp("2") == "noun form":
                                pagemsg("Found noun form: %s" % unicode(t))
                                saw_noun_form = True
                                is_lemma.add(False)
                            elif getp("2") == "pronoun form":
                                pagemsg("Found pronoun form: %s" % unicode(t))
                                pos.add("pro")
                                is_lemma.add(False)
                            elif getp("2") == "preposition":
                                pagemsg("Found preposition: %s" % unicode(t))
                                pos.add("p")
                                is_lemma.add(True)
                            elif getp("2") == "numeral":
                                pagemsg("Found numeral: %s" % unicode(t))
                                pos.add("num")
                                is_lemma.add(True)
                            elif getp("2") == "pronoun":
                                pagemsg("Found pronoun: %s" % unicode(t))
                                pos.add("pro")
                                is_lemma.add(True)
                        elif tname == "inflection of" and getp("lang") == "ru":
                            is_lemma.add(False)
                            lemma.add(rulib.remove_accents(getp("1")))
                            if saw_noun_form:
                                inflection_groups = []
                                inflection_group = []
                                for param in t.params:
                                    if param.name in ["1", "2"]:
                                        continue
                                    val = unicode(param.value)
                                    if val == ";":
                                        if inflection_group:
                                            inflection_groups.append(
                                                inflection_group)
                                            inflection_group = []
                                    else:
                                        inflection_group.append(val)
                                if inflection_group:
                                    inflection_groups.append(inflection_group)
                                for igroup in inflection_groups:
                                    igroup = set(igroup)
                                    is_plural = not not ({"p", "plural"}
                                                         & igroup)
                                    if is_plural and ({"nom", "nominative"}
                                                      & igroup):
                                        pagemsg(
                                            "Found nominative plural case inflection: %s"
                                            % unicode(t))
                                        pos.add("nnp")
                                    elif {"acc", "accusative"} & igroup:
                                        # We use "n" for misc cases, but skip accusative for now,
                                        # adding "n" later if we haven't seen nnp to avoid problems
                                        # below with the check for multiple pos's (nom pl and acc pl
                                        # are frequently the same)
                                        saw_acc = True
                                    elif not is_plural and (
                                        {"pre", "prep", "prepositional"}
                                            & igroup):
                                        pagemsg(
                                            "Found prepositional singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("pre")
                                    elif not is_plural and ({"dat", "dative"}
                                                            & igroup):
                                        pagemsg(
                                            "Found dative singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("dat")
                                    elif not is_plural and (
                                        {"loc", "locative"} & igroup):
                                        pagemsg(
                                            "Found locative singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("dat")
                                    elif not is_plural and (
                                        {"voc", "vocative"} & igroup):
                                        pagemsg(
                                            "Found vocative case inflection: %s"
                                            % unicode(t))
                                        pos.add("voc")
                                    else:
                                        pos.add("n")
                        elif tname == "prepositional singular of" and getp(
                                "lang") == "ru":
                            pagemsg(
                                "Found prepositional singular case inflection: %s"
                                % unicode(t))
                            pos.add("pre")
                            is_lemma.add(False)
                            lemma.add(getp("1"))
                        elif tname == "dative singular of" and getp(
                                "lang") == "ru":
                            pagemsg(
                                "Found dative singular case inflection: %s" %
                                unicode(t))
                            pos.add("dat")
                            is_lemma.add(False)
                            lemma.add(getp("1"))
                        elif tname == "vocative singular of" and getp(
                                "lang") == "ru":
                            pagemsg("Found vocative case inflection: %s" %
                                    unicode(t))
                            pos.add("voc")
                            is_lemma.add(False)
                            lemma.add(getp("1"))

                    if saw_acc and "nnp" not in pos:
                        pos.add("n")
                    if "dat" in pos and "pre" in pos:
                        pagemsg("Removing pos=dat because pos=pre is found")
                        pos.remove("dat")
                    if "com" in pos:
                        if "a" in pos:
                            pagemsg("Removing pos=a because pos=com is found")
                            pos.remove("a")
                        if "adv" in pos:
                            pagemsg(
                                "Removing pos=adv because pos=com is found")
                            pos.remove("adv")
                    if "a" in pos and "nnp" in pos:
                        pagemsg("Removing pos=nnp because pos=a is found")
                        pos.remove("nnp")
                    if not pos:
                        pagemsg(
                            "WARNING: Can't locate any parts of speech, skipping section"
                        )
                        continue
                    if len(pos) > 1:
                        pagemsg(
                            "WARNING: Found multiple parts of speech, skipping section: %s"
                            % ",".join(pos))
                        continue
                    pos = list(pos)[0]

                    # If multiword term or potential adjectival term, can't trust
                    # the part of speech coming from the above process
                    if (" " in pagetitle or "-" in pagetitle
                            or re.search(u"[ыиео]́?е$", pagetitle)):
                        if not is_lemma:
                            pagemsg(
                                "WARNING: Can't determine whether lemma or not, skipping section"
                            )
                            continue
                        if len(is_lemma) > 1:
                            pagemsg(
                                "WARNING: Found both lemma and non-lemma parts of speech, skipping section"
                            )
                            continue
                        is_lemma = list(is_lemma)[0]
                        if (" " in pagetitle or "-" in pagetitle) and is_lemma:
                            pagemsg(
                                u"WARNING: Space or hyphen in lemma page title and probable final unstressed -e, not sure how to handle yet, skipping section"
                            )
                            continue
                        # If is_lemma, we are a single-word adjective and will be handled
                        # correctly by the above code
                        if not is_lemma:
                            if not lemma:
                                pagemsg(
                                    "WARNING: Non-lemma form and can't determine lemma, skipping section"
                                )
                                continue
                            if len(lemma) > 1:
                                pagemsg(
                                    "WARNING: Found inflections of multiple lemmas, skipping section: %s"
                                    % ",".join(lemma))
                                continue
                            lemma = list(lemma)[0]
                            retval = find_noun_word_types(lemma, pagemsg)
                            if not retval:
                                continue
                            word_types, seen_pos_specs = retval
                            words = split_words(pagetitle, False)
                            assert len(words) == len(word_types)
                            modified_word_types = []
                            need_to_continue = False
                            # FIXME: Should we be using phonetic version of lemma?
                            for wordno, (word, ty) in enumerate(
                                    zip(words, word_types)):
                                if word.endswith(
                                        u"е"
                                ) and not rulib.is_monosyllabic(word):
                                    if ty == "inv":
                                        if len(seen_pos_specs) > 1:
                                            pagemsg(
                                                u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has ambiguous pos= params (%s), not sure what to do, skipping section"
                                                % (pagetitle, word,
                                                   ",".join(seen_pos_specs)))
                                            need_to_continue = True
                                            break
                                        elif not seen_pos_specs:
                                            pagemsg(
                                                u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has no pos= params, not sure what to do, skipping section"
                                                % (pagetitle, word))
                                            need_to_continue = True
                                            break
                                        else:
                                            seen_pos_spec = list(
                                                seen_pos_specs)[0]
                                            seen_poses = re.split(
                                                "/", seen_pos_spec)
                                            if len(seen_poses) == 1:
                                                ty = seen_poses[0]
                                            elif len(words) != len(seen_poses):
                                                pagemsg(
                                                    u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma param pos=%s has wrong number of parts of speech, not sure what to do, skipping section"
                                                    % (pagetitle, word,
                                                       seen_pos_spec))
                                                need_to_continue = True
                                                break
                                            else:
                                                ty = seen_poses[wordno]
                                                if not ty:
                                                    pagemsg(
                                                        "WARNING: Something wrong with retrieved pos= value from lemma, has blank value"
                                                    )
                                                    need_to_continue = True
                                                    break
                                    if ty == "decln":
                                        modified_word_types.append(pos)
                                    else:
                                        modified_word_types.append(ty)
                                else:
                                    modified_word_types.append("")
                            if need_to_continue:
                                continue
                            non_blank_distinct_mwt = set(
                                x for x in modified_word_types if x)
                            if len(non_blank_distinct_mwt) == 0:
                                pagemsg(
                                    "WARNING: Something wrong, pos= would end up blank"
                                )
                            elif len(non_blank_distinct_mwt) == 1:
                                pos = list(non_blank_distinct_mwt)[0]
                            else:
                                pos = "/".join(modified_word_types)

                # Check whether there's a pronunciation with final -е for a given
                # word. There are some entries that have multiple pronunciations,
                # one with final -е and one with something else, e.g. final -и,
                # and we want to leave those alone with a warning.
                saw_final_e = {}
                for t in parsed.filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        param = "phon"
                        phon = getparam(t, param)
                        if not phon:
                            param = "1"
                            phon = getparam(t, "1")
                            if not phon:
                                param = "pagetitle"
                                phon = pagetitle
                        if getparam(t, "pos"):
                            pass  # Already output msg
                        else:
                            phonwords = split_words(phon, True)
                            for i in xrange(0, len(phonwords), 2):
                                if re.search(u"е$", phonwords[i]):
                                    saw_final_e[i] = True

                # Now modify the templates.
                for t in parsed.filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        param = "phon"
                        phon = getparam(t, param)
                        if not phon:
                            param = "1"
                            phon = getparam(t, "1")
                            if not phon:
                                param = "pagetitle"
                                phon = pagetitle
                        origt = unicode(t)
                        if getparam(t, "pos"):
                            pass  # Already output msg
                        else:
                            phonwords = split_words(phon, True)
                            mismatched_phon_title = len(phonwords) != len(
                                titlewords)
                            for i in xrange(0, len(phonwords), 2):
                                titleword = not mismatched_phon_title and titlewords[
                                    i]
                                phonword = phonwords[i]
                                lphonword = phonword.lower()
                                wordno = i // 2 + 1

                                if rulib.is_monosyllabic(phonword):
                                    pass  # Already output msg
                                elif mismatched_phon_title:
                                    pass  # Can't canonicalize template
                                elif not titleword.endswith(u"е"):
                                    pass  # Already output msg
                                elif re.search(
                                        u"([еия]|цы|е̂|[кгхцшжщч]а)" +
                                        rulib.DOTABOVE + "?$", lphonword):
                                    # Found a template to modify
                                    if re.search(u"е" + rulib.DOTABOVE + "?$",
                                                 lphonword):
                                        pass  # No need to canonicalize
                                    else:
                                        if saw_final_e.get(i, False):
                                            pagemsg(
                                                u"WARNING: Found another pronunciation with final -е, skipping: phon=%s (word #%s)"
                                                % (phonword, wordno))
                                            continue
                                        if re.search(
                                                u"и" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in -и, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -и -> -е")
                                        elif re.search(u"е̂$", lphonword):
                                            # Make this a warning because we're not sure this is correct
                                            pagemsg(
                                                u"WARNING: phon=%s (word #%s) ends in -е̂, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(u"-е̂ -> -е")
                                        elif re.search(
                                                u"я" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in -я, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -я -> -е")
                                        elif re.search(
                                                u"цы" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in ц + -ы, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -ы after ц -> -е")
                                        elif re.search(
                                                u"[кгхцшжщч]а" +
                                                rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in unpaired cons + -а, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -а after unpaired cons -> -е"
                                            )
                                        else:
                                            assert False, "Something wrong, strange ending, logic not correct: section %s, phon=%s (word #%s)" % (
                                                k // 2, phonword, wordno)
                                        newphonword = re.sub(
                                            u"(?:[ияыа]|е̂)(" +
                                            rulib.DOTABOVE + "?)$", ur"е\1",
                                            phonword)
                                        newphonword = re.sub(
                                            u"(?:[ИЯЫА]|Е̂)(" +
                                            rulib.DOTABOVE + "?)$", ur"Е\1",
                                            newphonword)
                                        pagemsg(
                                            "Modified phon=%s (word #%s) to %s in section %s: %s"
                                            % (phonword, wordno, newphonword,
                                               k // 2, unicode(t)))
                                        phonwords[i] = newphonword
                            newphon = "".join(phonwords)
                            if newphon != phon:
                                assert param != "pagetitle", u"Something wrong, page title should not have -и or similar that needs modification: section %s, phon=%s, newphon=%s" % (
                                    k // 2, phon, newphon)
                                if pos in ["voc", "inv", "pro"]:
                                    pagemsg(
                                        u"WARNING: pos=%s may be unstable or inconsistent in handling final -е, please check change of phon=%s to %s in section %s: %s"
                                        % (pos, phon, newphon, k // 2,
                                           unicode(t)))
                                pagemsg(
                                    "Modified phon=%s to %s in section %s: %s"
                                    % (phon, newphon, k // 2, unicode(t)))
                                if pos == "none":
                                    pagemsg(
                                        "WARNING: pos=none, should not occur, not modifying phon=%s to %s in section %s: %s"
                                        % (phon, newphon, k // 2, unicode(t)))
                                else:
                                    t.add(param, newphon)

                            if pos == "none":
                                pagemsg(
                                    "WARNING: pos=none, should not occur, not setting pos= in section %s: %s"
                                    % (k // 2, unicode(t)))
                            else:
                                t.add("pos", pos)
                                notes.append(
                                    "added pos=%s%s" %
                                    (pos, override_pos and " (override)"
                                     or ""))
                                pagemsg(
                                    "Replaced %s with %s in section %s%s" %
                                    (origt, unicode(t), k // 2, override_pos
                                     and " (using override)" or ""))
                subsections[k] = unicode(parsed)
            sections[j] = "".join(subsections)

    new_text = "".join(sections)

    def fmt_key_val(key, val):
        """Format a change-note *key* with its occurrence count *val*,
        omitting the count when it is 1 (e.g. "added pos=n (2)" vs
        "added pos=a")."""
        return "%s" % key if val == 1 else "%s (%s)" % (key, val)

    if new_text != text:
        assert notes
        # Group identical notes together and append the number of such identical
        # notes if > 1, putting 'added pos=X' notes before others, so we get e.g.
        # "added pos=n (2); added pos=a; unstressed -и -> -е (2)" from five
        # original notes.
        # 1. Count items in notes[] and return a key-value list in descending order
        notescount = Counter(notes).most_common()
        # 2. Extract 'added pos=X' items; we put them first; note, descending order
        #    of # of times each note has been seen is maintained
        added_pos = [(x, y) for x, y in notescount
                     if x.startswith("added pos=")]
        # 3. Extract other items
        not_added_pos = [(x, y) for x, y in notescount
                         if not x.startswith("added pos=")]
        # 4. Recreate notes for 'added pos=X', then others
        notes = [fmt_key_val(x, y) for x, y in added_pos]
        notes.extend([fmt_key_val(x, y) for x, y in not_added_pos])

        return new_text, notes
Exemplo n.º 15
0
    els = do_split(r"\s+", line)

    if len(els) == 2 and els[1].startswith("altyo:"):
        altyoparts = do_split(":", els[1])
        if len(altyoparts) != 3:
            error("Expected verb and aspect with altyo:")
        yoline = u"{{ru-verb-alt-ё|%s|%s}}" % (altyoparts[1], altyoparts[2])
        msg("""%s

==Russian==

===Verb===
%s


""" % (rulib.remove_accents(altyoparts[1]).replace(u"ё", u"е"), yoline))
        continue

    # Replace _ with space, but not in the conjugation, where param names
    # may well have an underscore in them; but allow \s to stand for a space in
    # the conjugation, and \u to stand for an underscore elsewhere.
    els = [
        el.replace(r"\s", " ") if i == 4 else el.replace("_", " ").replace(
            r"\u", "_") for i, el in enumerate(els)
    ]
    if len(els) < 5:
        error("Expected five fields, saw only %s" % len(els))
    verb, etym, aspect, corverbs, conj = els[0], els[1], els[2], els[3], els[4]
    translit = None
    declverb = verb
    if "//" in verb:
Exemplo n.º 16
0
def normalize_text(text):
    """Normalize wikitext for comparison: strip wiki links, remove Russian
    accent marks, and drop bold ('''...''') markup."""
    unlinked = blib.remove_links(text)
    unaccented = rulib.remove_accents(unlinked)
    return unaccented.replace("'''", "")
Exemplo n.º 17
0
# Command-line options: by default we operate on derived adjectives;
# --nouns and --adverbs select a different derived-lemma category.
parser.add_argument("--nouns",
                    action='store_true',
                    help="Do derived nouns instead of adjectives")
parser.add_argument("--adverbs",
                    action='store_true',
                    help="Do derived adverbs")
parser.add_argument("--base-lemmafile", help="File containing base lemmas")
parser.add_argument("--derived-lemmafile",
                    help="File containing derived lemmas")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

# Collect the derived lemmas, either from an explicit file (one lemma per
# line; accent marks are stripped) or from the corresponding Wiktionary
# category, chosen by the --adverbs/--nouns flags.
derived_lemmas = []
if args.derived_lemmafile:
    derived_lemmas = [
        rulib.remove_accents(x.strip())
        for x in codecs.open(args.derived_lemmafile, "r", "utf-8")
    ]
else:
    for i, page in blib.cat_articles(
            "Russian adverbs" if args.adverbs else "Russian nouns" if args.
            nouns else "Russian adjectives"):
        derived_lemmas.append(page.title())

# Process each base-lemma page against the collected derived lemmas.
# NOTE(review): there is no visible else-branch when --base-lemmafile is
# absent — presumably handled elsewhere in the file; confirm.
if args.base_lemmafile:
    for i, pagename in blib.iter_items([
            rulib.remove_accents(x.strip())
            for x in codecs.open(args.base_lemmafile, "r", "utf-8")
    ]):
        page = pywikibot.Page(site, pagename)
        process_page(i, page, args.save, args.verbose, derived_lemmas)
Exemplo n.º 18
0
def process_page(index, page, lemmas):
    """Analyze the Russian verb entry on `page` and log etymology analyses.

    For each {{ru-verb}} headword, detects (a) reflexive verbs that are
    passives of a transitive base, (b) imperfectives paired with a
    same-prefix perfective, and (c) prefixed perfectives whose unprefixed
    base is in `lemmas`, and emits "no-etym" analysis lines via msg().
    Returns nothing; all output goes through msg()/errandmsg().
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    pagetext = unicode(page.text)

    section = blib.find_lang_section_from_text(pagetext, "Russian", pagemsg)
    if not section:
        errandpagemsg("WARNING: Couldn't find Russian section")
        return

    # Skip entries that already have an etymology, use alternative-ё
    # spellings, or are participles.
    if "==Etymology" in section:
        return
    if rulib.check_for_alt_yo_terms(section, pagemsg):
        return
    parsed = blib.parse_text(section)
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-participle of"]:
            pagemsg("Skipping participle")
            return
    saw_verb = False
    saw_passive = False
    saw_bad_passive = False
    for t in parsed.filter_templates():
        if unicode(t.name) in ["passive of", "passive form of"]:
            saw_passive = True
    # "Bad passive": the section mentions a passive relationship in plain
    # text without using one of the recognized templates.
    if not saw_passive and ("passive of" in section
                            or "passive form of" in section):
        saw_bad_passive = True
    # Each split is (derived heads, base terms, analysis string, comment).
    splits = []
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-verb":
            saw_verb = True
            saw_paired_verb = False
            printed_msg = False
            heads = blib.fetch_param_chain(t, "1", "head") or [pagetitle]
            refl = heads[0].endswith(u"ся") or heads[0].endswith(u"сь")
            if refl:
                # Split the verb into stem + reflexive suffix (-ся/-сь).
                m = re.search(u"^(.*)(с[яь])$", heads[0])
                assert m
                transverb_no_passive = (False if
                                        (saw_passive or saw_bad_passive) else
                                        is_transitive_verb(
                                            rulib.remove_accents(m.group(1)),
                                            pagemsg, errandpagemsg))
                if (saw_passive or saw_bad_passive or transverb_no_passive):
                    splits.append(
                        (heads, [m.group(1)
                                 ], "%s+-%s" % (m.group(1), m.group(2)),
                         "active-passive%s%s" %
                         (saw_bad_passive and " (saw-bad-passive)" or "",
                          transverb_no_passive and " (missing-passive-decl)"
                          or "")))
                    continue
            if getparam(t, "2").startswith("impf"):
                # Imperfective: look for a perfective partner that shares
                # the first two letters.
                pfs = blib.fetch_param_chain(t, "pf", "pf")
                for otheraspect in pfs:
                    if heads[0][0:2] == otheraspect[0:2]:
                        saw_paired_verb = True
                if saw_paired_verb:
                    splits.append((heads, pfs, ",".join(pfs), "paired-impf"))
                    printed_msg = True
            if getparam(t, "2").startswith("pf"):
                # Perfective: try to strip a known verbal prefix and find
                # the remaining base verb among the known lemmas.
                prefixes = [
                    u"взъ", u"вз", u"вс", u"возъ", u"воз", u"вос", u"вы́",
                    u"въ", u"в", u"до", u"за", u"изъ", u"из", u"ис", u"на",
                    u"объ", u"об", u"отъ", u"от", u"о", u"пере", u"подъ",
                    u"под", u"по", u"предъ", u"пред", u"пре", u"при", u"про",
                    u"разъ", u"раз", u"рас", u"съ", u"с", u"у"
                ]
                for break_reflexives in [False, True]:
                    head = heads[0]
                    if break_reflexives:
                        if not head.endswith(u"ся") and not head.endswith(
                                u"сь"):
                            break
                        reflsuf = "+-" + head[-2:]  # fetch reflexive suffix
                        head = head[:-2]  # drop reflexive suffix
                    else:
                        reflsuf = ""
                    for prefix in prefixes:
                        m = re.match("^(%s)(.*)$" % prefix, head)
                        if m:
                            # Undo the ы↔и prefix alternation and strip a
                            # stray accent on a monosyllabic base.
                            base = rulib.remove_monosyllabic_accents(
                                re.sub(u"^ы", u"и", m.group(2)))
                            if rulib.remove_accents(base) in lemmas:
                                base_to_do = base
                            elif rulib.remove_accents("-" + base) in lemmas:
                                base_to_do = "-" + base
                            else:
                                base_to_do = None
                            if base_to_do:
                                prefix = prefix.replace(u"ъ", "")
                                # вы́- is always stressed, so the base needs
                                # an accent restored later.
                                if m.group(1) == u"вы́":
                                    need_accent = "-NEED-ACCENT"
                                else:
                                    need_accent = ""
                                splits.append((
                                    heads, [base_to_do], "%s-+%s%s%s" %
                                    (prefix, base_to_do, reflsuf, need_accent),
                                    "strip-prefix"))
                                printed_msg = True
            if not printed_msg:
                msg("%s no-etym misc" % ",".join(heads))
    for derived_terms, base_terms, analysis, comment in splits:
        warnings = []
        # Deduplicate the accentless forms of the base terms.
        base_terms_no_accent = []
        for term in base_terms:
            term = rulib.remove_accents(term)
            if term not in base_terms_no_accent:
                base_terms_no_accent.append(term)
        if len(base_terms_no_accent) > 1:
            errandpagemsg(
                "WARNING: Multiple base pages %s for base lemmas %s" %
                (",".join(base_terms_no_accent), ",".join(base_terms)))
            continue
        if base_terms_no_accent[0] not in lemmas:
            continue
        derived_defns = rulib.find_defns(section)
        if not derived_defns:
            errandpagemsg(
                "WARNING: Couldn't find definitions for derived term %s" %
                ",".join(derived_terms))
            continue
        base_section = blib.find_lang_section(base_terms_no_accent[0],
                                              "Russian", pagemsg,
                                              errandpagemsg)
        if not base_section:
            errandpagemsg(
                "WARNING: Couldn't find Russian section for base term %s" %
                base_terms_no_accent[0])
            continue
        base_defns = rulib.find_defns(base_section)
        if not base_defns:
            errandpagemsg(
                "WARNING: Couldn't find definitions for base term %s" %
                ",".join(base_terms))
            continue

        def concat_defns(defns):
            # Encode defns for the one-line output format: ";"-joined,
            # with literal underscores escaped as \u and spaces as _.
            return ";".join(defns).replace("_", r"\u").replace(" ", "_")

        msg("%s %s%s no-etym %s %s //// %s" %
            (",".join(derived_terms), analysis,
             " WARNING:%s" % ",".join(warnings) if warnings else "", comment,
             concat_defns(base_defns), concat_defns(derived_defns)))
    if not saw_verb:
        msg("%s no-etym misc" % pagetitle)
Exemplo n.º 19
0
    def find_decl_args(lemma, infl, wordind):
        """Look up declension arguments for LEMMA with inflected form INFL,
        word #WORDIND of a multiword expression.

        Returns a tuple (list of (name, value) param tuples, is_adjective,
        n= value or None, a= value or None), or None/implicit None when no
        usable declension can be located.  NOTE(review): this excerpt ends
        at the adjectival branch; the ru-noun-table handling that follows
        in the full source is not shown here.
        """
        declpage = pywikibot.Page(site, lemma)
        # Plain link if the accentless inflection equals the lemma,
        # otherwise a piped link [[lemma|infl]].
        if rulib.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            # No page: assume adjectival if the lemma looks like an
            # adjective, otherwise give up on this word.
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        # Scan the declension page for decl, headword and legacy z-decl
        # templates.
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" %
                        unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" %
                        unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s" %
                            unicode(decl_z_template))
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            %
                            (wordind, lemma, infl, unicode(headword_template),
                             unicode(decl_z_template)))
                    # Convert the legacy z-decl into a ru-noun-table call.
                    decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                        decl_z_template,
                        subpagetitle,
                        pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]

            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"
            ]:
                # Indeclinable noun: "$" declension spec.
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                # Not obviously adjectival: consult the manual override
                # tables, else give up.
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" %
                unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            # Adjectival: use +ь for soft-stem declensions.
            if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None
Exemplo n.º 20
0
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    origtext = page.text
    parsed = blib.parse_text(origtext)

    # Find the declension arguments for LEMMA and inflected form INFL,
    # the WORDINDth word in the expression. Return value is a tuple of
    # four items: a list of (NAME, VALUE) tuples for the arguments, whether
    # the word is an adjective, the value of n= (if given), and the value
    # of a= (if given).
    def find_decl_args(lemma, infl, wordind):
        declpage = pywikibot.Page(site, lemma)
        if rulib.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" %
                        unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" %
                        unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s" %
                            unicode(decl_z_template))
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            %
                            (wordind, lemma, infl, unicode(headword_template),
                             unicode(decl_z_template)))
                    decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                        decl_z_template,
                        subpagetitle,
                        pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]

            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"
            ]:
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" %
                unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None

        # ru-noun-table
        assert unicode(decl_template.name) == "ru-noun-table"

        # Split out the arg sets in the declension and check the
        # lemma of each one, taking care to handle cases where there is no lemma
        # (it would default to the page name).

        highest_numbered_param = 0
        for p in decl_template.params:
            pname = unicode(p.name)
            if re.search("^[0-9]+$", pname):
                highest_numbered_param = max(highest_numbered_param,
                                             int(pname))

        # Now gather the numbered arguments into arg sets. Code taken from
        # ru-noun.lua.
        offset = 0
        arg_sets = []
        arg_set = []
        for i in xrange(1, highest_numbered_param + 2):
            end_arg_set = False
            val = getparam(decl_template, str(i))
            if i == highest_numbered_param + 1:
                end_arg_set = True
            elif val == "_" or val == "-" or re.search("^join:", val):
                pagemsg(
                    "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
            elif val == "or":
                end_arg_set = True

            if end_arg_set:
                arg_sets.append(arg_set)
                arg_set = []
                offset = i
            else:
                arg_set.append(val)

        canon_infl = rulib.remove_accents(infl).lower()
        canon_lemma = lemma.lower()
        ispl = False
        need_sc1 = False
        found_gender = None
        if canon_infl != canon_lemma:
            for sgend, plend, gender, is_sc1 in pl_data:
                if sgend:
                    check_sgend = sgend
                else:
                    check_sgend = consonant_re
                if re.search(check_sgend + "$",
                             canon_lemma) and canon_infl == re.sub(
                                 sgend + "$", plend, canon_lemma):
                    ispl = True
                    found_gender = gender
                    need_sc1 = is_sc1
                    break
            else:
                pagemsg(
                    "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        # Substitute the wordlink for any lemmas in the declension.
        # If plural, also add gender and verify special case (1) as necessary.
        # Concatenate all the numbered params, substituting the wordlink into
        # the lemma as necessary.
        numbered_params = []
        for arg_set in arg_sets:
            lemma_arg = 0
            if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]):
                lemma_arg = 1
            if len(arg_set) <= lemma_arg:
                arg_set.append("")
            arglemma = arg_set[lemma_arg]
            manualtr = ""
            if "//" in arglemma:
                arglemma, manualtr = re.search("^(.*?)(//.*?)$",
                                               arglemma).groups()
            if (not arglemma or arglemma.lower() == infl.lower()
                    or rulib.is_monosyllabic(infl)
                    and rulib.remove_accents(arglemma).lower()
                    == rulib.remove_accents(infl).lower() or ispl and
                    rulib.remove_accents(arglemma).lower() == lemma.lower()):
                arg_set[lemma_arg] = wordlink + manualtr
            else:
                pagemsg(
                    "WARNING: Can't sub word link %s into decl lemma %s%s" %
                    (wordlink, arg_set[lemma_arg], ispl and ", skipping"
                     or ""))
                if ispl:
                    return None

            if ispl:
                # Add the gender
                if len(arg_set) <= lemma_arg + 1:
                    arg_set.append("")
                declarg = arg_set[lemma_arg + 1]

                # First, sub in gender
                m = re.search("(3f|[mfn])", declarg)
                if found_gender == "mf":
                    if not m:
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    decl_gender = m.group(1)
                    if decl_gender == "n":
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    elif decl_gender in ["m", "3f"]:
                        pagemsg(
                            u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                            % (decl_gender, wordind, lemma, infl))
                    else:
                        assert gender == "f"
                        pagemsg(
                            u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        declarg = re.sub("f", "3f", declarg, 1)
                else:
                    if m:
                        decl_gender = m.group(1)
                        if decl_gender == found_gender:
                            pagemsg(
                                "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                                % (found_gender, wordind, lemma, infl))
                        else:
                            pagemsg(
                                "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                                % (decl_gender, wordind, found_gender, lemma,
                                   infl))
                            declarg = re.sub("(3f|[mfn])", found_gender,
                                             declarg, 1)
                    else:
                        pagemsg(
                            "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                            % (wordind, found_gender, lemma, infl))
                        declarg = found_gender + declarg

                # Now check special case 1
                if need_sc1 != ("(1)" in declarg):
                    if need_sc1:
                        pagemsg(
                            "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None
                    else:
                        pagemsg(
                            "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None

                arg_set[lemma_arg + 1] = declarg

            if numbered_params:
                numbered_params.append("or")
            numbered_params.extend(arg_set)

        # Now gather all params, including named ones.
        params = []
        params.extend(
            (str(i + 1), val)
            for i, val in zip(xrange(len(numbered_params)), numbered_params))
        num = None
        anim = None
        for p in decl_template.params:
            pname = unicode(p.name)
            val = unicode(p.value)
            if pname == "a":
                anim = val
            elif pname == "n":
                num = val
            elif pname == "notes":
                params.append((pname, val))
            elif pname == "title":
                pagemsg(
                    "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
                    % (wordind, lemma, infl, val))
            elif re.search("^[0-9]+$", pname):
                pass
            else:
                keepparam = True
                if pname == "loc":
                    if pagetitle in keep_locative:
                        pagemsg(
                            "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    else:
                        pagemsg(
                            "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                        keepparam = False
                if pname == "par":
                    pagemsg(
                        "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if pname == "voc":
                    pagemsg(
                        "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if keepparam:
                    if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
                        pagemsg(
                            u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    pname += str(wordind)
                    params.append((pname, val))
Exemplo n.º 21
0
    # has пистолет-пулемёт given as a single entry. We have a check below
    # to try to catch this case, because no inflected nouns will show up.
    for i in xrange(1, len(headwords_separators), 2):
        hword = headwords_separators[i]
        separator = headwords_separators[i + 1]
        if i < len(headwords_separators
                   ) - 2 and separator != " " and separator != "-":
            pagemsg(
                "WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>"
                % (wordind + 1, hword, separator))
            return
        # Canonicalize link in headword
        m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", hword)
        if m:
            lemma, infl = m.groups()
            lemma = rulib.remove_accents(re.sub("#Russian$", "", lemma))
            if lemma == rulib.remove_accents(infl):
                hword = "[[%s]]" % infl
            else:
                hword = "[[%s|%s]]" % (lemma, infl)
        headwords.append(hword)
        separators.append(separator)
        wordind += 1

    pagemsg("Found headwords: %s" % " @@ ".join(headwords))

    # Get headword genders (includes animacy and number)
    genders = blib.fetch_param_chain(headword_template, "2", "g")
    genders_include_pl = len([x
                              for x in genders if re.search(r"\bp\b", x)]) > 0
Exemplo n.º 22
0
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re, sys, codecs, argparse

from blib import msg, errmsg
import rulib

parser = argparse.ArgumentParser(
    description="Find lemmas which would have forms saved.")
parser.add_argument('--direcfile', help="File containing directives.")
args = parser.parse_args()

lemmas = set()

for line in codecs.open(args.direcfile, "r", "utf-8"):
    line = line.strip()
    if "Would save with comment" in line:
        m = re.search(
            "Would save with comment.* (?:of|dictionary form) (.*?)(,| after| before| \(add| \(modify| \(update|$)",
            line)
        if not m:
            errmsg("WARNING: Unable to parse line: %s" % line)
        else:
            lemmas.add(rulib.remove_accents(m.group(1)))
for lemma in sorted(lemmas):
    print lemma.encode('utf-8')
Exemplo n.º 23
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re, codecs, argparse
from blib import msg
import rulib

# Read a frequency list, strip everything after the first space plus any
# accent marks, and emit one word per line.  Entries of the form
# "imperfective/perfective[/...]" are expanded: the imperfective is printed
# first, then each perfective; a trailing "-" on a perfective is replaced
# by the imperfective (prefix notation, e.g. "делать/с-" -> "сделать").
parser = argparse.ArgumentParser(description="Make bare and list versions of 10,000-word frequency list from the Internet.")
parser.add_argument('--file', help="File containing original list.")
args = parser.parse_args()

for line in codecs.open(args.file, "r", "utf-8"):
  word = rulib.remove_accents(re.sub(" .*", "", line.strip()))
  if "/" not in word:
    msg(word)
    continue
  els = re.split("/", word)
  impf = els[0]
  msg(impf)
  for pf in els[1:]:
    msg(re.sub("-$", impf, pf) if pf.endswith("-") else pf)
Exemplo n.º 24
0
            assert pos in pos_to_full_pos
            fullpos = pos_to_full_pos[pos]
            if len(altyoparts) == 2:
                yoline = u"{{ru-pos-alt-ё|%s|%s}}" % (altyoparts[1],
                                                      fullpos.lower())
            else:
                error("With misc. part of speech, gender/aspect not supported")
        msg("""%s

==Russian==

===%s===
%s


""" % (rulib.remove_accents(altyoparts[1]).replace(
            u"ё", u"е"), pos_to_full_pos[pos], yoline))
        continue

    # Replace _ with space, but not in the declension, where there may be
    # an underscore, e.g. a|short_m=-; but allow \s to stand for a space in
    # the declension, and \u for underscore elsewhere
    els = [
        el.replace(r"\s", " ") if i == 2 and (pos in ["n", "pn", "adj"]) else
        el.replace("_", " ").replace(r"\u", "_") for i, el in enumerate(els)
    ]
    if pos not in ["n", "pn", "adj"]:
        term, etym, defns = els[0], els[1], els[2]
        remainder = els[3:]
    else:
        if len(els) < 4:
            error("Expected four fields, saw only %s" % len(els))
Exemplo n.º 25
0
def process_page(index, page, direc, delete_bad, fix_verbs, save, verbose):
  """Fix up class-3olda verb conjugations on PAGE, converting the conjugation
  type to DIREC (with "3oa" normalized to 3°a).  If DELETE_BAD, delete
  erroneously created past-form pages that no longer exist under the new
  conjugation; if FIX_VERBS and SAVE, save the corrected page.  INDEX is
  used only for logging; VERBOSE enables extra logging.

  NOTE: currently disabled -- logs a warning and returns immediately."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  direc = direc.replace("3oa", u"3°a")
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj"]:
      conjtype = getparam(t, "1")
      if not conjtype.startswith("3olda"):
        continue
      if conjtype.startswith("3olda") and conjtype != "3olda":
        pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
        continue
      # Expand the unmodified template to capture the old forms.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      oldargs = rulib.split_generate_args(result)
      rmparam(t, "6")
      rmparam(t, "5")
      rmparam(t, "4")
      t.add("1", direc)
      # Re-expand after the change to see which past forms disappeared.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      if delete_bad:
        newargs = rulib.split_generate_args(result)
        for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
            "past_f_short", "past_n_short", "past_pl_short"]:
          oldforms = re.split(",", oldargs[form]) if form in oldargs else []
          newforms = re.split(",", newargs[form]) if form in newargs else []
          for oldform in oldforms:
            if oldform not in newforms:
              formpagename = rulib.remove_accents(oldform)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
              else:
                # BUG FIX: use a distinct name here; the original assigned to
                # 'text', clobbering the page text compared against new_text
                # below and breaking the save check.
                form_text = unicode(formpage.text)
                if "Etymology 1" in form_text:
                  pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                elif "----" in form_text:
                  pagemsg("WARNING: Multiple languages apparently in form, skipping form %s" % formpagename)
                else:
                  numinfls = len(re.findall(r"\{\{inflection of\|", form_text))
                  if numinfls < 1:
                    pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                  elif numinfls > 1:
                    pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                  else:
                    comment = "Delete erroneously created long form of %s" % pagetitle
                    pagemsg("Existing text for form %s: [[%s]]" % (
                      formpagename, form_text))
                    if save:
                      formpage.delete(comment)
                    else:
                      pagemsg("Would delete page %s with comment=%s" %
                          (formpagename, comment))

      notes.append("fix 3olda -> %s" % direc)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text and fix_verbs:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Exemplo n.º 26
0
def process_page(index, page, save, verbose, adverbs, all_derived_lemmas):
    """Find terms derived from the term on PAGE by suffixation (-ный, -ка,
    -овый, -ик, etc.; adverbial -о/-е/-и when ADVERBS) among
    ALL_DERIVED_LEMMAS, and output candidate etymology lines pairing base
    lemmas with derived lemmas and the connecting (possibly stressed)
    suffix.  Derived terms that already have an etymology are skipped.
    INDEX is used for logging; VERBOSE enables extra logging; SAVE is not
    referenced in this function."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    # ending and whether final consonant is palatal
    endings = [
        (u"ывать", False),
        (u"ивать", False),
        (u"ать", False),
        (u"ять", True),
        (u"еть", True),
        (u"ить", True),
        (u"нуть", False),
        (u"ия", True),
        (u"ие", True),
        (u"я", True),
        (u"е", True),
        (u"ь", True),
        (u"и", True),
        (u"а", False),
        (u"о", False),
        (u"ы", False),
        (u"ый", False),
        (u"ий", True),
        (u"ой", False),
    ]
    # Strip each matching ending to get candidate stems; fall back to the
    # whole page title when nothing matches.
    stems = []
    for ending, is_palatal in endings:
        if pagetitle.endswith(ending):
            stem = re.sub(ending + "$", "", pagetitle)
            stems.append((stem, is_palatal))
    if not stems:
        stems.append((pagetitle, False))
    possible = []

    def append_possible(stem_to_try, suffix):
        # Record a candidate derived lemma along with the suffix that formed it.
        possible.append((stem_to_try.lower() + suffix, suffix))

    # Try -ный/-ной, -ка, -ко
    for stem, palatal in stems:
        stems_to_try = []

        def frob(stem):
            # Apply first palatalization, then patch up stems ending in л
            # or in a vowel so the suffix attaches properly.
            stem = first_palatalization(stem)
            if stem.endswith(u"л"):
                stem += u"ь"
            if re.search("[" + rulib.vowel + "]$", stem):
                stem += u"й"
            return stem

        to_try_1 = frob(stem)
        to_try_2 = rulib.dereduce_stem(stem, False)
        if to_try_2:
            to_try_2 = frob(rulib.remove_accents(to_try_2))
        to_try_3 = rulib.dereduce_stem(stem, True)
        if to_try_3:
            to_try_3 = frob(rulib.remove_accents(to_try_3))
        stems_to_try.append(to_try_1)
        if to_try_2:
            stems_to_try.append(to_try_2)
        if to_try_3 and to_try_3 != to_try_2:
            stems_to_try.append(to_try_3)
        for stem_to_try in stems_to_try:
            append_possible(stem_to_try, u"ный")
            append_possible(stem_to_try, u"ной")
            append_possible(stem_to_try, u"ский")
            append_possible(stem_to_try, u"ской")
            append_possible(stem_to_try, u"ник")
            append_possible(stem_to_try, u"чик")
            append_possible(stem_to_try, u"щик")
            append_possible(stem_to_try, u"ка")
            append_possible(stem_to_try, u"ко")
            append_possible(stem_to_try, u"ство")
    # Try -овый/-евый/-ёвый/-овой/-евой, -ик, -ок/-ек/-ёк
    for stem, palatal in stems:
        stems_to_try = []
        stems_to_try.append(stem)
        reduced = rulib.reduce_stem(stem)
        if reduced:
            stems_to_try.append(reduced)
        for stem_to_try in stems_to_try:
            if stem_to_try.endswith(u"й"):
                stem_to_try = stem_to_try[:-1]
            append_possible(stem_to_try, u"овый")
            append_possible(stem_to_try, u"евый")
            append_possible(stem_to_try, u"ёвый")
            append_possible(stem_to_try, u"овой")
            append_possible(stem_to_try, u"евой")
            stem_to_try = first_palatalization(stem_to_try)
            append_possible(stem_to_try, u"еский")
            append_possible(stem_to_try, u"ический")
            append_possible(stem_to_try, u"ество")
            append_possible(stem_to_try, u"ик")
            append_possible(stem_to_try, u"ок")
            append_possible(stem_to_try, u"ек")
            append_possible(stem_to_try, u"ёк")
            append_possible(stem_to_try, u"ец")
    # If derived adverbs, try -о, -е, -и
    if adverbs:
        for stem, palatal in stems:
            stems_to_try = []
            stems_to_try.append(stem)
            # BUG FIX: this inner loop was previously dedented out of the
            # loop over stems, so adverb candidates were generated only for
            # the last stem (the two parallel blocks above nest it).
            for stem_to_try in stems_to_try:
                append_possible(stem_to_try, u"о")
                append_possible(stem_to_try, u"е")
                append_possible(stem_to_try, u"и")

    # Bail out early unless at least one candidate is an actual lemma.
    would_output = False
    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            would_output = True
    if not would_output:
        return

    text = unicode(page.text)

    if rulib.check_for_alt_yo_terms(text, pagemsg):
        return

    base_lemmas = []

    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            derived_section = blib.find_lang_section(possible_derived,
                                                     "Russian", pagemsg,
                                                     errandpagemsg)
            if not derived_section:
                errandpagemsg(
                    "WARNING: Couldn't find Russian section for derived term %s"
                    % possible_derived)
                continue
            if "==Etymology" in derived_section:
                pagemsg(
                    "Skipping derived term %s because it already has an etymology"
                    % possible_derived)
                continue
            derived_defns = rulib.find_defns(derived_section)
            if not derived_defns:
                errandpagemsg(
                    "WARNING: Couldn't find definitions for derived term %s" %
                    possible_derived)
                continue

            derived_parsed = blib.parse_text(derived_section)
            derived_lemmas = find_noun_lemmas(
                derived_parsed, possible_derived,
                errandpagemsg, lambda tempcall: blib.expand_text(
                    tempcall, possible_derived, pagemsg, verbose))
            # Also pick up adjective/adverb lemmas, attaching translit if any.
            for t in derived_parsed.filter_templates():
                if tname(t) in ["ru-adj", "ru-adv"]:
                    lemmas = blib.fetch_param_chain(t, "1", "head",
                                                    possible_derived)
                    trs = blib.fetch_param_chain(t, "tr", "tr")
                    if trs:
                        lemmas = [
                            "%s//%s" % (lemma, tr)
                            for lemma, tr in zip(lemmas, trs)
                        ]
                    for lemma in lemmas:
                        add_if_not(derived_lemmas, lemma)

            if not derived_lemmas:
                errandpagemsg("WARNING: No derived term lemmas for %s" %
                              possible_derived)
                return

            # Compute the base lemmas lazily, only once, the first time a
            # derived term needs them.
            if not base_lemmas:
                base_parsed = blib.parse_text(text)
                base_lemmas = find_noun_lemmas(base_parsed, pagetitle,
                                               errandpagemsg, expand_text)

                for t in base_parsed.filter_templates():
                    if tname(t) in ["ru-verb", "ru-adj"]:
                        lemmas = blib.fetch_param_chain(
                            t, "1", "head", pagetitle)
                        trs = blib.fetch_param_chain(t, "tr", "tr")
                        if trs:
                            lemmas = [
                                "%s//%s" % (lemma, tr)
                                for lemma, tr in zip(lemmas, trs)
                            ]
                        for lemma in lemmas:
                            add_if_not(base_lemmas, lemma)

                if not base_lemmas:
                    errandpagemsg("WARNING: No base lemmas")
                    return

                base_lemmas = [
                    rulib.remove_monosyllabic_accents(x) for x in base_lemmas
                ]

                warnings = []
                if len(base_lemmas) > 1:
                    warnings.append("multiple-lemmas")
                if any("//" in lemma for lemma in base_lemmas):
                    warnings.append("translit-in-lemma")

                base_section = blib.find_lang_section_from_text(
                    text, "Russian", pagemsg)
                if not base_section:
                    errandpagemsg(
                        "WARNING: Couldn't find Russian section for base")
                    return

                base_defns = rulib.find_defns(base_section)
                if not base_defns:
                    errandpagemsg(
                        "WARNING: Couldn't find definitions for base")
                    return

            def concat_defns(defns):
                # Encode definitions so they survive space-delimited output:
                # literal underscores become \u, then spaces become underscores.
                return ";".join(defns).replace("_", r"\u").replace(" ", "_")

            # Report the suffix in unstressed and both stressed variants,
            # keeping only the variants an actual derived lemma ends with.
            suffixes_with_stress = []
            for suf in [
                    suffix,
                    rulib.make_beginning_stressed_ru(suffix),
                    rulib.make_ending_stressed_ru(suffix)
            ]:
                for derived_lemma in derived_lemmas:
                    if derived_lemma.endswith(suf):
                        add_if_not(suffixes_with_stress, suf)
            msg("%s %s+-%s%s no-etym possible-suffixed %s //// %s" %
                (",".join(derived_lemmas), ",".join(base_lemmas),
                 ",".join(suffixes_with_stress),
                 " WARNING:%s" % ",".join(warnings) if warnings else "",
                 concat_defns(base_defns), concat_defns(derived_defns)))
Exemplo n.º 27
0
def process_page(index, page, direc, delete_bad, verbose, save=False):
  """Fix up class-3olda verb conjugations on PAGE, converting the conjugation
  type to DIREC (with "3oa" normalized to 3°a), and return (new_text, notes)
  for the caller to save.  If DELETE_BAD, delete erroneously created
  past-form pages that no longer exist under the new conjugation (pages are
  only actually deleted when SAVE is true; otherwise the deletion is just
  logged).  INDEX is used only for logging; VERBOSE enables extra logging.

  SAVE is a new keyword parameter with a safe default: the original code
  referenced an undefined name 'save', which would have raised NameError
  on the delete path.

  NOTE: currently disabled -- logs a warning and returns immediately."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  direc = direc.replace("3oa", u"3°a")
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj"]:
      conjtype = getparam(t, "1")
      if not conjtype.startswith("3olda"):
        continue
      if conjtype.startswith("3olda") and conjtype != "3olda":
        pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
        continue
      # Expand the unmodified template to capture the old forms.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      oldargs = blib.split_generate_args(result)
      rmparam(t, "6")
      rmparam(t, "5")
      rmparam(t, "4")
      t.add("1", direc)
      # Re-expand after the change to see which past forms disappeared.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      if delete_bad:
        newargs = blib.split_generate_args(result)
        for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
            "past_f_short", "past_n_short", "past_pl_short"]:
          oldforms = re.split(",", oldargs[form]) if form in oldargs else []
          newforms = re.split(",", newargs[form]) if form in newargs else []
          for oldform in oldforms:
            if oldform not in newforms:
              formpagename = rulib.remove_accents(oldform)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
              else:
                # Use a distinct name here rather than clobbering the outer
                # 'text' variable as the original did.
                form_text = unicode(formpage.text)
                if "Etymology 1" in form_text:
                  pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                elif "----" in form_text:
                  pagemsg("WARNING: Multiple languages apparently in form, skipping form %s" % formpagename)
                else:
                  numinfls = len(re.findall(r"\{\{inflection of\|", form_text))
                  if numinfls < 1:
                    pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                  elif numinfls > 1:
                    pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                  else:
                    comment = "Delete erroneously created long form of %s" % pagetitle
                    pagemsg("Existing text for form %s: [[%s]]" % (
                      formpagename, form_text))
                    if save:
                      formpage.delete(comment)
                    else:
                      pagemsg("Would delete page %s with comment=%s" %
                          (formpagename, comment))

      notes.append("fix 3olda -> %s" % direc)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Exemplo n.º 28
0
def process_page(index, num, save, verbose, params):
    """Create or update the Wiktionary entry page for the Russian numeral NUM.

    Derives the page title by stripping accents from the numeral's lemma,
    generates the new page text, and then either (a) creates the page if it
    doesn't exist, (b) overwrites an existing Russian section when
    params.overwrite_page allows it, (c) inserts the Russian section before
    the first alphabetically-later language section, or (d) appends it at
    the end.  INDEX is used only for logging; SAVE controls whether the
    result is actually written; VERBOSE enables extra logging.
    """
    comment = None
    notes = []

    lemma = ru_num(num)
    pagetitle = rulib.remove_accents(lemma)
    newtext = generate_page(num)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    # Prepare to create page
    pagemsg("Creating entry")
    page = pywikibot.Page(site, pagetitle)

    # If invalid title, don't do anything.
    existing_text = blib.safe_page_text(page,
                                        errandpagemsg,
                                        bad_value_ret=None)
    if existing_text is None:
        return

    if not blib.safe_page_exists(page, errandpagemsg):
        # Page doesn't exist. Create it.
        pagemsg("Creating page")
        comment = "Create page for Russian numeral %s (%s)" % (lemma, num)
        page.text = newtext
        if verbose:
            pagemsg("New text is [[%s]]" % page.text)
    else:  # Page does exist
        pagetext = existing_text

        # Split into sections
        splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
        # Extract off pagehead and recombine section headers with following text
        pagehead = splitsections[0]
        sections = []
        for i in xrange(1, len(splitsections)):
            if (i % 2) == 1:
                sections.append("")
            sections[-1] += splitsections[i]

        # Go through each section in turn, looking for existing Russian section
        for i in xrange(len(sections)):
            m = re.match("^==([^=\n]+)==$", sections[i], re.M)
            if not m:
                pagemsg("Can't find language name in text: [[%s]]" %
                        (sections[i]))
            elif m.group(1) == "Russian":
                # Extract off trailing separator
                mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
                if mm:
                    # Note that this changes the number of sections, which is seemingly
                    # a problem because the for-loop above calculates the end point
                    # at the beginning of the loop, but is not actually a problem
                    # because we always break after processing the Russian section.
                    sections[i:i + 1] = [mm.group(1), mm.group(2)]

                if params.overwrite_page:
                    if "==Etymology 1==" in sections[
                            i] and not params.overwrite_etymologies:
                        errandpagemsg(
                            "WARNING: Found ==Etymology 1== in page text, not overwriting, skipping form"
                        )
                        return
                    else:
                        pagemsg("WARNING: Overwriting entire Russian section")
                        comment = "Create Russian section for numeral %s (%s)" % (
                            lemma, num)
                        sections[i] = newtext
                        notes.append("overwrite section")
                        break
                else:
                    errandpagemsg(
                        "WARNING: Not overwriting existing Russian section")
                    return
            elif m.group(1) > "Russian":
                # Language sections are kept in alphabetical order; insert
                # before the first section that sorts after "Russian".
                pagemsg("Exists; inserting before %s section" % (m.group(1)))
                comment = "Create Russian section and entry for numeral %s (%s); insert before %s section" % (
                    lemma, num, m.group(1))
                sections[i:i] = [newtext, "\n----\n\n"]
                break

        else:  # else of for loop over sections, i.e. no break out of loop
            pagemsg("Exists; adding section to end")
            comment = "Create Russian section and entry for numeral %s (%s); append at end" % (
                lemma, num)

            if sections:
                sections[-1] = ensure_two_trailing_nl(sections[-1])
                # NOTE(review): 'newsection' is not defined in this function;
                # unless it is a module-level global defined elsewhere in the
                # file, this should probably be 'newtext' -- TODO confirm.
                sections += ["----\n\n", newsection]
            else:
                if not params.overwrite_page:
                    notes.append("formerly empty")
                if pagehead.lower().startswith("#redirect"):
                    pagemsg("WARNING: Page is redirect, overwriting")
                    notes.append("overwrite redirect")
                    pagehead = re.sub(
                        r"#redirect *\[\[(.*?)\]\] *(<!--.*?--> *)*\n*",
                        r"{{also|\1}}\n", pagehead, 0, re.I)
                elif not params.overwrite_page:
                    pagemsg("WARNING: No language sections in current page")
                # NOTE(review): same possibly-undefined 'newsection' as above.
                sections += [newsection]

        # End of loop over sections in existing page; rejoin sections
        newtext = pagehead + ''.join(sections)

        if page.text != newtext:
            assert comment or notes

        # Eliminate sequences of 3 or more newlines, which may come from
        # ensure_two_trailing_nl(). Add comment if none, in case of existing page
        # with extra newlines.
        newnewtext = re.sub(r"\n\n\n+", r"\n\n", newtext)
        if newnewtext != newtext and not comment and not notes:
            notes = ["eliminate sequences of 3 or more newlines"]
        newtext = newnewtext

        if page.text == newtext:
            pagemsg("No change in text")
        elif verbose:
            pagemsg("Replacing <%s> with <%s>" % (page.text, newtext))
        else:
            pagemsg("Text has changed")
        page.text = newtext

    # Executed whether creating new page or modifying existing page.
    # Check for changed text and save if so.
    notestext = '; '.join(notes)
    if notestext:
        if comment:
            comment += " (%s)" % notestext
        else:
            comment = notestext
    if page.text != existing_text:
        if save:
            pagemsg("Saving with comment = %s" % comment)
            blib.safe_page_save(page, comment, errandpagemsg)
        else:
            pagemsg("Would save with comment = %s" % comment)
Exemplo n.º 29
0
  # work). We'll also have problems with e.g. пистолет-пулемёт Томпсона,
  # because the words are linked individually but the ru-decl-noun-see
  # has пистолет-пулемёт given as a single entry. We have a check below
  # to try to catch this case, because no inflected nouns will show up.
  for i in xrange(1, len(headwords_separators), 2):
    hword = headwords_separators[i]
    separator = headwords_separators[i+1]
    if i < len(headwords_separators) - 2 and separator != " " and separator != "-":
      pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" %
          (wordind + 1, hword, separator))
      return
    # Canonicalize link in headword
    m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", hword)
    if m:
      lemma, infl = m.groups()
      lemma = ru.remove_accents(re.sub("#Russian$", "", lemma))
      if lemma == ru.remove_accents(infl):
        hword = "[[%s]]" % infl
      else:
        hword = "[[%s|%s]]" % (lemma, infl)
    headwords.append(hword)
    separators.append(separator)
    wordind += 1

  pagemsg("Found headwords: %s" % " @@ ".join(headwords))

  # Get headword genders (includes animacy and number)
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  genders_include_pl = len([x for x in genders if re.search(r"\bp\b", x)]) > 0

  # Extract lemmas and inflections for each word in headword
Exemplo n.º 30
0
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given).
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).

    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    canon_infl = ru.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      lemma_arg = 0
      if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
      if (not arglemma or arglemma.lower() == infl.lower() or
          ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() ==
          ru.remove_accents(infl).lower() or
          ispl and ru.remove_accents(arglemma).lower() == lemma.lower()
          ):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
          wordlink, arg_set[lemma_arg], ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          if not m:
            pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                (decl_gender, wordind, lemma, infl))
          else:
            assert gender == "f"
            pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                  (found_gender, wordind, lemma, infl))
            else:
              pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                  (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
            (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        pass
      else:
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          else:
            pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                (wordind, val, lemma, infl))
          pname += str(wordind)
          params.append((pname, val))
Exemplo n.º 31
0
  def find_decl_args(lemma, infl, wordind):
    """Look up the declension of LEMMA (whose inflected form in the
    expression is INFL, word #WORDIND) on the lemma's own page.
    Successful paths return a 4-tuple: a list of (NAME, VALUE) argument
    pairs, a flag for whether the word is adjectival, and two further
    slots (both None on the paths visible here); failure paths return
    None.  Relies on names from the enclosing scope (pagemsg, site,
    pagetitle, subpagetitle, is_short_adj, use_given_decl,
    use_given_page_decl)."""
    declpage = pywikibot.Page(site, lemma)
    # Link the inflected form, piping through the lemma only when they
    # differ in more than accents.
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      # No page to read a declension from: adjectival endings (or a
      # known short adjective) get the "+" adjective decl spec.
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    # Collect declension, headword and z-decl templates from the page.
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # No regular decl template, but a z-decl one: convert it to an
        # equivalent ru-noun-table, using the (single) headword template
        # to assist the conversion when available.
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        # Indeclinable noun: use the $ declension spec.
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates: prefer an adjectival one when the lemma
      # has an adjectival ending; otherwise fall back to the manual
      # override tables, and give up if no override exists.
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      # Adjectival declension: "+ь" variant when the decl arg contains a
      # standalone soft sign, else plain "+".
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None
Exemplo n.º 32
0
    maintext = """{{ru-adv|%s%s}}

%s
""" % (term, trtext, defntext)
  else:
    full_pos = pos_to_full_pos[pos]
    maintext = """{{head|ru|%s|head=%s%s}}

%s
""" % (full_pos, full_pos.lower(), term, trtext, defntext)

  if defns == "--":
    maintext = ""

  # If both adjective and participle header, move related-terms text to level 3
  if maintext and parttext and reltext:
    reltext = re.sub("^====Related terms====", "===Related terms===", reltext)

  msg("""%s

%s==Russian==

%s%s===Pronunciation===
%s
%s===%s===
%s%s%s%s%s%s[[ru:%s]]

""" % (rulib.remove_accents(term), alsotext, alttext, etymtext, prontext,
  parttext, pos_to_full_pos[pos], maintext, syntext, anttext, dertext,
  reltext, seetext, rulib.remove_accents(term)))
def find_accented_2(term, termtr, verbose, pagemsg):
  """Try to find an accented version of TERM (with manual transliteration
  TERMTR) by reading the headwords of the corresponding Wiktionary page.
  Returns (newterm, newtr); on any rejection, failure or ambiguity the
  original (term, termtr) is returned unchanged.  Page lookups are
  memoized in the global accented_cache: None means the page doesn't
  exist, "redirect" means a redirect without heads, anything else is a
  (heads, saw_head) pair."""
  # Deliberately unaccented particles are passed through untouched.
  if term in accentless_multisyllable:
    pagemsg("Not accenting unaccented multisyllabic particle %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO|BAR]] BAZ"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO|BAR]]" is
  # special-cased in find_accented_1()).
  if "|" in term:
    #pagemsg("Can't handle links with vertical bars: %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO]] [[BAR]]"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO]]" is
  # special-cased in find_accented_1()).
  if "[" in term or "]" in term:
    #pagemsg("Can't handle stray bracket in %s" % term)
    return term, termtr
  if "<" in term or ">" in term:
    pagemsg("Can't handle stray < or >: %s" % term)
    return term, termtr
  # Already accented (combining acute) or contains ё: nothing to look up.
  if u"\u0301" in term or u"ё" in term:
    pagemsg(u"Term has accent or ё, not looking up accents: %s" % term)
    return term, termtr
  if ru.is_monosyllabic(term):
    pagemsg("Term is monosyllabic, not looking up accents: %s" % term)
    return term, termtr
  pagename = ru.remove_accents(term)
  # We can't use expand_text() from find_accented_1() because it has a
  # different value for PAGENAME, and the proper value is important in
  # expanding ru-noun+ and ru-proper noun+.
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose)

  # Look up the page
  if semi_verbose:
    pagemsg("find_accented: Finding heads on page %s" % pagename)

  # Consult the cache first; track global hit statistics.
  cached_redirect = False
  global num_cache_lookups
  num_cache_lookups += 1
  if pagename in accented_cache:
    global num_cache_hits
    num_cache_hits += 1
    result = accented_cache[pagename]
    cached = True
    if result is None:
      if semi_verbose:
        pagemsg("find_accented: Page %s doesn't exist (cached)" % pagename)
      return term, termtr
    elif result == "redirect":
      cached_redirect = True
      heads = set()
      saw_head = False
    else:
      heads, saw_head = result
  else:
    # Cache miss: fetch the page, scrape its head templates, and cache
    # the outcome (unless caching is globally disabled).
    cached = False
    page = pywikibot.Page(site, pagename)
    try:
      if not page.exists():
        if semi_verbose:
          pagemsg("find_accented: Page %s doesn't exist" % pagename)
        if not global_disable_cache:
          accented_cache[pagename] = None
        return term, termtr
    except Exception as e:
      pagemsg("WARNING: Error checking page existence: %s" % unicode(e))
      if not global_disable_cache:
        accented_cache[pagename] = None
      return term, termtr

    # Page exists, find the heads
    heads = set()
    def add(val, tr):
      # Strip wiki links; drop empty results.
      val_to_add = blib.remove_links(val)
      if val_to_add:
        heads.add((val_to_add, tr))
    saw_head = False
    for t in blib.parse(page).filter_templates():
      tname = unicode(t.name)
      if tname in ru_head_templates:
        saw_head = True
        if getparam(t, "1"):
          add(getparam(t, "1"), getparam(t, "tr"))
        elif getparam(t, "head"):
          add(getparam(t, "head"), getparam(t, "tr"))
      elif tname == "head" and getparam(t, "1") == "ru":
        saw_head = True
        add(getparam(t, "head"), getparam(t, "tr"))
      elif tname in ["ru-noun+", "ru-proper noun+"]:
        saw_head = True
        lemma = ru.fetch_noun_lemma(t, expand_text)
        lemmas = re.split(",", lemma)
        lemmas = [split_ru_tr(lemma) for lemma in lemmas]
        # Group lemmas by Russian, to group multiple translits
        lemmas = ru.group_translits(lemmas, pagemsg, expand_text)
        for val, tr in lemmas:
          add(val, tr)
      # NOTE(review): this tests saw_head, which stays True for every
      # subsequent template once any head template has been seen, so
      # head2.../tr2... are also read from non-head templates — confirm
      # this is intended rather than a per-template check.
      if saw_head:
        for i in xrange(2, 10):
          headn = getparam(t, "head" + str(i))
          if headn:
            add(headn, getparam(t, "tr" + str(i)))
    if not global_disable_cache:
      accented_cache[pagename] = (heads, saw_head)

  # We have the heads
  cached_msg = " (cached)" if cached else ""
  if len(heads) == 0:
    if not saw_head:
      if cached_redirect:
        pagemsg("Redirect without heads (cached)")
      elif not cached and re.match("#redirect", page.text, re.I):
        if not global_disable_cache:
          accented_cache[pagename] = "redirect"
        pagemsg("Redirect without heads")
      else:
        pagemsg("WARNING: Can't find any heads: %s%s" % (pagename, cached_msg))
    return term, termtr
  if len(heads) > 1:
    # Ambiguous: refuse to guess among multiple distinct heads.
    pagemsg("WARNING: Found multiple heads for %s%s: %s" % (pagename, cached_msg, ",".join("%s%s" % (ru, "//%s" % tr if tr else "") for ru, tr in heads)))
    return term, termtr
  newterm, newtr = list(heads)[0]
  if semi_verbose:
    pagemsg("find_accented: Found head %s%s%s" % (newterm, "//%s" % newtr if newtr else "", cached_msg))
  # Drop trailing !/? from the head if the input term lacked it and the
  # terms otherwise match modulo accents.
  if re.search("[!?]$", newterm) and not re.search("[!?]$", term):
    newterm_wo_punc = re.sub("[!?]$", "", newterm)
    if ru.remove_accents(newterm_wo_punc) == ru.remove_accents(term):
      pagemsg("Removing punctuation from %s when matching against %s" % (
        newterm, term))
      newterm = newterm_wo_punc
  if ru.remove_accents(newterm) != ru.remove_accents(term):
    pagemsg("WARNING: Accented term %s differs from %s in more than just accents%s" % (
      newterm, term, cached_msg))
  return newterm, newtr