Пример #1
0
def generate_adj_forms(template,
                       errandpagemsg,
                       expand_text,
                       return_raw=False,
                       include_linked=False):
    """Expand a {{la-adecl|...}} call into a dict of adjective forms.

    The template text is rewritten to call {{la-generate-adj-forms|...}},
    passed to expand_text(), and the expansion is split into slot -> form
    pairs by blib.split_generate_args().

    template: wikitext of the declension template call.
    errandpagemsg: callable taking one message string; used to report problems.
    expand_text: callable expanding a template call; a falsy result is
        treated as a failure.
    return_raw: when true, return the expansion unparsed (None if the
        expansion is exactly False).
    include_linked: when false, slots whose name starts with "linked_" are
        dropped.

    Returns the slot dict, in which every "*_m" slot is copied to the
    corresponding "*_f" slot if the expansion did not provide one, or None
    when the template is not recognized or the expansion failed.
    """
    old_prefix = "{{la-adecl|"
    if not template.startswith(old_prefix):
        errandpagemsg(
            "Template %s not a recognized adjective declension template" %
            template)
        return None
    generate_template = "{{la-generate-adj-forms|" + template[len(old_prefix):]

    expansion = expand_text(generate_template)
    if return_raw:
        if expansion is False:
            return None
        return expansion
    if not expansion:
        errandpagemsg("WARNING: Error generating forms, skipping")
        return None

    forms = blib.split_generate_args(expansion)
    if not include_linked:
        forms = dict((slot, value) for slot, value in forms.iteritems()
                     if not slot.startswith("linked_"))

    # Copy each masculine slot to its feminine counterpart when missing.
    with_feminine = {}
    for slot, value in forms.iteritems():
        with_feminine[slot] = value
        if not slot.endswith("_m"):
            continue
        feminine_slot = slot[:-2] + "_f"
        if feminine_slot not in forms:
            with_feminine[feminine_slot] = value
    return with_feminine
Пример #2
0
def process_page(index, page, save, verbose):
    """Log manually specified past passive participles of "pf" verbs.

    For every {{ru-conj}} / {{ru-conj-old}} call on the page whose first
    parameter starts with "pf", the matching {{ru-generate-verb-forms}} call
    is expanded (templates whose expansion fails are skipped with a warning)
    and each non-empty, non-"-" past_pasv_part*/ppp* parameter is reported.

    index: page index, used as a prefix in log messages.
    page: page object; must provide title() and be accepted by blib.parse().
    save: not used by the code visible here.
    verbose: passed through to blib.expand_text().
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    parsed = blib.parse(page)
    for t in parsed.filter_templates():
        # Bind the template name locally: the branch below compares against
        # it, and it was previously never assigned in this function.
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"] and getparam(
                t, "1").startswith("pf"):
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                                  unicode(t))
            else:
                tempcall = re.sub(r"\{\{ru-conj-old",
                                  "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            # The split result is not consulted below; the call is kept so
            # the expansion is still parsed as before.
            args = blib.split_generate_args(result)
            for base in ["past_pasv_part", "ppp"]:
                for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                    val = getparam(t, base + i)
                    if val and val != "-":
                        # Drop everything from "//" onwards before logging.
                        val = re.sub("//.*", "", val)
                        pagemsg(
                            "Found perfective past passive participle: %s" %
                            val)
Пример #3
0
def generate_noun_forms(template,
                        errandpagemsg,
                        expand_text,
                        return_raw=False,
                        include_linked=False):
    """Expand a {{la-ndecl|...}} call into a dict of noun forms.

    The template text is rewritten to call {{la-generate-noun-forms|...}},
    passed to expand_text(), and the expansion is split into slot -> form
    pairs by blib.split_generate_args().

    template: wikitext of the declension template call.
    errandpagemsg: callable taking one message string; used to report problems.
    expand_text: callable expanding a template call; a falsy result is
        treated as a failure.
    return_raw: when true, return the expansion unparsed (None if the
        expansion is exactly False).
    include_linked: when false, slots whose name starts with "linked_" are
        dropped.

    Returns the slot dict, or None when the template is not recognized or the
    expansion failed.
    """
    old_prefix = "{{la-ndecl|"
    if not template.startswith(old_prefix):
        errandpagemsg("Template %s not a recognized noun declension template" %
                      template)
        return None
    generate_template = "{{la-generate-noun-forms|" + template[len(old_prefix):]

    expansion = expand_text(generate_template)
    if return_raw:
        if expansion is False:
            return None
        return expansion
    if not expansion:
        errandpagemsg("WARNING: Error generating forms, skipping")
        return None

    forms = blib.split_generate_args(expansion)
    if include_linked:
        return forms
    return dict((slot, value) for slot, value in forms.iteritems()
                if not slot.startswith("linked_"))
Пример #4
0
def find_noun(pagename, pagemsg, errandpagemsg, expand_text):
  """Return the lemma of the noun declared in the Russian section of a page.

  Every {{ru-noun+}} call in the section is expanded through
  {{ru-generate-noun-forms}}; its lemma is the "nom_sg" slot, or "nom_pl"
  when there is no "nom_sg".

  Returns:
    None  if the section is missing, an expansion fails, a lemma contains
          a comma, or no {{ru-noun+}} is present;
    -1    if the section contains "==Etymology";
    otherwise the first distinct lemma found (a warning is logged when more
    than one distinct lemma exists).
  """
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  if "==Etymology" in section:
    return -1

  lemmas = []
  for template in blib.parse_text(section).filter_templates():
    if unicode(template.name) != "ru-noun+":
      continue
    call = re.sub(r"^\{\{ru-noun\+", "{{ru-generate-noun-forms",
        unicode(template))
    expansion = expand_text(call)
    if not expansion:
      pagemsg("WARNING: Error generating noun forms")
      return None
    slots = blib.split_generate_args(expansion)
    if "nom_sg" in slots:
      lemma = slots["nom_sg"]
    else:
      lemma = slots["nom_pl"]
    if "," in lemma:
      pagemsg("WARNING: Lemma has multiple forms: %s" % lemma)
      return None
    if lemma not in lemmas:
      lemmas.append(lemma)

  if len(lemmas) > 1:
    pagemsg("WARNING: Multiple lemmas for noun: %s" % ",".join(lemmas))
  return lemmas[0] if lemmas else None
Пример #5
0
def process_text_on_page(index, pagetitle, text):
  """Log the "class" property generated for each {{de-conj}} call in `text`.

  Each {{de-conj}} template is rewritten to call
  {{User:Benwing2/de-generate-verb-props}}, expanded, and the "class" entry
  of the result is logged. Templates whose expansion fails are skipped.

  index, pagetitle: used to prefix log messages.
  text: wikitext to parse.

  Returns (text of the parsed page, notes), where notes is always an empty
  list in this version.
  """
  global args

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  pagemsg("Processing")

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    before = unicode(t)
    if tname(t) == "de-conj":
      call = re.sub(r"^\{\{de-conj(?=[|}])", "{{User:Benwing2/de-generate-verb-props", unicode(t))
      expansion = expand_text(call)
      if not expansion:
        continue
      props = blib.split_generate_args(expansion)
      pagemsg("For %s, class=%s" % (unicode(t), props["class"]))

    after = unicode(t)
    if after != before:
      pagemsg("Replaced <%s> with <%s>" % (before, after))

  return unicode(parsed), notes
Пример #6
0
def generate_old_adj_forms(template,
                           errandpagemsg,
                           expand_text,
                           return_raw=False,
                           include_linked=False):
    """Expand a new- or old-style Latin adjective declension call into forms.

    {{la-adecl|...}} is rewritten to {{la-generate-adj-forms|...}}. Any other
    {{la-<suffix>|...}} call is rewritten to
    {{la-generate-adj-forms|decltype=<type>|...}} when <suffix> is one of the
    known old-style template suffixes; otherwise the template is rejected.

    template: wikitext of the declension template call.
    errandpagemsg: callable taking one message string; used to report problems.
    expand_text: callable expanding a template call; a falsy result is
        treated as a failure.
    return_raw: when true, return the expansion unparsed (None if the
        expansion is exactly False).
    include_linked: when false, slots whose name starts with "linked_" are
        dropped.

    Returns the slot dict, in which every "*_m" slot is copied to the
    corresponding "*_f" slot if the expansion did not provide one, or None
    when the template is not recognized or the expansion failed.
    """
    # Old-style template name (minus the "la-" prefix) -> decltype= value.
    decltype_by_suffix = {
        'decl-1&2': '1&2',
        'decl-3rd-1E': '3-1',
        'decl-3rd-2E': '3-2',
        'decl-3rd-3E': '3-3',
        'decl-3rd-comp': '3-C',
        'decl-3rd-part': '3-P',
        'adecl-1st': '1-1',
        'adecl-2nd': '2-2',
        'decl-irreg': 'irreg',
    }

    def generate_adj_forms_prefix(m):
        decltype = decltype_by_suffix.get(m.group(1))
        if decltype is None:
            # Unknown suffix: leave the matched text as it was.
            return m.group(0)
        return "{{la-generate-adj-forms|decltype=%s|" % (decltype)

    if template.startswith("{{la-adecl|"):
        pattern, replacement = r"^\{\{la-adecl\|", "{{la-generate-adj-forms|"
    else:
        pattern, replacement = r"^\{\{la-(.*?)\|", generate_adj_forms_prefix
    generate_template = re.sub(pattern, replacement, template)

    if not generate_template.startswith("{{la-generate-adj-forms|"):
        errandpagemsg(
            "Template %s not a recognized adjective declension template" %
            template)
        return None

    expansion = expand_text(generate_template)
    if return_raw:
        if expansion is False:
            return None
        return expansion
    if not expansion:
        errandpagemsg("WARNING: Error generating forms, skipping")
        return None

    forms = blib.split_generate_args(expansion)
    if not include_linked:
        forms = dict((slot, value) for slot, value in forms.iteritems()
                     if not slot.startswith("linked_"))

    # Copy each masculine slot to its feminine counterpart when missing.
    with_feminine = {}
    for slot, value in forms.iteritems():
        with_feminine[slot] = value
        if not slot.endswith("_m"):
            continue
        feminine_slot = slot[:-2] + "_f"
        if feminine_slot not in forms:
            with_feminine[feminine_slot] = value
    return with_feminine
Пример #7
0
def process_decl(index, pagetitle, decl, forms, save, verbose):
  """Delete (or report) form pages generated from one conjugation/declension.

  `decl` must be a {{ru-conj|...}} or {{ru-noun-table...}} call. It is
  expanded through the matching generator template, and for each slot named
  in `forms` every comma-separated value is turned into a page name (text
  from "//" onwards removed, accents stripped). A page is deleted only if it
  exists, is not `pagetitle` itself, does not contain "Etymology 1", and has
  no level-2 header other than "Russian".

  index, pagetitle: used to prefix log messages.
  forms: iterable of slot names to process.
  save: when true, pages are actually deleted; otherwise the intended
    deletion is only logged.
  verbose: passed through to blib.expand_text().
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if decl.startswith("{{ru-conj|"):
    tempcall = re.sub(r"^\{\{ru-conj", "{{ru-generate-verb-forms", decl)
  elif decl.startswith("{{ru-noun-table"):
    tempcall = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args", decl)
  else:
    pagemsg("WARNING: Unrecognized decl template, skipping: %s" % decl)
    return

  expansion = expand_text(tempcall)
  if not expansion:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  generated = blib.split_generate_args(expansion)

  for slot in forms:
    if slot not in generated:
      continue
    for formpagename in generated[slot].split(","):
      formpagename = rulib.remove_accents(re.sub("//.*$", "", formpagename))
      formpage = pywikibot.Page(site, formpagename)
      if not formpage.exists():
        pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
        continue
      if formpagename == pagetitle:
        pagemsg("WARNING: Attempt to delete dictionary form, skipping")
        continue
      text = unicode(formpage.text)
      if "Etymology 1" in text:
        pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
        continue

      # Every non-Russian level-2 header is reported, not just the first.
      has_other_language = False
      for header in re.finditer(r"^==([^=]*?)==$", text, re.M):
        language = header.group(1)
        if language != "Russian":
          pagemsg("WARNING: Found entry for non-Russian language %s, skipping form %s" %
              (language, formpagename))
          has_other_language = True
      if has_other_language:
        continue

      comment = "Delete erroneously created form of %s" % pagetitle
      if save:
        formpage.delete(comment)
      else:
        pagemsg("Would delete page %s with comment=%s" %
            (formpagename, comment))
Пример #8
0
 def generate_new_forms():
     """Expand `newt` via the es-generate-verb-forms template and clean it up.

     Returns a dict of slot -> sort_multiple(value) that omits slots starting
     with "neg_" and the "infinitive_linked" slot, or None when the expansion
     result is falsy. `newt`, `expand_text` and `sort_multiple` come from the
     enclosing scope.
     """
     call = re.sub(
         r"^\{\{es-conj", "{{User:Benwing2/es-generate-verb-forms", newt)
     expansion = expand_text(call)
     if not expansion:
         return None
     cleaned = {}
     for slot, value in blib.split_generate_args(expansion).iteritems():
         if slot.startswith("neg_") or slot == "infinitive_linked":
             continue
         cleaned[slot] = sort_multiple(value)
     return cleaned
Пример #9
0
def fetch_noun_args(t, expand_text, forms_only=False):
    """Expand a Russian noun headword template into its generated arguments.

    t: template object; {{ru-noun+}} is handled directly, anything else is
        treated as {{ru-proper noun+}} (which additionally gets "|ndef=sg").
    expand_text: callable expanding a template call; a falsy result is
        treated as a failure.
    forms_only: when true use {{ru-generate-noun-forms}}, otherwise
        {{ru-generate-noun-args}}.

    Returns the result of blib.split_generate_args() on the expansion, or
    None when the expansion failed.
    """
    if forms_only:
        generator = "ru-generate-noun-forms"
    else:
        generator = "ru-generate-noun-args"

    if unicode(t.name) == "ru-noun+":
        pattern = r"^\{\{ru-noun\+"
        replacement = "{{%s" % generator
    else:
        pattern = r"^\{\{ru-proper noun\+"
        replacement = "{{%s|ndef=sg" % generator

    expansion = expand_text(re.sub(pattern, replacement, unicode(t)))
    if not expansion:
        return None
    return blib.split_generate_args(expansion)
def new_generate_noun_forms(template, errandpagemsg, expand_text, return_raw=False,
    include_props=False):
  """Expand a {{la-ndecl|...}} call through the User:Benwing2 generator templates.

  template: wikitext of the call; must start with "{{la-ndecl|" (asserted).
  errandpagemsg: callable taking one message string; used to report problems.
  expand_text: callable expanding a template call; a falsy result is treated
    as a failure.
  return_raw: when true, return the expansion unparsed (None if the
    expansion is exactly False).
  include_props: when true use the "...-noun-props" generator, otherwise
    the "...-noun-forms" one.

  Returns the result of blib.split_generate_args() on the expansion, or None
  when the expansion failed.
  """
  assert template.startswith("{{la-ndecl|")
  if include_props:
    replacement = "{{User:Benwing2/la-new-generate-noun-props|"
  else:
    replacement = "{{User:Benwing2/la-new-generate-noun-forms|"
  generate_template = re.sub(r"^\{\{la-ndecl\|", replacement, template)

  expansion = expand_text(generate_template)
  if return_raw:
    if expansion is False:
      return None
    return expansion
  if expansion:
    return blib.split_generate_args(expansion)
  errandpagemsg("WARNING: Error generating forms, skipping")
  return None
Пример #11
0
def snarf_noun_accents_and_forms(noun, orig_pagemsg):
  """Look up the accented lemma and generated forms of a Bulgarian noun page.

  The page named by the accent-stripped `noun` is parsed; the lemma is taken
  from parameter 1 of {{bg-noun}} / {{bg-proper noun}} and the forms from
  expanding the following {{bg-ndecl}} through {{bg-generate-noun-forms}}.
  Results (including failures) are memoized in the module-level dict
  `nouns_to_accents_and_forms`, keyed by page title.

  noun: noun whose page should be inspected.
  orig_pagemsg: callable taking one message string; messages are prefixed
    with "Noun <noun>: " before being passed to it.

  Returns (lemma, forms) on success, or (None, None) when the expansion
  fails, two {{bg-ndecl}} calls are processed, or no usable
  lemma/declension pair is found.
  """
  global args
  pagetitle = bglib.remove_accents(noun)
  # Serve repeated lookups from the cache.
  if pagetitle in nouns_to_accents_and_forms:
    return nouns_to_accents_and_forms[pagetitle]
  def pagemsg(txt):
    orig_pagemsg("Noun %s: %s" % (noun, txt))
  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  # lemma states: None = no headword seen yet, False = headword seen but it
  # lacks an accent, any other value = parameter 1 of the latest headword.
  lemma = None
  for t in parsed.filter_templates():
    if tname(t) in ["bg-noun", "bg-proper noun"]:
      # NOTE(review): lemma is never reset after a {{bg-ndecl}} is handled, so
      # this warning also fires when a declension did intervene — confirm
      # whether that is intended.
      if lemma:
        pagemsg("WARNING: Saw two {{bg-noun}} invocations without intervening {{bg-ndecl}}: %s" % unicode(t))
      lemma = getparam(t, "1")
      if not lemma:
        # NOTE(review): lemma stays at the falsy value returned by getparam
        # here (neither None nor False), so a later {{bg-ndecl}} passes both
        # checks below — verify this is acceptable.
        pagemsg("WARNING: Missing headword in noun: %s" % unicode(t))
        continue
      if bglib.needs_accents(lemma):
        pagemsg("WARNING: Noun %s missing an accent: %s" % (lemma, unicode(t)))
        lemma = False
        continue
    if tname(t) == "bg-ndecl":
      if lemma is False:
        pagemsg("WARNING: Skipping %s because noun missing an accent" % unicode(t))
        continue
      if lemma is None:
        pagemsg("WARNING: Skipping %s because no preceding {{bg-noun}}" % unicode(t))
        continue
      # A cache entry at this point can only have been written by an earlier
      # {{bg-ndecl}} in this same loop; treat the page as ambiguous.
      if pagetitle in nouns_to_accents_and_forms:
        pagemsg("WARNING: Saw two {{bg-ndecl}} on the same page: %s" % unicode(t))
        nouns_to_accents_and_forms[pagetitle] = (None, None)
        return (None, None)
      generate_template = re.sub(r"^\{\{bg-ndecl\|", "{{bg-generate-noun-forms|", unicode(t))
      def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
      generate_result = expand_text(generate_template)
      if not generate_result:
        nouns_to_accents_and_forms[pagetitle] = (None, None)
        return (None, None)
      # Record the result but keep scanning so a second {{bg-ndecl}} is caught.
      nouns_to_accents_and_forms[pagetitle] = (lemma, blib.split_generate_args(generate_result))
  if pagetitle in nouns_to_accents_and_forms:
    return nouns_to_accents_and_forms[pagetitle]
  pagemsg("WARNING: Couldn't find both lemma and declension")
  nouns_to_accents_and_forms[pagetitle] = (None, None)
  return (None, None)
Пример #12
0
def generate_verb_forms(template,
                        errandpagemsg,
                        expand_text,
                        return_raw=False,
                        include_linked=False,
                        include_props=False,
                        add_sync_forms=False):
    """Expand a {{la-conj|...}} call into a dict of verb forms.

    template: wikitext of the conjugation template call.
    errandpagemsg: callable taking one message string; used to report problems.
    expand_text: callable expanding a template call; a falsy result is
        treated as a failure.
    return_raw: when true, return the expansion unparsed (None if the
        expansion is exactly False).
    include_linked: when false, slots whose name starts with "linked_" are
        dropped.
    include_props: when true use {{la-generate-verb-props}}, otherwise
        {{la-generate-verb-forms}}.
    add_sync_forms: when true, every form matching the ending pattern in
        augment_with_sync_forms() is followed by a copy with its last
        "vi"/"ve"/"vē" removed.

    Returns the slot dict, or None when the template is not recognized or the
    expansion failed.
    """
    if not template.startswith("{{la-conj|"):
        errandpagemsg("Template %s not a recognized conjugation template" %
                      template)
        return None

    if include_props:
        replacement = "{{la-generate-verb-props|"
    else:
        replacement = "{{la-generate-verb-forms|"
    generate_template = re.sub(r"^\{\{la-conj\|", replacement, template)

    expansion = expand_text(generate_template)
    if return_raw:
        if expansion is False:
            return None
        return expansion
    if not expansion:
        errandpagemsg("WARNING: Error generating forms, skipping")
        return None

    slots = blib.split_generate_args(expansion)
    if not include_linked:
        slots = dict((slot, value) for slot, value in slots.iteritems()
                     if not slot.startswith("linked_"))

    def augment_with_sync_forms(forms):
        # Emit each form, followed by its shortened variant when it ends in
        # one of the listed endings.
        augmented_forms = []
        for form in forms.split(","):
            augmented_forms.append(form)
            has_long_ending = re.search(
                u"(vi(stī|stis)|vērunt|ver(am|ās|at|āmus|ātis|ant|ō|im|[iī]s|it|[iī]mus|[iī]tis|int)|viss(e|em|ēs|et|ēmus|ētis|ent))$",
                form)
            if has_long_ending:
                augmented_forms.append(re.sub(u"^(.*)v[ieē]", r"\1", form))
        return ",".join(augmented_forms)

    if not add_sync_forms:
        return slots
    return dict((slot, augment_with_sync_forms(value))
                for slot, value in slots.iteritems())
Пример #13
0
def generate_old_noun_forms(template, errandpagemsg, expand_text, return_raw=False,
  include_linked=False):
  """Expand a new- or old-style Latin noun declension call into forms.

  {{la-ndecl|...}} is rewritten to {{la-generate-noun-forms|...}}. An
  old-style {{la-decl-<suffix>|...}} call is rewritten to
  {{la-generate-noun-forms|decl=...[|decl_type=...][|num=...]|...}} using the
  module-level table la_noun_decl_suffix_to_decltype; any other template is
  rejected.

  template: wikitext of the declension template call.
  errandpagemsg: callable taking one message string; used to report problems.
  expand_text: callable expanding a template call; a falsy result is treated
    as a failure.
  return_raw: when true, return the expansion unparsed (None if the
    expansion is exactly False).
  include_linked: when false, slots whose name starts with "linked_" are
    dropped.

  Returns the slot dict, or None when the template is not recognized or the
  expansion failed.
  """

  def generate_noun_forms_prefix(m):
    suffix = m.group(1)
    if suffix not in la_noun_decl_suffix_to_decltype:
      # Unknown suffix: leave the matched text as it was.
      return m.group(0)
    # Only the first element of the table entry is used here.
    declspec, _stem_suffix, _pl_suffix, _to_auto = la_noun_decl_suffix_to_decltype[suffix]
    if type(declspec) is not tuple:
      declspec = (declspec,)
    # declspec is (decl[, decl_type[, num]]).
    decl = declspec[0]
    decltype = ("|decl_type=%s" % declspec[1]) if len(declspec) > 1 else ""
    num = ("|num=%s" % declspec[2]) if len(declspec) > 2 else ""
    return "{{la-generate-noun-forms|decl=%s%s%s|" % (
      decl, decltype, num
    )

  if template.startswith("{{la-ndecl|"):
    pattern, replacement = r"^\{\{la-ndecl\|", "{{la-generate-noun-forms|"
  else:
    pattern, replacement = r"^\{\{la-decl-(.*?)\|", generate_noun_forms_prefix
  generate_template = re.sub(pattern, replacement, template)

  if not generate_template.startswith("{{la-generate-noun-forms|"):
    errandpagemsg("Template %s not a recognized noun declension template" % template)
    return None

  expansion = expand_text(generate_template)
  if return_raw:
    if expansion is False:
      return None
    return expansion
  if not expansion:
    errandpagemsg("WARNING: Error generating forms, skipping")
    return None

  slots = blib.split_generate_args(expansion)
  if include_linked:
    return slots
  return dict((slot, value) for slot, value in slots.iteritems()
              if not slot.startswith("linked_"))
Пример #14
0
# Driver loop: expand each declension spec produced by yield_decls() (iterated
# through blib.iter_items with start/end) and pass the generated forms to
# replace_decl() via blib.do_edit().
for index, decl in blib.iter_items(yield_decls(), start, end):
  # Language helper module, chosen by args.lang.
  module = uk if args.lang == "uk" else be
  # A spec wrapped in ((...)) holds several comma-separated sub-specs; the
  # first one is used to derive the page name for log messages.
  # NOTE(review): m is None here if decl starts with "((" but does not end
  # with "))", which would make m.group(1) raise — confirm inputs are
  # always well-formed.
  if decl.startswith("(("):
    m = re.search(r"^\(\((.*)\)\)$", decl)
    subdecls = m.group(1).split(",")
    decl_for_page = subdecls[0]
  else:
    decl_for_page = decl
  # The text before the first "<" is taken as the lemma.
  m = re.search(r"^(.+?)<.*>$", decl_for_page)
  if not m:
    msg("WARNING: Can't extract lemma from decl: %s" % decl)
    pagename = "UNKNOWN"
  else:
    pagename = module.remove_accents(blib.remove_links(m.group(1)))
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, args.verbose)
  tempcall = "{{%s-generate-noun-forms|%s}}" % (args.lang, decl)
  result = expand_text(tempcall)
  if not result:
    continue
  predforms = blib.split_generate_args(result)
  # The page to edit is named after the generated lemma ("nom_s", falling back
  # to "nom_p"), with links and accents removed and anything from the first
  # comma onwards dropped.
  lemma = predforms["nom_s"] if "nom_s" in predforms else predforms["nom_p"]
  real_pagename = re.sub(",.*", "", module.remove_accents(blib.remove_links(lemma)))
  page = pywikibot.Page(site, real_pagename)
  # Adapter binding this iteration's decl and predforms for blib.do_edit.
  def do_replace_decl(page, index, parsed):
    return replace_decl(page, index, parsed, decl, predforms)
  blib.do_edit(page, index, do_replace_decl, save=args.save, verbose=args.verbose,
      diff=args.diff)
Пример #15
0
def compare_new_and_old_templates(origt, newt, pagetitle, pagemsg, errandpagemsg):
  """Check that a rewritten {{de-conj}} call generates the same forms as the old one.

  The old template is expanded with "|generate_forms=1" appended, the new one
  through {{User:Benwing2/de-generate-verb-forms}}. Before comparison both
  results are adjusted for differences between the two implementations that
  the code below treats as acceptable (see the inline comments).

  origt: original template (object or text) whose call ends in "}}".
  newt: replacement template object; parameter 1 is read with getparam().
  pagetitle: page title, used for expansion and as a fallback for an empty
    parameter 1.
  pagemsg: callable taking one message string; receives mismatch warnings.
  errandpagemsg: accepted for interface compatibility; not called.

  Returns True when every slot has the same set of comma-separated forms,
  False at the first mismatch, and None when either template fails to expand.
  """
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  old_generate_template = re.sub(r"\}\}$", "|generate_forms=1}}", unicode(origt))
  old_result = expand_text(old_generate_template)
  if not old_result:
    return None

  new_generate_template = re.sub(r"^\{\{de-conj\|", "{{User:Benwing2/de-generate-verb-forms|", unicode(newt))
  new_result = expand_text(new_generate_template)
  if not new_result:
    return None

  # Both results are known to be truthy from here on, so the former
  # "is None" checks further down could never trigger and were removed.

  def remove_forms_in(forms, regex):
    # Drop every comma-separated form that matches `regex`.
    forms = forms.split(",")
    forms = [form for form in forms if not re.search(regex, form)]
    return ",".join(forms)

  newarg1 = re.sub("<.*>", "", getparam(newt, "1")) or pagetitle
  old_forms = blib.split_generate_args(old_result)
  if not re.search("[._]", newarg1):
    old_forms = {k: v for k, v in old_forms.items() if k != "zu_infinitive" and not k.startswith("subc")}
  # Normalize space entities and stray spaces before commas in the old output.
  old_forms = {k: v.replace("&#32;", " ").replace("&nbsp;", " ").strip().replace(" ,", ",") for k, v in old_forms.items()}
  if "_" in newarg1 and "zu_infinitive" in old_forms:
    # Fix bug in old form zu-infinitive
    old_forms["zu_infinitive"] = old_forms["zu_infinitive"].replace(" zu", " zu ")
  if "imp_2s" in old_forms and re.search("[dt]en$", newarg1):
    # Old code leaves out imperative without -e
    forms = old_forms["imp_2s"].split(",")
    if not [x for x in forms if not re.search("e($| )", x)]:
      nforms = []
      for form in forms:
        if re.search("e($| )", form):
          nforms.append(re.sub("e($| )", r"\1", form))
        nforms.append(form)
      old_forms["imp_2s"] = ",".join(nforms)
  new_forms = blib.split_generate_args(new_result)
  if "subii_2s" in new_forms:
    # New code generates subii 2s in both -est and -st; old only in -est
    new_forms["subii_2s"] = remove_forms_in(new_forms["subii_2s"], u"^[^ ]*([^e]|ie)[sxßz]t($| )")
  if "subii_2p" in new_forms:
    # New code generates subii 2p in both -et and -t; old only in -et
    new_forms["subii_2p"] = remove_forms_in(new_forms["subii_2p"], "^[^ ]*[^e]t($| )")
  if "subc_subii_2s" in new_forms:
    # New code generates subii 2s in both -est and -st; old only in -est
    new_forms["subc_subii_2s"] = remove_forms_in(new_forms["subc_subii_2s"], u"([^e]|ie)[sxßz]t$")
  if "subc_subii_2p" in new_forms:
    # New code generates subii 2p in both -et and -t; old only in -et
    new_forms["subc_subii_2p"] = remove_forms_in(new_forms["subc_subii_2p"], "[^e]t$")
  #if "perf_sub_2s" in new_forms and "seiest" in new_forms["perf_sub_2s"] and not re.search("e[rl]n$", newarg1):
  #  # New code generates perf sub 2s in both seist and seiest; old only in seist
  #  new_forms["perf_sub_2s"] = remove_forms_in(new_forms["perf_sub_2s"], "seiest")
  if re.search(u"[sxzß]en$", newarg1):
    if "pret_2s" in new_forms:
      # New code generates pret 2s for -sen verbs in both -sest and -st; old only in -st
      new_forms["pret_2s"] = remove_forms_in(new_forms["pret_2s"], u"^[^ ]*[sxzß]est($| )")
    if "subc_pret_2s" in new_forms:
      # New code generates pret 2s for -sen verbs in both -sest and -st; old only in -st
      new_forms["subc_pret_2s"] = remove_forms_in(new_forms["subc_pret_2s"], u"[sxzß]est$")
  if re.search(u"[td]en$", newarg1):
    if "pret_2s" in new_forms:
      # New code generates pret 2s for -ten verbs in both -test and -tst; old only in -test
      new_forms["pret_2s"] = remove_forms_in(new_forms["pret_2s"], u"^[^ ]*[td]st($| )")
    if "subc_pret_2s" in new_forms:
      # New code generates pret 2s for -ten verbs in both -test and -tst; old only in -test
      new_forms["subc_pret_2s"] = remove_forms_in(new_forms["subc_pret_2s"], u"[td]st$")

  # Build the union with set operations (list concatenation of keys() only
  # works when keys() returns lists) and sort it so the first reported
  # mismatch is deterministic.
  for form in sorted(set(old_forms) | set(new_forms)):
    if form not in new_forms:
      pagemsg("WARNING: for original %s and new %s, form %s=%s in old forms but missing in new forms" % (
        unicode(origt), unicode(newt), form, old_forms[form]))
      return False
    if form not in old_forms:
      pagemsg("WARNING: for original %s and new %s, form %s=%s in new forms but missing in old forms" % (
        unicode(origt), unicode(newt), form, new_forms[form]))
      return False
    if set(new_forms[form].split(",")) != set(old_forms[form].split(",")):
      pagemsg("WARNING: for original %s and new %s, form %s=%s in old forms but =%s in new forms" % (
        unicode(origt), unicode(newt), form, old_forms[form], new_forms[form]))
      return False
  pagemsg("%s and %s have same forms" % (unicode(origt), unicode(newt)))
  return True
Пример #16
0
def process_page(index, page, direc, delete_bad, verbose):
  """Disabled: logs a warning and returns None without touching the page.

  The unconditional return right after the warning makes all of the code
  below it unreachable. That code rewrites {{ru-conj}} calls whose first
  parameter is "3olda" to use `direc` instead and, when `delete_bad` is
  set, deletes form pages for past-tense forms that the old call generated
  but the new one does not.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # NOTE: everything from here to the end of the function is unreachable
  # because of the return above.
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  direc = direc.replace("3oa", u"3°a")
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj"]:
      conjtype = getparam(t, "1")
      if not conjtype.startswith("3olda"):
        continue
      if conjtype.startswith("3olda") and conjtype != "3olda":
        pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
        continue
      # Forms generated by the template as it currently stands.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      oldargs = blib.split_generate_args(result)
      # Rewrite the template in place: drop params 6, 5, 4 and set param 1.
      rmparam(t, "6")
      rmparam(t, "5")
      rmparam(t, "4")
      t.add("1", direc)
      # Forms generated by the rewritten template.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      if delete_bad:
        newargs = blib.split_generate_args(result)
        for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
            "past_f_short", "past_n_short", "past_pl_short"]:
          oldforms = re.split(",", oldargs[form]) if form in oldargs else []
          newforms = re.split(",", newargs[form]) if form in newargs else []
          for oldform in oldforms:
            # Only forms that disappeared after the rewrite are candidates.
            if oldform not in newforms:
              formpagename = rulib.remove_accents(oldform)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
              else:
                # Rebinds `text` (set from page.text above) to the form page's text.
                text = unicode(formpage.text)
                if "Etymology 1" in text:
                  pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                elif "----" in text:
                  pagemsg("WARNING: Multiple languages apparently in form, skippin form %s" % formpagename)
                else:
                  # Delete only pages with exactly one {{inflection of|...}}.
                  numinfls = len(re.findall(r"\{\{inflection of\|", text))
                  if numinfls < 1:
                    pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                  elif numinfls > 1:
                    pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                  else:
                    comment = "Delete erroneously created long form of %s" % pagetitle
                    pagemsg("Existing text for form %s: [[%s]]" % (
                      formpagename, text))
                    # NOTE(review): `save` is not a parameter of this function;
                    # presumably a module-level name — confirm before
                    # re-enabling this code.
                    if save:
                      formpage.delete(comment)
                    else:
                      pagemsg("Would delete page %s with comment=%s" %
                          (formpagename, comment))

      notes.append("fix 3olda -> %s" % direc)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Пример #17
0
def process_page(page, index, parsed):
    """Replace manually listed past passive participles with a variant code.

    For each {{ru-conj}} call that carries manual past_pasv_part*/ppp*
    parameters, a copy of the template without those parameters is expanded
    with parameter 2 extended by "+p" (and, depending on the first manual
    participle, also by the "+pё" / "+pжд" variants). When the automatically
    generated participles equal the manual ones, the manual parameters are
    removed from the real template and parameter 2 is updated.

    page: page object; must provide title() and be accepted by blib.parse().
    index: page index, used as a prefix in log messages.
    parsed: ignored; the page is re-parsed with blib.parse().

    Returns (new page text, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        # `args` is the module-level options object. This function must not
        # rebind that name; generated forms are stored in `generated` below.
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    pagemsg("Processing")

    manual_ppp_forms = [
        "past_pasv_part", "past_pasv_part2", "past_pasv_part3",
        "past_pasv_part4", "ppp", "ppp2", "ppp3", "ppp4"
    ]
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        tname = unicode(t.name)
        if tname == "ru-conj":
            manual_ppps = []
            for form in manual_ppp_forms:
                ppp = getparam(t, form)
                if ppp and ppp != "-":
                    manual_ppps.append(ppp)
            if not manual_ppps:
                continue
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
                        unicode(t))
                continue
            curvariant = getparam(t, "2")
            if "+p" in curvariant or "(7)" in curvariant or "(8)" in curvariant:
                pagemsg(
                    "WARNING: Found both manual PPP and PPP variant, something wrong: %s"
                    % unicode(t))
                continue
            # Work on a scratch copy so the real template is only modified
            # once a matching variant has been found.
            t2 = blib.parse_text(unicode(t)).filter_templates()[0]
            for form in manual_ppp_forms:
                rmparam(t2, form)
            variants_to_try = ["+p"]
            if u"ё" in re.sub(u"ённый$", "", manual_ppps[0]):
                variants_to_try.append(u"+pё")
            if u"жденный" in manual_ppps[0] or u"ждённый" in manual_ppps[0]:
                variants_to_try.append(u"+pжд")
            notsamemsgs = []
            for variant in variants_to_try:
                t2.add("2", curvariant + variant)
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                                  unicode(t2))
                result = expand_text(tempcall)
                if not result:
                    pagemsg("WARNING: Error generating forms, skipping")
                    continue
                # Previously stored in `args` under a `global args`
                # declaration, which replaced the options object with this
                # dict and broke the next expand_text() call.
                generated = blib.split_generate_args(result)
                if "past_pasv_part" not in generated:
                    pagemsg(
                        "WARNING: Something wrong, no past passive participle generated: %s"
                        % unicode(t))
                    continue
                auto_ppps = []
                for form in manual_ppp_forms:
                    if form in generated:
                        for ppp in re.split(",", generated[form]):
                            if ppp and ppp != "-":
                                auto_ppps.append(ppp)
                if manual_ppps == auto_ppps:
                    pagemsg(
                        "Manual PPP's %s same as auto-generated PPP's, switching to auto"
                        % ",".join(manual_ppps))
                    for form in manual_ppp_forms:
                        rmparam(t, form)
                    t.add("2", curvariant + variant)
                    notes.append("replaced manual PPP's with variant %s" %
                                 variant)
                    break
                else:
                    notsamemsgs.append(
                        "WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s"
                        % (",".join(manual_ppps), ",".join(auto_ppps),
                           unicode(t)))
            else:  # no break in for loop
                # No variant reproduced the manual forms: report each mismatch.
                for m in notsamemsgs:
                    pagemsg(m)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Пример #18
0
def process_page(page, index, do_fix):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"]:
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
                        unicode(t))
                continue
            conjtype = getparam(t, "2")
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                                  unicode(t))
            else:
                tempcall = re.sub(r"\{\{ru-conj-old",
                                  "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = blib.split_generate_args(result)
            for base in ["past_pasv_part", "ppp"]:
                forms_to_remove = []
                if args[base] == "-":
                    continue
                for form in re.split(",", args[base]):
                    origform = form
                    form = re.sub("//.*", "", form)
                    fix_form = False
                    if not re.search(ur"([аяеё]́?нный|тый)$", form):
                        pagemsg(
                            "WARNING: Past passive participle doesn't end correctly: %s"
                            % form)
                        fix_form = True
                    unstressed_page = rulib.make_unstressed_ru(pagetitle)
                    unstressed_form = rulib.make_unstressed_ru(form)
                    warned = False
                    if unstressed_form[0] != unstressed_page[0]:
                        pagemsg(
                            "WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s"
                            % form)
                        warned = True
                        fix_form = True
                    if form.endswith(u"нный"):
                        if pagetitle.endswith(u"ать"):
                            good_ending = u"анный"
                        elif pagetitle.endswith(u"ять"):
                            good_ending = u"янный"
                        else:
                            good_ending = u"енный"
                        if not unstressed_form.endswith(good_ending):
                            pagemsg(
                                "WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s"
                                % form)
                            warned = True
                            fix_form = True
                    if not warned:
                        correct_form = form_ppp(conjtype, pagetitle, args)
                        if correct_form and unstressed_form != correct_form:
                            pagemsg(
                                "WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s"
                                % (unstressed_form, correct_form))
                            fix_form = True
                    if fix_form:
                        forms_to_remove.append(origform)
                if forms_to_remove and do_fix:
                    curvals = []
                    for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                        val = getparam(t, base + i)
                        if val:
                            curvals.append(val)
                    newvals = [x for x in curvals if x not in forms_to_remove]
                    if len(curvals) - len(newvals) != len(forms_to_remove):
                        pagemsg(
                            "WARNING: Something wrong, couldn't remove all PPP forms %s"
                            % ",".join(forms_to_remove))
                    curindex = 1
                    origt = unicode(t)
                    for newval in newvals:
                        t.add(base + ("" if curindex == 1 else str(curindex)),
                              newval)
                        curindex += 1
                    for i in xrange(curindex, 10):
                        rmparam(t, base + ("" if i == 1 else str(i)))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                    notes.append("removed bad past pasv part(s) %s" %
                                 ",".join(forms_to_remove))
Пример #19
0
def process_page(page, index):
    """Process the non-lemma forms of the Latin noun described on *page*.

    The Latin section of the page is located with
    ``lalib.find_latin_section`` and must contain exactly one {{la-noun}} or
    exactly one {{la-proper noun}} template (not both, and not an
    indeclinable one).  That template is turned into a
    {{la-generate-noun-forms}} call, and every generated form that contains
    no link markup and whose macron-less spelling differs from the page
    title is handed, once per spelling, to ``blib.do_edit`` with a handler
    that calls ``process_form``.

    page: page object; ``title()`` and ``text`` are read.
    index: value shown in the log messages of this page.

    Returns None in every case.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    # Only the section body is needed; the unpacking still checks the shape.
    _sections, _j, secbody, _sectail, _has_non_latin = retval

    # Headword template found so far, per template name.
    found = {"la-noun": None, "la-proper noun": None}
    duplicate_warnings = {
        "la-noun":
        "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping",
        "la-proper noun":
        "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping",
    }
    for t in blib.parse_text(secbody).filter_templates():
        tn = tname(t)
        if tn not in found:
            continue
        earlier = found[tn]
        if earlier:
            pagemsg(duplicate_warnings[tn] % (unicode(earlier), unicode(t)))
            return
        found[tn] = t

    noun_t = found["la-noun"]
    proper_noun_t = found["la-proper noun"]
    if noun_t and proper_noun_t:
        pagemsg(
            "WARNING: Saw both noun and proper noun, can't correct header/headword"
        )
        return
    if not (noun_t or proper_noun_t):
        pagemsg(
            "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
        )
        return
    if proper_noun_t:
        pos = "pn"
        ht = proper_noun_t
    else:
        pos = "n"
        ht = noun_t
    if getparam(ht, "indecl"):
        pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
        return

    # Work on a re-parsed copy so the headword template itself stays intact.
    generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
    blib.set_template_name(generate_template, "la-generate-noun-forms")
    for chain in ("lemma", "m", "f", "g"):
        blib.remove_param_chain(generate_template, chain, chain)
    for single in ("type", "indecl", "id", "pos"):
        rmparam(generate_template, single)
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return

    tempargs = blib.split_generate_args(result)
    forms_seen = set()
    slots_and_forms_to_process = []
    for slot, formarg in tempargs.iteritems():
        for form in formarg.split(","):
            # Forms containing link markup are not plain page names.
            if "[" in form or "|" in form:
                continue
            bare_form = lalib.remove_macrons(form)
            # Skip the lemma page itself and spellings already queued.
            if bare_form == pagetitle or bare_form in forms_seen:
                continue
            forms_seen.add(bare_form)
            slots_and_forms_to_process.append((slot, form))

    ordered = sorted(slots_and_forms_to_process,
                     key=lambda pair: lalib.remove_macrons(pair[1]))
    for form_index, (form_slot, form_text) in blib.iter_items(ordered):

        def handler(page, index, parsed):
            return process_form(page, index, form_slot, form_text, pos)

        target = pywikibot.Page(site, lalib.remove_macrons(form_text))
        blib.do_edit(target,
                     form_index,
                     handler,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
Пример #20
0
def process_text_on_page(index, pagetitle, text):
    """Replace old-style German verb headwords with new-style {{de-verb}}.

    Templates are scanned in order.  A headword template ({{de-verb-old}},
    {{de-verb-strong}}, {{de-verb-weak}} or {{head|de|verb}}) is remembered
    until the next {{de-conj}} is seen.  At that point:

    * every non-empty headword parameter must be in the allowed list for
      that headword template, otherwise processing is aborted;
    * for weak and strong headwords, the principal parts given in the
      headword are compared with the properties generated from the
      {{de-conj}} call; on disagreement the headword is left alone;
    * otherwise the headword's parameters are dropped, it is renamed to
      ``de-verb`` and receives parameter 1 of the {{de-conj}} call (if any).

    index, pagetitle: used in log messages and for template expansion.
    text: wikitext to process.

    Returns ``(new_text, notes)`` after a complete scan, or None when the
    page is abandoned (two headwords in a row, a conjugation without a
    headword, an unknown headword parameter or an unrecognized auxiliary).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    headt = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["de-verb-old", "de-verb-strong", "de-verb-weak"] or (
                tn == "head" and getparam(t, "1") == "de"
                and getparam(t, "2") == "verb"):
            if headt:
                pagemsg(
                    "WARNING: Encountered headword twice without declension: old %s, current %s"
                    % (unicode(headt), unicode(t)))
                return
            headt = t
            headtn = tn
        if tn != "de-conj":
            continue
        if not headt:
            pagemsg("WARNING: Encountered conj without headword: %s" %
                    unicode(t))
            return

        # The allowed parameters depend only on the headword template, so
        # work them out once rather than once per parameter.
        if headtn == "head":
            allowed_params = ["1", "2", "head"]
        elif headtn == "de-verb-weak":
            allowed_params = ["1", "2", "3", "auxiliary", "cat"]
        elif headtn == "de-verb-strong":
            allowed_params = [
                "1", "2", "3", "class", "class 2", "pres 2",
                "pres 2 qual", "past 2", "past 2 qual",
                "past participle 2", "past participle 2 qual",
                "past subjunctive", "past subjunctive 2",
                "past subjunctive 2 qual", "auxiliary", "cat"
            ]
        else:
            allowed_params = ["head"]
        # 4= is tolerated only when it holds an auxiliary abbreviation,
        # which fetch_aux() below knows how to interpret.
        if getparam(headt, "4") in ["h", "haben", "s", "sein"]:
            allowed_params.append("4")
        for param in headt.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pv and pn not in allowed_params:
                pagemsg("WARNING: Encountered unknown param %s=%s in %s" %
                        (pn, pv, unicode(headt)))
                return

        def canonicalize_existing(forms):
            # Split "a or b" / "a,b" alternatives and strip link markup.
            forms = [re.sub(" '*or'* ", ",", form) for form in forms]
            forms = [
                splitform for form in forms
                for splitform in form.split(",")
            ]
            return [blib.remove_links(form) for form in forms if form]

        def compare(old, new, entities_compared):
            # An unspecified old value never conflicts with the new one.
            if not old:
                return True
            if set(old) != set(new):
                pagemsg(
                    "WARNING: Old %s %s disagree with new %s %s: head=%s, decl=%s"
                    % (entities_compared, ",".join(old), entities_compared,
                       ",".join(new), unicode(headt), unicode(t)))
                return False
            return True

        def fetch_aux():
            # Returns a list of auxiliaries, or None if auxiliary= is bad.
            aux = getparam(headt, "auxiliary")
            if aux in ["haben", "sein"]:
                aux = [aux]
            elif aux == "both":
                aux = ["haben", "sein"]
            elif not aux:
                aux = []
            else:
                pagemsg(
                    "WARNING: Unrecognized auxiliary=%s, skipping: %s" %
                    (aux, unicode(headt)))
                return None
            if not aux:
                param4 = getparam(headt, "4")
                if param4 in ["h", "haben"]:
                    aux = ["haben"]
                elif param4 in ["s", "sein"]:
                    aux = ["sein"]
            return aux

        if headtn in ["de-verb-weak", "de-verb-strong"]:
            generate_template = re.sub(
                r"^\{\{de-conj(?=[|}])",
                "{{User:Benwing2/de-generate-verb-props", unicode(t))
            result = expand_text(generate_template)
            if not result:
                continue
            forms = blib.split_generate_args(result)

            def existing(*params):
                # Canonicalized values of the given headword parameters.
                return canonicalize_existing(
                    [getparam(headt, param) for param in params])

            def agrees(old, slot, entities_compared):
                # Compare headword values with one generated slot.
                return compare(old,
                               forms.get(slot, "-").split(","),
                               entities_compared)

            if headtn == "de-verb-weak":
                pres_3s = existing("1")
                past = existing("2")
                pp = existing("3")
                aux = fetch_aux()
                if aux is None:
                    return
                consistent = (agrees(pres_3s, "pres_3s", "pres 3sgs")
                              and agrees(past, "pret_3s", "pasts")
                              and agrees(pp, "perf_part", "pp's")
                              and agrees(aux, "aux", "auxes"))
            else:
                pres_3s = existing("1", "pres 2")
                past = existing("2", "past 2")
                pp = existing("3", "past participle 2")
                past_subj = existing("past subjunctive",
                                     "past subjunctive 2")
                clazz = existing("class", "class 2")
                aux = fetch_aux()
                if aux is None:
                    return
                consistent = (agrees(pres_3s, "pres_3s", "pres 3sgs")
                              and agrees(past, "pret_3s", "pasts")
                              and agrees(pp, "perf_part", "pp's")
                              and agrees(past_subj, "subii_3s", "past subjs")
                              and agrees(aux, "aux", "auxes")
                              and agrees(clazz, "class", "classes"))
            if not consistent:
                headt = None
                continue

        # The template being rewritten is the headword, not the de-conj
        # template `t`, so the replacement is logged against the headword.
        orig_headt = unicode(headt)
        del headt.params[:]
        blib.set_template_name(headt, "de-verb")
        arg1 = getparam(t, "1")
        if arg1:
            headt.add("1", arg1)
        pagemsg("Replaced <%s> with <%s>" % (orig_headt, unicode(headt)))
        notes.append("replace {{%s|...}} with new-style {{de-verb%s}}" %
                     ("head|de|verb" if headtn == "head" else headtn,
                      "|" + arg1 if arg1 else ""))
        headt = None

    return unicode(parsed), notes
Пример #21
0
def process_section(index, pagetitle, sectext):
  """Replace a {{be-conj-manual}} template by {{be-conj}} when forms agree.

  The section must contain exactly one {{be-conj-manual}} template and
  exactly one ``<!-- type ... -->`` comment holding the automatic
  conjugation spec.  The spec is expanded through
  {{User:Benwing2/be-generate-verb-forms}} and the generated forms are
  compared (via ``compare_forms``) with the forms given manually in the
  template.  On agreement the template is rewritten to {{be-conj|<spec>}}
  and the type comment is removed from the section text.

  Returns ``(sectext, notes)``; ``sectext`` is the unchanged input whenever
  the section is skipped before the comparison is made.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(sectext)

  # Locate the one and only manual conjugation template.
  conjt = None
  for t in parsed.filter_templates():
    if tname(t) != "be-conj-manual":
      continue
    if conjt:
      pagemsg("WARNING: Saw two conjugation templates %s and %s, skipping" %
        (unicode(conjt), unicode(t)))
      return sectext, notes
    conjt = t
  if not conjt:
    pagemsg("WARNING: Couldn't find conjugation template")
    return sectext, notes

  # Locate the one and only "<!-- type ... -->" comment.
  autoconj = None
  for m in re.finditer("<!-- type (.*?) -->", sectext):
    if autoconj:
      pagemsg("WARNING: Saw two autoconj comments %s and %s, skipping" % (
        autoconj, m.group(1)))
      return sectext, notes
    # Anything from " PPP=" / " PPP:" onwards is not part of the spec.
    autoconj = re.sub(" PPP[=:].*", "", m.group(1))
    if " " in autoconj:
      pagemsg("WARNING: Space in autoconj, skipping: %s" % autoconj)
      return sectext, notes
  if not autoconj:
    pagemsg("WARNING: Couldn't find autoconj comment")
    return sectext, notes

  # A spec not starting with "((" still needs the infinitive in front.
  if not autoconj.startswith("(("):
    infinitive = getparam(conjt, "infinitive").strip()
    if not infinitive:
      pagemsg("WARNING: Couldn't find infinitive=: %s" % unicode(conjt))
      return sectext, notes
    autoconj = "%s<%s>" % (infinitive, autoconj)

  result = expand_text("{{User:Benwing2/be-generate-verb-forms|%s}}" % autoconj)
  if not result:
    return sectext, notes
  pagemsg(result)
  predforms = blib.split_generate_args(result)

  # Collect the manually specified forms under the generator's slot names.
  forms = {}
  aspect = getparam(conjt, "aspect").strip()
  for slot in be_conj_slots:
    form = getparam(conjt, slot).strip()
    if not form or form == "-":
      continue
    if slot.startswith("pres_futr_"):
      # Perfective verbs keep the "futr_" half of the slot name,
      # all others the "pres_" half.
      dropped = "pres_" if aspect == "pf" else "futr_"
      forms[slot.replace(dropped, "")] = form
    else:
      forms[slot] = form

  if compare_forms(autoconj, forms, predforms, pagemsg):
    origt = unicode(conjt)
    conjt.name = "be-conj"
    del conjt.params[:]
    conjt.add("1", autoconj)
    newt = unicode(conjt)
    pagemsg("Replaced %s with %s" % (origt, newt))
    notes.append("replace {{be-conj-manual|...}} with %s" % newt)

  sectext = unicode(parsed)
  if notes:
    # The type comment is no longer needed once the template is converted.
    sectext = re.sub("<!-- type (.*?) -->", "", sectext)

  return sectext, notes
Пример #22
0
def process_page(index, page, save, verbose, nouns, adjectives):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")
    return

  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-conj":
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      if "infinitive" not in args: # e.g. обнимать
        pagemsg("WARNING: No infinitive")
        continue
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
        continue
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
        continue
      ppp = form_ppp(conjtype, pagetitle, args)
      if not ppp:
        continue
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      else:
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
        else:
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
      else:
        stem = rulib.remove_monosyllabic_accents(infinitive)

      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
        else:
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
          else:
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))

      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
        else:
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))

      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
        else:
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))
Пример #23
0
def _la_strip_headword_only_params(generate_template, chains):
    """Remove the given parameter chains plus type/indecl/id/pos in place."""
    for chain in chains:
        blib.remove_param_chain(generate_template, chain, chain)
    rmparam(generate_template, "type")
    # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
    rmparam(generate_template, "indecl")
    rmparam(generate_template, "id")
    rmparam(generate_template, "pos")


def _la_lemma_from_generated_forms(generate_template, generator_name, slots,
                                   pagemsg, expand_text):
    """Expand *generate_template* and return the value of the first known slot.

    Returns the comma-split list of values of the first of *slots* present
    in the expansion, or "" (after logging a warning) when the expansion
    fails, none of the slots is present, or the slot is empty.
    """
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return ""
    args = blib.split_generate_args(result)
    for slot in slots:
        if slot in args:
            lemma = args[slot]
            # "".split(",") is [""], which is truthy and would defeat the
            # caller's fallback to the page name, so keep "" as it is.
            return lemma.split(",") if lemma else ""
    pagemsg(
        "WARNING: Can't locate lemma in {{%s}} result: generate_template=%s, result=%s"
        % (generator_name, unicode(generate_template), result))
    return ""


def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
    """Return the list of headword (lemma) strings for Latin headword template *t*.

    An explicit ``lemma=`` chain wins where the template supports one.
    Otherwise adjective-, noun- and verb-like templates are converted into
    the corresponding ``la-generate-*-forms`` call and the lemma is taken
    from the generated linked forms; other templates use their first or
    ``head=`` parameter.  Whenever no headword can be determined the page
    name is used instead.

    t: parsed headword template.
    pagename: page title, used as the fallback and for template expansion.
    pagemsg: callable taking one message string, used for warnings.
    expand_text: optional callable expanding a template call; defaults to
        ``blib.expand_text`` on *pagename* with verbosity off.

    Returns a non-empty list.
    """
    if not expand_text:

        def expand_text(tempcall):
            return blib.expand_text(tempcall, pagename, pagemsg, False)

    tn = tname(t)
    if tn in [
            "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det",
            "la-pronoun"
    ]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            retval = getparam(t, "1")
            if "<" in retval or "((" in retval or " " in retval or "-" in retval:
                # Work on a re-parsed copy so *t* itself is not modified.
                generate_template = blib.parse_text(
                    unicode(t)).filter_templates()[0]
                blib.set_template_name(generate_template,
                                       "la-generate-adj-forms")
                _la_strip_headword_only_params(
                    generate_template, ["comp", "sup", "adv", "lemma"])
                retval = _la_lemma_from_generated_forms(
                    generate_template, "la-generate-adj-forms",
                    ["linked_nom_sg_m", "linked_nom_pl_m"], pagemsg,
                    expand_text)
            else:
                retval = re.sub("/.*", "", retval)
    elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-noun-forms")
            _la_strip_headword_only_params(generate_template,
                                           ["lemma", "m", "f", "g"])
            retval = _la_lemma_from_generated_forms(
                generate_template, "la-generate-noun-forms",
                ["linked_nom_sg", "linked_nom_pl"], pagemsg, expand_text)
    elif tn in ["la-verb", "la-suffix-verb"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            generate_template = blib.parse_text(
                unicode(t)).filter_templates()[0]
            blib.set_template_name(generate_template, "la-generate-verb-forms")
            rmparam(generate_template, "id")
            retval = _la_lemma_from_generated_forms(
                generate_template, "la-generate-verb-forms", [
                    "linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
                    "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"
                ], pagemsg, expand_text)
    elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
            tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
        retval = getparam(t, "1")
    elif tn == "la-letter":
        retval = pagename
    elif tn in ["head", "la-prep"]:
        retval = blib.fetch_param_chain(t, "head", "head")
    elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
        retval = blib.fetch_param_chain(t, "1", "head")
    else:
        pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
        retval = ""
    # Any empty result ("" or []) falls back to the page name.
    retval = retval or pagename
    if not isinstance(retval, list):
        retval = [retval]
    return retval
Пример #24
0
def process_page(page, index, parsed):
    """Turn {{ru-decl-noun-see}} into {{ru-noun-table}} using the headword's args.

    The page must contain exactly one {{ru-noun+}} / {{ru-proper noun+}}
    headword template and exactly one {{ru-decl-noun-see}} template.  The
    latter gets the headword's parameters and is renamed to
    ``ru-noun-table``; for proper nouns the ``n=`` parameter is adjusted so
    that both templates produce the same number.

    page: page object; ``title()`` is read and the object is passed to
        ``blib.parse``.
    index: value shown in log messages.
    parsed: ignored; the page is parsed again here.

    Returns ``(new_text, change_note)`` on success and None when the page is
    skipped.
    """
    global args
    verbose = args.verbose
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    parsed = blib.parse(page)

    headword_template = None
    see_template = None
    for t in parsed.filter_templates():
        template_name = unicode(t.name)
        if template_name in ["ru-noun+", "ru-proper noun+"]:
            if headword_template:
                pagemsg("WARNING: Multiple headword templates, skipping")
                return
            headword_template = t
        if template_name in ["ru-decl-noun-see"]:
            if see_template:
                pagemsg(
                    "WARNING: Multiple ru-decl-noun-see templates, skipping")
                return
            see_template = t
    if not headword_template:
        pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
        return
    if not see_template:
        pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
        return

    # Give the declension template the headword's parameters and new name.
    del see_template.params[:]
    for param in headword_template.params:
        see_template.add(param.name, param.value)
    see_template.name = "ru-noun-table"

    if unicode(headword_template.name) == "ru-proper noun+":
        # Proper nouns default to n=sg while ru-noun-table defaults to
        # n=both, so the effective number of the headword has to be
        # computed and, where needed, spelled out in ru-noun-table.

        # Expand the headword as ru-generate-noun-args with |ndef=sg, which
        # reproduces the singular default of ru-proper noun+.
        headword_generate_template = re.sub(
            r"\}\}$", "|ndef=sg}}",
            re.sub(r"^\{\{ru-proper noun\+", "{{ru-generate-noun-args",
                   unicode(headword_template)))
        headword_generate_result = expand_text(headword_generate_template)
        if not headword_generate_result:
            pagemsg("WARNING: Error generating ru-proper noun+ args")
            return None
        headword_n = blib.split_generate_args(headword_generate_result)["n"]

        if headword_n == "s":
            # Singular always has to be requested explicitly.
            see_template.add("n", "sg")
        elif headword_n != "p":
            # (For plural nothing needs doing: either the lemma is plural
            # and both templates default to plural, or n=pl was copied.)
            #
            # "Both" must have been set explicitly in the headword but is
            # the ru-noun-table default unless the lemma is plural: drop n=,
            # regenerate, and restore n=both only if the default differs.
            assert headword_n == "b"
            rmparam(see_template, "n")
            see_generate_result = expand_text(
                re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
                       unicode(see_template)))
            if not see_generate_result:
                pagemsg("WARNING: Error generating ru-noun-table args")
                return None
            if blib.split_generate_args(see_generate_result)["n"] != "b":
                see_template.add("n", "both")

    new_text = unicode(parsed)
    change_note = (
        "Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)"
        % unicode(headword_template.name))
    return new_text, change_note
def process_page_section(index, page, section, verbose):
  """Reconcile a section's noun headword template with its ru-noun-table.

  The section is only processed when it contains exactly one
  ``ru-noun-table`` and exactly one ``ru-noun+`` / ``ru-proper noun+``
  headword.  In that case:

    * g=/m=/f= style params are stripped from the ``ru-noun-table``;
    * the headword's declension params (everything except the g/m/f chains
      and notrcat=) are compared with the table's params.  If they differ
      only by links present in the headword, the headword params are copied
      into the table; otherwise the headword is overwritten from the table,
      keeping its own g/m/f chains and notrcat=.

  Args:
    index: Page index, used only in log messages.
    page: Page object; only ``title()`` and ``exists()`` are called on it.
    section: Wikitext of the section to process.
    verbose: Forwarded to ``blib.expand_text``; also enables extra logging.

  Returns:
    None when the section has to be skipped (a reason is logged), else the
    tuple ``(new_text, ru_noun_table_cleaned, ru_noun_table_link_copied,
    ru_noun_changed, ru_proper_noun_changed)``; the last four are 0/1 flags.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  noun_table_templates = []
  noun_old_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if not noun_table_templates:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = [t for t in parsed.filter_templates()
      if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]]

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if not headword_templates:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if noun_old_templates else None
  headword_template = headword_templates[0]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Remember the headword-only params so they can be restored if the
  # headword gets overwritten from the declension table below.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")

  # Build a copy of the headword, renamed to ru-noun+ and without the
  # headword-only params, so it can be compared against the decl template.
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for param in headword_template.params:
    name = unicode(param.name)
    if not (re.search("^[gmf][0-9]*$", name) or name == "notrcat"):
      filtered_headword_template.add(param.name, param.value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Strip g=, m=, f= (and numbered variants) from the declension table.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Scratch copy of the decl template; n= may be adjusted on it before it is
  # turned into the proposed headword.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = blib.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Only the template's own closing braces get |ndef=sg; a plain
      # str.replace() would also rewrite the "}}" of any nested template
      # inside a parameter value.
      generate_template_with_ndef = re.sub(r"\}\}$", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = blib.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword:
    # Replace the headword params with the decl params, then restore the
    # headword-only params saved earlier.
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)

  final_noun_table_template = unicode(noun_table_template)
  if final_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      final_noun_table_template))

  final_headword_template = unicode(headword_template)
  if final_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      final_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
Пример #26
0
def process_page(page, index, parsed):
    """Convert each ``ru-decl-noun-z`` template on a page to ``ru-noun-table``.

    For every z-decl template a replacement ``ru-noun-table`` is obtained
    from ``runounlib.convert_zdecl_to_ru_noun_table``.  When the page has
    exactly one old-style headword (``ru-noun`` / ``ru-proper noun``), the
    proposed table is expanded through ``ru-generate-noun-args`` and checked
    against the headword with ``runounlib.check_old_noun_headword_forms``;
    templates that fail conversion, expansion or the check are left
    untouched.

    Args:
        page: Page object; its title is used and it is re-parsed with
            ``blib.parse``.
        index: Page index, used only in log messages.
        parsed: Ignored; the page is parsed afresh below.

    Returns:
        None if the page title contains a colon, otherwise the tuple
        ``(new_page_text, save_comment)``.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub(".*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        # `args` here is the module-level options object; this function must
        # never rebind that name (see `generate_args` below).
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    parsed = blib.parse(page)

    headword_templates = []
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            headword_templates.append(t)

    headword_template = None
    if len(headword_templates) > 1:
        pagemsg(
            "WARNING: Multiple old-style headword templates, not sure which one to use, using none"
        )
        for ht in headword_templates:
            pagemsg("Ignored headword template: %s" % unicode(ht))
    elif len(headword_templates) == 0:
        pagemsg("WARNING: No old-style headword templates")
    else:
        headword_template = headword_templates[0]
        pagemsg("Found headword template: %s" % unicode(headword_template))

    num_z_decl = 0
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-z":
            num_z_decl += 1
            pagemsg("Found z-decl template: %s" % unicode(t))
            ru_noun_table_template = runounlib.convert_zdecl_to_ru_noun_table(
                t, subpagetitle, pagemsg, headword_template=headword_template)
            if not ru_noun_table_template:
                pagemsg("WARNING: Unable to convert z-decl template: %s" %
                        unicode(t))
                continue

            if headword_template:
                generate_template = re.sub(r"^\{\{ru-noun-table",
                                           "{{ru-generate-noun-args",
                                           unicode(ru_noun_table_template))
                if unicode(headword_template.name) == "ru-proper noun":
                    generate_template = re.sub(r"\}\}$", "|ndef=sg}}",
                                               generate_template)

                def pagemsg_with_proposed(text):
                    pagemsg("Proposed ru-noun-table template: %s" %
                            unicode(ru_noun_table_template))
                    pagemsg(text)

                generate_result = expand_text(unicode(generate_template))
                if not generate_result:
                    pagemsg_with_proposed(
                        "WARNING: Error generating noun args, skipping")
                    continue
                # Deliberately NOT named `args`: the original code declared
                # `global args` and assigned the generated-args dict to it,
                # clobbering the options object so that the next
                # `args.verbose` lookup in expand_text() failed.
                generate_args = blib.split_generate_args(generate_result)

                # This will check number mismatch and animacy mismatch
                new_genders = runounlib.check_old_noun_headword_forms(
                    headword_template, generate_args, subpagetitle,
                    pagemsg_with_proposed)
                if new_genders is None:
                    continue

            # Rewrite the z-decl template in place as ru-noun-table.
            origt = unicode(t)
            t.name = "ru-noun-table"
            del t.params[:]
            for param in ru_noun_table_template.params:
                t.add(param.name, param.value)
            pagemsg("Replacing z-decl %s with regular decl %s" %
                    (origt, unicode(t)))

    if num_z_decl > 1:
        pagemsg("WARNING: Found multiple z-decl templates (%s)" % num_z_decl)

    return unicode(parsed), "Replace ru-decl-noun-z with ru-noun-table"
def process_page(page, index, parsed):
    """Convert old-style noun headwords to the new style driven by the decl.

    Walks the templates of ``parsed`` in order.  Each ``<lang>-noun`` /
    ``<lang>-proper noun`` headword is remembered; when the following
    ``<lang>-ndecl`` template is reached, its generated forms are compared
    with the values listed in the headword.  If everything agrees, the
    headword's 1= is replaced with the declension spec and its 2=..5= chains
    are removed.

    Returns ``(new_text, notes)``, or None when processing is aborted (a
    second headword before any decl, a decl without headword, a failed
    expansion, or a mix of plural-only and other genders).
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def forms_agree(old_vals, new_vals, label, canonicalize=True):
        # An empty/missing old value never counts as a disagreement.
        if not old_vals:
            return True
        if canonicalize:
            if args.lang == "uk":
                strip_accents = uk.remove_monosyllabic_stress
            else:
                strip_accents = be.remove_monosyllabic_accents
            old_vals = [strip_accents(blib.remove_links(v)) for v in old_vals]
            new_vals = [strip_accents(v) for v in new_vals]
        if set(old_vals) == set(new_vals):
            return True
        # `headt` and `t` are the headword and decl currently being handled
        # by the loop below.
        pagemsg(
            "WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s"
            % (label, ",".join(old_vals), label, ",".join(new_vals),
               unicode(headt), unicode(t)))
        return False

    notes = []
    pagemsg("Processing")

    heads = headt = headtn = None
    gender_and_animacy = genitives = plurals = None

    for t in parsed.filter_templates():
        tn = tname(t)
        lang = args.lang

        if tn in (lang + "-noun", lang + "-proper noun"):
            if heads:
                pagemsg(
                    "WARNING: Encountered headword twice without declension: %s"
                    % unicode(t))
                return
            headt, headtn = t, tn
            heads = blib.fetch_param_chain(t, "1", "head")
            gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
            genitives = blib.fetch_param_chain(t, "3", "gen")
            plurals = blib.fetch_param_chain(t, "4", "pl")
            genitive_plurals = blib.fetch_param_chain(t, "5", "genpl")
            continue

        if tn != lang + "-ndecl":
            continue

        if not heads:
            pagemsg("WARNING: Encountered decl without headword: %s" %
                    unicode(t))
            return

        result = expand_text(
            re.sub(r"^\{\{%s-ndecl\|" % lang,
                   "{{User:Benwing2/%s-generate-prod-noun-props|" % lang,
                   unicode(t)))
        if not result:
            return
        new_forms = blib.split_generate_args(result)
        new_g = new_forms["g"].split(",")

        if not forms_agree(gender_and_animacy, new_g, "gender",
                           canonicalize=False):
            heads = None
            continue

        plural_flags = [g.endswith("-p") for g in new_g]
        plural_only = any(plural_flags)
        if plural_only and not all(plural_flags):
            pagemsg(
                "WARNING: Mixture of plural-only and non-plural-only genders, can't process: %s"
                % unicode(t))
            return

        # (old headword values, generated-form slot, label for messages);
        # checked in order, stopping at the first disagreement.
        if plural_only:
            checks = [(heads, "nom_p", "nom pl"),
                      (genitives, "gen_p", "gen pl")]
        else:
            checks = [(heads, "nom_s", "nom sg"),
                      (genitives, "gen_s", "gen sg")]
            # 'uk/be-proper noun' headwords don't have nominative plural set
            if headtn == lang + "-noun":
                checks.append((plurals, "nom_p", "nom pl"))
                checks.append((genitive_plurals, "gen_p", "gen pl"))
        if not all(
                forms_agree(old, new_forms.get(slot, "-").split(","), label)
                for old, slot, label in checks):
            heads = None
            continue

        decl = getparam(t, "1")
        blib.set_param_chain(headt, [decl], "1", "head")
        for first, pref in (("2", "g"), ("3", "gen"), ("4", "pl"),
                            ("5", "genpl")):
            blib.remove_param_chain(headt, first, pref)
        notes.append("convert {{%s}} to new style using decl %s" %
                     (unicode(headt.name), decl))
        heads = None

    return unicode(parsed), notes
Пример #28
0
def process_page(page, index, parsed):
    """Normalize the arguments of ru-conj style templates on a page.

    For each ``ru-conj`` / ``ru-conj-old`` template (also the
    ``User:Benwing2/`` variants and ``{{temp|ru-conj|...}}``), every argument
    set is rewritten so that its second element is a full infinitive: if the
    value already looks like an infinitive only monosyllabic accents are
    removed, otherwise the infinitive is rebuilt from the stem according to
    the conjugation type.  Unless the page title starts with
    ``User:Benwing2/``, the forms generated by ``ru-generate-verb-forms``
    for the old and new arguments are compared, and the template is only
    rewritten when they are identical.

    Args:
        page: Page object; its title and text are read and it is re-parsed
            with ``blib.parse``.
        index: Page index, used only in log messages.
        parsed: Rebound below before being read.

    Returns:
        The tuple ``(new_page_text, notes)`` where ``notes`` holds one
        change description per rewritten template.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errpagemsg(txt):
        # Same message through both msg() and errmsg().
        msg("Page %s %s: %s" % (index, pagetitle, txt))
        errmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    # NOTE(review): `text` is not referenced again in this function.
    text = unicode(page.text)
    # NOTE(review): this rebinds the `parsed` argument; the value passed in
    # by the caller is discarded.
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) in [
                "ru-conj", "ru-conj-old", "User:Benwing2/ru-conj",
                "User:Benwing2/ru-conj-old"
        ] or tname(t) == "temp" and getparam(t, "1") == "ru-conj":
            verb_type, arg_sets = split_ru_conj_args(t, tname(t) == "temp")
            refl = "refl" in verb_type
            # Keep an untouched copy for the before/after comparison below.
            orig_arg_sets = copy.deepcopy(arg_sets)
            rm_pres_stem = False

            ##### First, modify arg_sets according to normalized params

            for arg_set in arg_sets:
                # This complex spec matches 3°a, 3oa, 4a1a, 6c1a,
                # 1a6a, 6a1as13, 6a1as14, etc.
                m = re.search(u"^([0-9]+[°o0-9abc]*[abc]s?1?[34]?)",
                              arg_set[0])
                if not m:
                    m = re.search(
                        u"^(irreg-?[абцдеѣфгчийклмнопярстувшхызёюжэщьъ%-]*)",
                        arg_set[0])
                    if not m:
                        errpagemsg("Unrecognized conjugation type: %s" %
                                   arg_set[0])
                        continue
                # A Latin "o" in the type is treated as the degree sign.
                conj_type = m.group(1).replace("o", u"°")
                inf, tr = rulib.split_russian_tr(arg_set[1])
                # "New style" means arg_set[1] already ends like an
                # infinitive (reflexive endings when the verb is reflexive).
                if refl:
                    new_style = re.search(u"([тч]ься|ти́?сь)$", inf)
                else:
                    new_style = re.search(
                        u"([тч]ь|ти́?)$" if conj_type.startswith("7")
                        or conj_type.startswith("irreg") else u"[тч]ь$", inf)
                if new_style:
                    # Drop the verb name after "irreg-", keeping anything
                    # from a slash onwards.
                    if arg_set[0].startswith("irreg-"):
                        arg_set[0] = re.sub("^irreg-.*?(/.*|$)", r"irreg\1",
                                            arg_set[0])
                    arg_set[1] = rulib.paste_russian_tr(
                        rulib.remove_monosyllabic_accents(inf),
                        rulib.remove_tr_monosyllabic_accents(tr))
                else:
                    # Old style: rebuild the infinitive from the stem(s),
                    # per conjugation type.  Only types starting with 1, 2
                    # or 4 may carry a manual transliteration.
                    if not re.search("^[124]", conj_type):
                        assert not tr
                    if conj_type in ["1a", "2a", "2b"]:
                        inf += u"ть"
                        if tr:
                            tr += u"tʹ"
                    elif conj_type in ["3a", u"3°a"]:
                        inf += u"нуть"
                    elif conj_type in ["3b", u"3c"]:
                        inf += u"у́ть"
                    elif conj_type == "4a":
                        inf += u"ить"
                        if tr:
                            tr += u"itʹ"
                    elif conj_type in ["4b", "4c"]:
                        inf, tr = rulib.make_unstressed(
                            inf, rulib.decompose(tr))
                        inf += u"ить"
                        if tr:
                            tr += u"ítʹ"
                    elif conj_type == "4a1a":
                        inf = re.sub(u"[ая]$", "", inf) + u"ить"
                        if tr:
                            tr = re.sub("j?a$", "", tr) + u"itʹ"
                    elif conj_type == "5a":
                        inf = arg_set[2] + u"ть" if arg_set[
                            2] else arg_set[1] + u"еть"
                        normal_pres_stem = re.sub(u"[еая]ть$", "", inf)
                        # Keep the old first stem in slot 2 only when it
                        # differs from the one derivable from the infinitive.
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5b":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = re.sub(u"[еая]́ть$", "", inf)
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5c":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = rulib.make_ending_stressed_ru(
                            re.sub(u"[еая]́ть$", "", inf))
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif re.search(u"^6°?a", conj_type) or conj_type == "1a6a":
                        assert not arg_set[3]
                        if arg_set[2]:
                            inf = arg_set[2] + u"ть"
                            arg_set[2] = ""
                            normal_pres_stem = rulib.make_ending_stressed_ru(
                                re.sub(u"а́ть$", "", inf))
                            assert arg_set[1] == normal_pres_stem
                        elif is_vowel_stem(inf):
                            inf += u"ять"
                        else:
                            inf += u"ать"
                        # A named pres_stem= param moves into slot 2 and is
                        # flagged for removal from the template.
                        if getparam(t, "pres_stem"):
                            arg_set[2] = getparam(t, "pres_stem")
                            rm_pres_stem = True
                    elif re.search(u"^6°?b", conj_type):
                        if is_vowel_stem(inf):
                            inf += u"я́ть"
                        else:
                            inf += u"а́ть"
                        # arg_set[2] (present stem) remains
                    elif re.search(u"^6°?c", conj_type):
                        inf = rulib.make_unstressed_once_ru(inf) + u"а́ть"
                    elif conj_type in ["7a", "7b"]:
                        pass  # nothing needed to do
                    elif conj_type in ["8a", "8b"]:
                        inf = arg_set[2]
                        arg_set[2] = arg_set[1]
                    elif conj_type == "9a":
                        inf += u"еть"
                        # arg_set[2] (present stem) remains
                    elif conj_type == "9b":
                        inf = rulib.make_unstressed_once_ru(inf) + u"е́ть"
                        # arg_set[2] (present stem) remains
                        # arg_set[3] (optional past participle stem) remains
                    elif conj_type == "10a":
                        inf += u"оть"
                    elif conj_type == "10c":
                        inf += u"ть"
                        if rulib.make_unstressed_once_ru(arg_set[2]) == re.sub(
                                u"о́$", "", arg_set[1]):
                            arg_set[2] = ""
                    elif conj_type == "11a":
                        inf += u"ить"
                    elif conj_type == "11b":
                        inf += u"и́ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12a":
                        inf += u"ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12b":
                        inf += u"ть"
                        if rulib.make_ending_stressed_ru(
                                arg_set[2]) == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "13b":
                        inf += u"ть"
                        assert re.sub(u"ва́ть$", "", inf) == arg_set[2]
                        arg_set[2] = ""
                    elif conj_type in ["14a", "14b", "14c"]:
                        inf += u"ть"
                        # arg_set[2] (present stem) remains
                    elif conj_type in ["15a", "16a", "16b"]:
                        inf += u"ть"
                    elif conj_type == u"irreg-минуть":
                        inf = u"мину́ть"
                    elif conj_type == u"irreg-живописать-миновать":
                        inf += u"ть"
                        arg_set[2] = ""
                    elif conj_type == u"irreg-слыхать-видать":
                        inf += u"ть"
                    elif conj_type == u"irreg-стелить-стлать":
                        inf = arg_set[2] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type == u"irreg-ссать-сцать":
                        assert arg_set[2] == re.sub(u"а́$", "", inf)
                        inf = arg_set[3] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type in [
                            u"irreg-сыпать", u"irreg-ехать", u"irreg-ѣхать"
                    ]:
                        # arg_set[1] is used as a prefix here; the verb gets
                        # its own stress only when the prefix is not "вы́".
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_beginning_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    elif conj_type == u"irreg-обязывать":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́обязывать"
                        else:
                            inf = arg_set[1] + u"обя́зывать"
                    elif conj_type == u"irreg-зиждиться":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́зиждить"
                        else:
                            inf = arg_set[1] + u"зи́ждить"
                    elif conj_type == u"irreg-идти":
                        if not arg_set[1]:
                            inf = u"идти́"
                        elif arg_set[1] == u"вы́":
                            inf = u"вы́йти"
                        else:
                            inf = arg_set[1] + u"йти́"
                    elif re.search("^irreg-", conj_type):
                        # Remaining irregular verbs: prefix + verb named in
                        # the type, ending-stressed unless the prefix is "вы́".
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_ending_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    else:
                        # NOTE(review): `error` is not defined in this block;
                        # presumably a module-level helper -- confirm.
                        error("Unknown conjugation type " + conj_type)
                    if inf:
                        if refl:
                            # Reflexive suffix: "ся" after ть/чь, otherwise
                            # "сь" (the infinitive must then end in и).
                            if re.search(u"[тч]ь$", inf):
                                inf += u"ся"
                                if tr:
                                    tr += "sja"
                            else:
                                assert re.search(u"и́?$", inf)
                                inf += u"сь"
                                if tr:
                                    tr += u"sʹ"
                        arg_set[1] = rulib.paste_russian_tr(
                            rulib.remove_monosyllabic_accents(inf),
                            rulib.remove_tr_monosyllabic_accents(tr))

            ##### If something changed ...

            if orig_arg_sets != arg_sets or rm_pres_stem:

                ##### ... compare the forms generated by the original and new
                ##### arguments and make sure they're the same.

                # The comparison is skipped for pages under User:Benwing2/.
                if not pagetitle.startswith("User:Benwing2/"):
                    # 1. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the original arguments.

                    orig_args = paste_arg_sets(orig_arg_sets,
                                               t,
                                               verb_type,
                                               rm_pres_stem=False,
                                               as_string=True)
                    orig_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(orig_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    orig_result = expand_text(orig_tempcall)
                    if not orig_result:
                        errpagemsg(
                            "WARNING: Error expanding original template %s" %
                            orig_tempcall)
                        continue
                    orig_forms = blib.split_generate_args(orig_result)

                    # 2. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the new arguments.

                    new_args = paste_arg_sets(arg_sets,
                                              t,
                                              verb_type,
                                              rm_pres_stem,
                                              as_string=True)
                    new_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(new_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    new_result = expand_text(new_tempcall)
                    if not new_result:
                        errpagemsg("WARNING: Error expanding new template %s" %
                                   new_tempcall)
                        continue
                    new_forms = blib.split_generate_args(new_result)

                    # 3. Compare each form and accumulate a list of mismatches.

                    all_keys = set(orig_forms.keys()) | set(new_forms.keys())

                    def sort_numbers_first(key):
                        # Zero-pad all-digit keys to five places so that
                        # string sorting orders them numerically.
                        if re.search("^[0-9]+$", key):
                            return "%05d" % int(key)
                        return key

                    all_keys = sorted(list(all_keys), key=sort_numbers_first)
                    mismatches = []
                    for key in all_keys:
                        origval = orig_forms.get(key, "<<missing>>")
                        newval = new_forms.get(key, "<<missing>>")
                        if origval != newval:
                            mismatches.append("%s: old=%s new=%s" %
                                              (key, origval, newval))

                    # 4. If mismatches, output them and don't change anything.

                    if mismatches:
                        errpagemsg(
                            "WARNING: Mismatch comparing old %s to new %s: %s"
                            % (orig_tempcall, new_tempcall,
                               " || ".join(mismatches)))
                        continue

                # 5. If no mismatches, modify the template to contain the new args.

                new_params = paste_arg_sets(arg_sets,
                                            t,
                                            verb_type,
                                            rm_pres_stem,
                                            as_string=False,
                                            is_temp=tname(t) == "temp")
                del t.params[:]
                # For {{temp|ru-conj|...}} the wrapped template name is
                # itself param 1 and has to be restored first.
                if tname(t) == "temp":
                    t.add("1", "ru-conj")
                for name, value in new_params:
                    t.add(name, value)

                # 6. Build up the save comment.

                # NOTE(review): `t` has already been rewritten at this point;
                # confirm paste_arg_sets(change_only=True) does not depend on
                # the template's original params.
                orig_changed_params = paste_arg_sets(orig_arg_sets,
                                                     t,
                                                     verb_type,
                                                     rm_pres_stem=False,
                                                     as_string=True,
                                                     change_only=True)
                new_changed_params = paste_arg_sets(arg_sets,
                                                    t,
                                                    verb_type,
                                                    rm_pres_stem,
                                                    as_string=True,
                                                    change_only=True)
                notes.append("ru-conj: normalized %s to %s" %
                             ("|".join(orig_changed_params),
                              "|".join(new_changed_params)))

            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Пример #29
0
def _collect_subsection_heads(text, pagename, pagemsg, expand_text,
                              inflections_of, adj_forms):
    """Scan the templates in one subsection of a Russian entry for heads.

    Arguments:
      text: wikitext of the subsection body.
      pagename: title of the page being scanned; used as the head when a
        headword template specifies none, and in warning messages.
      pagemsg: one-argument logging callback.
      expand_text: one-argument callback that expands a template call.
      inflections_of: set owned by the caller; MUTATED here. For each
        inflection template, receives a pair of (frozenset of the heads seen
        so far in this subsection, normalized value of the template's
        param 1).
      adj_forms: set owned by the caller; MUTATED here. Receives every value
        generated from a ru-decl-adj template.

    Returns the set of (head, translit, is_lemma) tuples found in `text`.
    """
    this_heads = set()

    def add(val, tr, is_lemma):
        val_to_add = blib.remove_links(val)
        # Remove monosyllabic accents to correctly handle the case of
        # рад, which has some heads with an accent and some without.
        val_to_add, tr = remove_monosyllabic_accents(val_to_add, tr)
        this_heads.add((val_to_add, tr, is_lemma))

    for t in blib.parse_text(text).filter_templates():
        template_name = unicode(t.name)
        # Set when the template is a headword template, which may carry
        # additional heads in head2=, head3=, ... (with tr2=, tr3=, ...).
        check_addl_heads = False
        if template_name in ru_head_templates:
            is_lemma = template_name in ru_lemma_templates
            check_addl_heads = True
            if getparam(t, "1"):
                add(getparam(t, "1"), getparam(t, "tr"), is_lemma)
            elif getparam(t, "head"):
                add(getparam(t, "head"), getparam(t, "tr"), is_lemma)
            else:
                add(pagename, "", is_lemma)
        elif template_name == "head" and getparam(t, "1") == "ru":
            is_lemma = getparam(t, "2") in ru_lemma_poses
            check_addl_heads = True
            if getparam(t, "head"):
                add(getparam(t, "head"), getparam(t, "tr"), is_lemma)
            else:
                add(pagename, "", is_lemma)
        elif template_name in ["ru-noun+", "ru-proper noun+"]:
            is_lemma = True
            lemma = rulib.fetch_noun_lemma(t, expand_text)
            lemmas = [
                split_ru_tr(one_lemma, pagemsg)
                for one_lemma in re.split(",", lemma)
            ]
            # Group lemmas by Russian, to group multiple translits
            lemmas = rulib.group_translits(lemmas, pagemsg, semi_verbose)
            for val, tr in lemmas:
                add(val, tr, is_lemma)
        elif (template_name == "ru-participle of"
              or (template_name in inflection_templates
                  and getparam(t, "lang") == "ru")):
            # The heads are snapshotted as of this point in the subsection,
            # so heads appearing after the inflection template are excluded.
            inflections_of.add((frozenset(this_heads),
                                normalize_text(getparam(t, "1"))))

        if check_addl_heads:
            for i in xrange(2, 10):
                headn = getparam(t, "head" + str(i))
                if headn:
                    add(headn, getparam(t, "tr" + str(i)), is_lemma)
        elif template_name == "ru-decl-adj":
            # Only reached for non-headword templates. Expand the declension
            # into its individual forms and record all of them.
            result = expand_text(
                re.sub(r"^\{\{ru-decl-adj", "{{ru-generate-adj-forms",
                       unicode(t)))
            if not result:
                pagemsg(
                    "WARNING: lookup_heads_and_inflections: Error expanding template %s, page %s"
                    % (unicode(t), pagename))
            else:
                args = blib.split_generate_args(result)
                for value in args.itervalues():
                    adj_forms.add(value)

    return this_heads


def lookup_heads_and_inflections(pagename, pagemsg):
    """Find the Russian heads and inflection targets recorded on a page.

    Arguments:
      pagename: title of the page to look up.
      pagemsg: one-argument logging callback.

    Returns a 2-tuple (status, value).

    `status` is one of:
      "manual-override": the answer came from `terms_to_ignore` or
        `manually_specified_inflections`; neither the cache nor the wiki
        was consulted.
      True: the answer came from `accented_cache`.
      False: the answer was computed by this call.

    `value` is one of:
      None: the term is ignored, the page doesn't exist, the page name
        contains a newline, or checking for existence raised an error.
      "redirect": the page is a redirect.
      "no-russian": the page has no ==Russian== section.
      (heads, inflections_of, adj_forms): three sets, where `heads` holds
        (head, translit, is_lemma) tuples, `inflections_of` holds
        (frozenset of heads, lemma) pairs and `adj_forms` holds the forms
        generated from any ru-decl-adj templates.

    If the page has neither lemma heads nor inflection templates but refers
    to exactly one page via an alt-ё template, the result of looking up that
    page is returned instead (and is not cached under `pagename`).

    Side effects: increments the module-level `num_cache_lookups` and
    `num_cache_hits` counters and, unless `global_disable_cache` is set,
    stores the outcome in `accented_cache`.
    """
    global num_cache_lookups, num_cache_hits

    if semi_verbose:
        pagemsg("lookup_heads_and_inflections: Finding heads on page %s" %
                pagename)

    # Use our own expand_text() rather than passing it from the caller,
    # which may have a different value for PAGENAME; the proper value is
    # important in expanding certain templates e.g. ru-generate-adj-forms.
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose)

    # 1. Manual overrides take precedence over the cache and the wiki.

    if pagename in terms_to_ignore:
        pagemsg(
            "lookup_heads_and_inflections: Ignoring term because in terms_to_ignore: %s"
            % pagename)
        return "manual-override", None

    if pagename in manually_specified_inflections:
        accented, lemma = manually_specified_inflections[pagename]
        if lemma is True:
            # The term is itself a lemma.
            return "manual-override", ({(accented, "", True)}, set(), set())
        # Otherwise the term is a non-lemma form of `lemma`.
        manual_head = (accented, "", False)
        return "manual-override", ({manual_head}, {
            (frozenset({manual_head}), lemma)
        }, set())

    # 2. Consult the cache.

    num_cache_lookups += 1
    if pagename in accented_cache:
        num_cache_hits += 1
        result = accented_cache[pagename]
        if semi_verbose:
            if result is None:
                pagemsg(
                    "lookup_heads_and_inflections: Page %s doesn't exist (cached)"
                    % pagename)
            elif result == "redirect":
                pagemsg(
                    "lookup_heads_and_inflections: Page %s is redirect (cached)"
                    % pagename)
            elif result == "no-russian":
                pagemsg(
                    "lookup_heads_and_inflections: Page %s has no Russian section (cached)"
                    % pagename)
        return True, result

    # 3. Not cached: reject bad names, then fetch the page.

    if "\n" in pagename:
        pagemsg(
            "WARNING: lookup_heads_and_inflections: Bad pagename (has newline in it): %s"
            % pagename)
        if not global_disable_cache:
            accented_cache[pagename] = None
        return False, None

    page = pywikibot.Page(site, pagename)
    try:
        if not page.exists():
            if semi_verbose:
                pagemsg(
                    "lookup_heads_and_inflections: Page %s doesn't exist" %
                    pagename)
            if not global_disable_cache:
                accented_cache[pagename] = None
            return False, None
    except Exception as e:
        # NOTE: a failed existence check is cached the same way as a
        # nonexistent page, so it will not be retried while cached.
        pagemsg(
            "WARNING: lookup_heads_and_inflections: Error checking page existence: %s"
            % unicode(e))
        if not global_disable_cache:
            accented_cache[pagename] = None
        return False, None

    # Page exists, is it a redirect?
    if re.match("#redirect", page.text, re.I):
        if not global_disable_cache:
            accented_cache[pagename] = "redirect"
        pagemsg("lookup_heads_and_inflections: Page %s is redirect" %
                pagename)
        return False, "redirect"

    # 4. Page exists and is not a redirect, find the info.

    heads = set()
    inflections_of = set()
    adj_forms = set()

    foundrussian = False
    # Splitting on a capturing group leaves the level-2 headers at the odd
    # indices and the section bodies at the following even indices.
    sections = re.split("(^==[^=]*==\n)", unicode(page.text), 0, re.M)

    for j in xrange(2, len(sections), 2):
        if sections[j - 1] != "==Russian==\n":
            continue
        if foundrussian:
            pagemsg(
                "WARNING: lookup_heads_and_inflections: Found multiple Russian sections"
            )
            break
        foundrussian = True

        # Same layout as above: subsection bodies are at the even indices.
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[j], 0, re.M)
        for k in xrange(2, len(subsections), 2):
            heads.update(
                _collect_subsection_heads(subsections[k], pagename, pagemsg,
                                          expand_text, inflections_of,
                                          adj_forms))

    # No Russian section was found anywhere on the page.
    if not foundrussian:
        if not global_disable_cache:
            accented_cache[pagename] = "no-russian"
        pagemsg(
            "lookup_heads_and_inflections: Page %s has no Russian section"
            % pagename)
        return False, "no-russian"

    saw_lemma = any(is_lemma for ru, tr, is_lemma in heads)
    if not saw_lemma and not inflections_of:
        # If no lemmas or inflections found, check for alt-ё templates.
        # If the term is a non-ё variant of a single term with ё, look up
        # and return the heads and inflections on that page.
        parsed = blib.parse_text(unicode(page.text))
        yo_pages = set()
        for t in parsed.filter_templates():
            if unicode(t.name) in alt_yo_templates:
                yo_pages.add(getparam(t, "1"))
        if len(yo_pages) > 1:
            pagemsg(
                u"WARNING: lookup_heads_and_inflections: Found multiple alt-ё templates for different lemmas: %s"
                % ",".join(yo_pages))
        elif not yo_pages:
            pagemsg(
                "WARNING: lookup_heads_and_inflections: Found no lemmas or inflections of lemmas for %s"
                % pagename)
        else:
            yoful_page = next(iter(yo_pages))
            pagemsg(
                "lookup_heads_and_inflections: Redirecting from %s to %s" %
                (pagename, yoful_page))
            return lookup_heads_and_inflections(yoful_page, pagemsg)

    # 5. Record and return what was found.

    cacheval = (heads, inflections_of, adj_forms)
    if not global_disable_cache:
        accented_cache[pagename] = cacheval
    return False, cacheval
Пример #30
0
    for param in generate_template.params:
        proposed_decl.add(param.name, param.value)

    def pagemsg_with_proposed(text):
        pagemsg(
            "Proposed new template (WARNING, omits explicit gender and params to preserve from old template): %s"
            % proposed_template_text)
        pagemsg(text)

    if headword_is_proper:
        generate_template.add("ndef", "sg")
    generate_result = expand_text(unicode(generate_template))
    if not generate_result:
        pagemsg_with_proposed("WARNING: Error generating noun args, skipping")
        return
    genargs = blib.split_generate_args(generate_result)
    if headword_is_proper and genargs["n"] == "s" and not getparam(
            proposed_decl, "n"):
        proposed_decl.add("n", "sg")

    # This will check number mismatch (and animacy mismatch, but that shouldn't
    # occur as we've taken the animacy directly from the headword)
    new_genders = runounlib.check_old_noun_headword_forms(
        headword_template,
        genargs,
        subpagetitle,
        pagemsg_with_proposed,
        laxer_comparison=True)
    if new_genders == None:
        return None