Пример #1
0
 def insert_into_existing_pron_section(k):
   """Prepend a newly constructed pronunciation template to subsections[k].

   If the subsection already contains a template whose name is in
   `pronun_templates`, nothing is changed. Otherwise existing Polish
   rhyme/hyphenation/{{pl-IPA}} lines are removed, a single {{audio|pl}} line
   (if present) is folded into the new template as |a= / |ac=, and the new
   template is inserted at the top of the subsection.

   Returns True when the subsection was updated or already had a
   pronunciation template. Returns None, after logging a warning, when the
   {{audio}} line(s) cannot be handled; lines removed before that point stay
   removed from subsections[k].
   """
   def line_regex(template):
     # Match a whole line holding only the given template (optionally preceded
     # by bullet/space characters), including the trailing newline.
     return r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % template.replace("|", r"\|")

   parsed = blib.parse_text(subsections[k])
   for t in parsed.filter_templates():
     tn = tname(t)
     if tn in pronun_templates:
       pagemsg("Already saw pronunciation template: %s" % unicode(t))
       break
   else: # no break
     new_pron_template, pron_prefix = construct_new_pron_template()
     # Remove existing rhymes/hyphenation/pl-IPA lines
     for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
       regex = line_regex(template)
       m = re.search(regex, subsections[k], re.M)
       if m:
         pagemsg("Removed existing %s" % m.group(1).strip())
         notes.append("remove existing {{%s}}" % template)
         subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
     for template in ["audio|pl"]:
       regex = line_regex(template)
       all_audios = re.findall(regex, subsections[k], re.M)
       if len(all_audios) > 1:
         # all_audios is a list of matched lines; it must be iterated, not called.
         pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s" % ",".join(x.strip() for x in all_audios))
         return
       if len(all_audios) == 1:
         # The single matched audio line, used both for parsing and in warnings.
         audio_line = all_audios[0].strip()
         audiot = list(blib.parse_text(audio_line).filter_templates())[0]
         assert(tname(audiot) == "audio")
         if getparam(audiot, "1") != "pl":
           pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
           return
         audiofile = getparam(audiot, "2")
         audiogloss = getparam(audiot, "3")
         for param in audiot.params:
           pn = pname(param)
           pv = unicode(param.value)
           if pn not in ["1", "2", "3"]:
             pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
               pn, pv, audio_line))
             return
         # A gloss that just says "Audio" carries no information; drop it.
         if audiogloss in ["Audio", "audio"]:
           audiogloss = ""
         params = "|a=%s" % audiofile
         if audiogloss:
           params += "|ac=%s" % audiogloss
         # Splice the audio params in front of the last two characters of the
         # new template (presumably its closing braces).
         new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
         pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
         notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
         subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
     subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
     notes.append("insert %s into existing Pronunciation section" % new_pron_template)
   return True
def add_category(secbody, sectail, pagemsg, notes, cat):
    """Make sure category `cat` is present, adding it via {{cln|hu|...}}.

    A trailing "--"-style separator is split off `sectail` first and glued
    back onto the returned tail. If the category is already present (as a raw
    [[Category:...]] link or as a numbered param of {{cln|hu}} /
    {{catlangname|hu}}), the inputs are returned unchanged. Otherwise the
    category is inserted into the first such template found in the tail, or a
    new {{cln|hu|cat}} is appended to the tail.

    Returns a (secbody, sectail) tuple. Edits are described by appending to
    `notes` and logged through `pagemsg`.
    """
    sep_match = re.match(r"^(.*?\n)(\n*--+\n*)$", sectail, re.S)
    if sep_match:
        sectail = sep_match.group(1)
        separator = sep_match.group(2)
    else:
        separator = ""

    def is_hu_cln(tmpl):
        # True for {{cln|hu|...}} and {{catlangname|hu|...}}.
        return tname(tmpl) in ["cln", "catlangname"] and getparam(
            tmpl, "1") == "hu"

    combined = secbody + sectail
    raw_link_re = r"\[\[Category:%s(\||\])" % re.escape(cat)
    if re.search(raw_link_re, combined):
        # Category already present
        pagemsg("Category 'Hungarian %s' already present" % cat)
        return secbody, sectail + separator

    for tmpl in blib.parse_text(combined).filter_templates():
        if not is_hu_cln(tmpl):
            continue
        if any(getparam(tmpl, str(num)) == cat for num in range(2, 30)):
            # Category already present in templatized form
            pagemsg("Category 'Hungarian %s' already present" % cat)
            return secbody, sectail + separator

    # Now add the category to existing {{cln}}, or create one.
    tail_parsed = blib.parse_text(sectail)
    for tmpl in tail_parsed.filter_templates():
        if not is_hu_cln(tmpl):
            continue
        # Find the first unused numbered slot.
        slot = None
        for num in range(2, 30):
            if not getparam(tmpl, str(num)):
                slot = num
                break
        if slot is None:
            pagemsg(
                "WARNING: Something strange, reached 30= in %s and didn't see place to insert"
                % unicode(tmpl))
            return secbody, sectail + separator
        # Keep the new param ahead of a following numbered param or sort=.
        if getparam(tmpl, str(slot + 1)):
            before = str(slot + 1)
        elif getparam(tmpl, "sort"):
            before = "sort"
        else:
            before = None
        origt = unicode(tmpl)
        tmpl.add(str(slot), cat, before=before)
        notes.append("insert '%s' into existing {{%s|hu}}" %
                     (cat, tname(tmpl)))
        pagemsg("Replaced %s with %s" % (origt, unicode(tmpl)))
        return secbody, unicode(tail_parsed) + separator

    # Need to create {{cln}}.
    newtext = "{{cln|hu|%s}}" % cat
    stripped_tail = sectail.strip()
    new_tail = stripped_tail + "\n" + newtext if stripped_tail else newtext
    notes.append("add %s" % newtext)
    pagemsg("Added %s" % newtext)
    new_body = secbody.rstrip("\n") + "\n"
    return new_body, "\n" + new_tail + "\n\n" + separator.lstrip("\n")
Пример #3
0
  def fix_up_section(sectext, warn_on_multiple_heads):
    """Replace manual Latin pronunciations in `sectext` with {{la-IPA}}.

    The headword is collected from the section's headword templates. If there
    is not exactly one distinct headword the text is returned unchanged (with
    a warning for zero heads, and for multiple heads only when
    `warn_on_multiple_heads` is set). Otherwise manual Classical {{IPA}} /
    {{IPAchar}} pronunciations are replaced by {{la-IPA|head}}, and a manual
    Ecclesiastical pronunciation line is removed in favour of |eccl=yes on
    the sole {{la-IPA}} template.

    Returns the (possibly modified) section text; changes are recorded in
    `notes`.
    """
    parsed = blib.parse_text(sectext)

    heads = set()
    for t in parsed.filter_templates():
      if lalib.la_template_is_head(t):
        heads |= set(blib.remove_links(x) for x in lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
    if len(heads) > 1:
      if warn_on_multiple_heads:
        pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
      return sectext
    if not heads:
      pagemsg("WARNING: Found no possible heads, not modifying: ")
      return sectext
    head = next(iter(heads))
    la_ipa = "{{la-IPA|%s}}" % head

    def la_ipa_repl(m):
      # Use a callable so the headword is inserted literally; a replacement
      # string would have any backslashes in it interpreted as escapes.
      return la_ipa

    newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}", la_ipa_repl, sectext)
    newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}", la_ipa_repl, newsectext, 0, re.M)
    if newsectext != sectext:
      notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % head)
      sectext = newsectext
    # Collect pronun templates from the updated text, as we may have added one.
    parsed = blib.parse_text(sectext)
    pronun_templates = [t for t in parsed.filter_templates() if tname(t) == "la-IPA"]
    if "{{a|Ecclesiastical}} {{IPA" in sectext:
      if len(pronun_templates) == 0:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
      elif len(pronun_templates) > 1:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" %
          ",".join(unicode(tt) for tt in pronun_templates))
      else:
        origt = unicode(pronun_templates[0])
        pronun_templates[0].add("eccl", "yes")
        pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
        # NOTE(review): the removal below runs on `sectext`, not on the parsed
        # tree that received eccl=yes, so the added param does not appear in
        # the returned text -- confirm whether that is intended.
        newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n", "",
            sectext, 0, re.M)
        if newsectext == sectext:
          pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
        else:
          notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
          sectext = newsectext
    return sectext
 def combine_doublets(m):
   """Regex-substitution callback merging continuation templates into the first.

   Group 1 of the match holds the first template and group 2 the templates
   that follow it. Each following template's term, alt text, gloss and other
   per-term params are copied onto the first template under the next term
   index. If anything unexpected is seen, a warning is logged and the matched
   text is returned unchanged; otherwise the combined first template is
   returned as text.
   """
   whole = m.group(0)
   first = blib.parse_text(m.group(1))
   rest = blib.parse_text(m.group(2))
   t1 = list(first.filter_templates())[0]
   if any(getparam(t1, p) for p in ["3", "4", "alt2", "alt3"]):
     pagemsg("WARNING: Can't combine %s, first template already has multiple terms" %
         whole)
     return whole

   # Params that get the term index appended when moved to the first template.
   per_term = ["t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc", "id", "g"]
   per_term_1 = [x + "1" for x in per_term]
   # Params of continuation templates that are not copied at all.
   ignored = ["1", "notext", "nocap", "nocat"]

   lang = getparam(t1, "1")
   term_index = 2
   for t in rest.filter_templates(recursive=False):
     tlang = getparam(t, "1")
     if tlang != lang:
       pagemsg("WARNING: Lang %s in continuation template %s not same as lang %s in first template %s" % (
         tlang, unicode(t), lang, unicode(t1)))
       return whole
     for param in t.params:
       name = unicode(param.name).strip()
       value = unicode(param.value).strip()
       if not value:
         continue
       if name in ignored:
         continue
       if name == "2":
         target = str(term_index + 1)
       elif name == "3":
         target = "alt%s" % term_index
       elif name == "4":
         target = "t%s" % term_index
       elif name in per_term:
         target = "%s%s" % (name, term_index)
       elif name in per_term_1:
         target = "%s%s" % (name[:-1], term_index)
       else:
         pagemsg("WARNING: Unrecognized param %s=%s in %s, skipping" %
             (name, value, unicode(t)))
         return whole
       t1.add(target, value)
     term_index += 1
   # Remove and re-add the flag params so they end up after the new terms.
   for flag in ["notext", "nocap", "nocat"]:
     flagval = getparam(t1, flag)
     rmparam(t1, flag)
     if flagval:
       t1.add(flag, flagval)
   combined = unicode(t1)
   pagemsg("Replaced %s with %s" % (whole, combined))
   return combined
Пример #5
0
def process_text_on_page(index, pagetitle, text):
    """Convert {{Wikisource1911Enc Citation|X}} to {{projectlink|1911|X}}.

    Returns a tuple of the rewritten page text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        if tname(t) != "Wikisource1911Enc Citation":
            continue
        before = unicode(t)
        # The old first param moves to 2=; 1= becomes the fixed value "1911".
        old_param1 = getparam(t, "1")
        t.add("1", "1911")
        t.add("2", old_param1)
        blib.set_template_name(t, "projectlink")
        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))
            notes.append(
                "convert {{Wikisource1911Enc Citation}} to {{projectlink|1911}}"
            )

    return unicode(parsed), notes
Пример #6
0
def process_page(index, page, verbose):
    """Warn about {{diminutive of}} used inside Russian Etymology sections.

    Only logs; nothing is modified or returned.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    russian = blib.find_lang_section_from_text(text, "Russian", pagemsg)
    if not russian:
        pagemsg("Couldn't find Russian section for %s" % pagetitle)
        return

    # Splitting on a capturing group yields [preamble, header, body, header,
    # body, ...], so headers sit at odd indices and their bodies right after.
    subsections = re.split("(^===+[^=\n]+===+\n)", russian, 0, re.M)
    for header, body in zip(subsections[1::2], subsections[2::2]):
        if "==Etymology" not in header:
            continue
        for t in blib.parse_text(body).filter_templates():
            if unicode(t.name) == "diminutive of":
                pagemsg("WARNING: Found diminutive-of in etymology: %s" %
                        unicode(t))
Пример #7
0
def process_text_on_page(index, pagetitle, text):
    """Convert m= and c= in {{mn-variant}} to numbered params.

    Returns a tuple of the rewritten page text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        if tname(t) != "mn-variant":
            continue
        origt = unicode(t)
        m_value = getparam(t, "m")
        if m_value:
            # NOTE(review): m= is copied into both 1= and 2= -- confirm that
            # this is the intended mapping.
            for num in ["1", "2"]:
                t.add(num, m_value, before="m")
        c_value = getparam(t, "c")
        if c_value:
            t.add("3", c_value, before="c")
        for named in ["m", "c"]:
            rmparam(t, named)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append(
                "Convert m=/c= in {{mn-variant}} to numbered params")

    return unicode(parsed), notes
Пример #8
0
def process_section(index, pagetitle, sectext):
    """Log suggested {{ang-IPA}} replacements/additions for a section.

    The headword list is taken from the last template for which
    `get_head_param` returns a value (with links removed). For each
    {{IPA|ang|...}} a "<from> ... <to> ... <end>" directive is logged; if the
    section has no pronunciation at all, a "<new> ... <end>" directive is
    logged. When no headword is found, "<<pagetitle>>" is used as a
    placeholder argument instead of crashing.

    Only logs; nothing is modified or returned.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    parsed = blib.parse_text(sectext)
    head = None
    for t in parsed.filter_templates():
        newhead = get_head_param(t, pagetitle)
        if newhead is not None:
            newhead = [blib.remove_links(x) for x in newhead]
            if head and head != newhead:
                pagemsg("WARNING: Saw multiple heads %s and %s" %
                        (",".join(head), ",".join(newhead)))
            head = newhead
    if not head:
        pagemsg("WARNING: Couldn't find head")
        # Normalize a missing head to an empty list so the joins below don't
        # raise TypeError on None.
        head = []
    # Template argument(s): the heads, or a visible placeholder if none.
    head_args = "|".join(head) or "<<%s>>" % pagetitle
    saw_pronun = False
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "IPA":
            if getparam(t, "1") != "ang":
                pagemsg("WARNING: Wrong-language IPA template: %s" %
                        unicode(t))
                continue
            pagemsg("<from> %s <to> {{ang-IPA|%s}} <end>" %
                    (unicode(t), head_args))
            saw_pronun = True
        elif tn == "ang-IPA":
            pagemsg("Saw existing pronunciation: %s" % unicode(t))
            saw_pronun = True
    if not saw_pronun:
        pagemsg(
            "WARNING: Didn't see pronunciation for headword %s <new> {{ang-IPA|%s}} <end>"
            % (",".join(head), head_args))
Пример #9
0
def process_page(page, index, parsed):
    """Remove sort= from templates whose name is in `fr_head_templates`.

    Pages with a colon in the title are skipped (None is returned).
    Otherwise returns a tuple of the rewritten page text and the change notes.
    The incoming `parsed` argument is not used; the page text is re-parsed.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    notes = []
    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        name = unicode(t.name)
        if name not in fr_head_templates:
            continue
        before = unicode(t)
        rmparam(t, "sort")
        after = unicode(t)
        if before != after:
            pagemsg("Replacing %s with %s" % (before, after))
            notes.append("remove sort= from {{%s}}" % name)

    return unicode(parsed), notes
def process_page(index, page):
    """Warn when a Russian section has an Adjective header but no adj headword.

    A headword template is {{ru-adj}} or {{head|ru|adjective form}}. Pages
    with more than one Russian section are skipped with a warning. Only logs;
    nothing is modified or returned.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def is_adj_headword(t):
        name = unicode(t.name)
        if name == "ru-adj":
            return True
        return (name == "head" and getparam(t, "1") == "ru"
                and getparam(t, "2") == "adjective form")

    pagemsg("Processing")

    text = unicode(page.text)

    # Headers land at odd indices of the split result, bodies right after.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    foundrussian = False

    for header, body in zip(sections[1::2], sections[2::2]):
        if header != "==Russian==\n":
            continue
        if foundrussian:
            pagemsg(
                "WARNING: Found multiple Russian sections, skipping page")
            return
        foundrussian = True

        templates = blib.parse_text(body).filter_templates()
        has_headword = any(is_adj_headword(t) for t in templates)
        if "===Adjective===" in body and not has_headword:
            pagemsg("WARNING: Missing adj headword template")
Пример #11
0
def process_page(page, index, parsed):
  """Replace GR with AC in the head param of Bulgarian headword templates.

  The value is passed through `bglib.decompose` first. For the bg-* headword
  templates the head is in 1=; for {{head|bg}} it is in head=. Returns a tuple
  of the rewritten page text and the change notes. The incoming `parsed`
  argument is not used; the page text is re-parsed.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  bg_headword_templates = [
    "bg-noun", "bg-proper noun", "bg-verb", "bg-adj", "bg-adv",
    "bg-part", "bg-part form", "bg-verbal noun", "bg-verbal noun form",
    "bg-phrase",
  ]

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn in bg_headword_templates:
      head_param = "1"
    elif tn == "head" and getparam(t, "1") == "bg":
      head_param = "head"
    else:
      # Not a Bulgarian headword template; nothing to change.
      continue
    decomposed = bglib.decompose(getparam(t, head_param))
    if GR not in decomposed:
      continue
    t.add(head_param, decomposed.replace(GR, AC))
    notes.append("convert grave to acute in {{%s}}" % tn)
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))
  return unicode(parsed), notes
Пример #12
0
def do_process_text_on_page(index, pagename, text, adj):
  """Convert old Bulgarian declension templates to {{bg-adecl}}/{{bg-ndecl}}.

  When `adj` is true, {{bg-decl-adj}} is converted using the head from the
  preceding {{bg-adj}}; otherwise any {{bg-noun-...}} template is converted
  using the head from the preceding {{bg-noun}} / {{bg-proper noun}}. The
  converted template has the single param 1=HEADWORD<>.

  Pages containing "==Etymology 1==" or "==Pronunciation 1==" are skipped
  (None is returned). Otherwise returns a tuple of the rewritten page text
  and the change notes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))

  pagemsg("Processing")

  notes = []

  if "==Etymology 1==" in text or "==Pronunciation 1==" in text:
    pagemsg("WARNING: Saw Etymology/Pronunciation 1, can't handle yet")
    return

  headword_templates = ["bg-adj"] if adj else ["bg-noun", "bg-proper noun"]

  parsed = blib.parse_text(text)
  headword = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in headword_templates:
      # Remember the most recent headword for the declension template below.
      headword = getparam(t, "1")
    if (tn == "bg-decl-adj" if adj else tn.startswith("bg-noun-")):
      origt = unicode(t)
      if not headword:
        pagemsg("WARNING: Saw %s without {{%s}} headword" % (origt, "bg-adj" if adj else "bg-noun"))
        continue
      del t.params[:]
      t.add("1", "%s<>" % headword)
      blib.set_template_name(t, "bg-adecl" if adj else "bg-ndecl")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{%s}}" % (tn, tname(t)))

  # Return the text of the modified parse tree; returning the original `text`
  # would silently drop every conversion made above.
  return unicode(parsed), notes
def process_text_on_page(index, pagename, text):
    """Normalize {{RQ:Buk Baibel}}: map 1= through `book_map`, move 4= to passage=.

    Returns a tuple of the rewritten page text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")

    notes = []

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn != "RQ:Buk Baibel":
            continue
        origt = unicode(t)

        book = getparam(t, "1")
        if book in book_map:
            mapped_book = book_map[book]
            t.add("1", mapped_book)
            notes.append("convert '%s' to '%s' in 1= in {{%s}}" %
                         (book, mapped_book, tn))

        passage = getparam(t, "4")
        if passage:
            # Add passage= where 4= was, then drop 4=.
            t.add("passage", passage, before="4")
            rmparam(t, "4")
            notes.append("4= -> passage= in {{%s}}" % tn)

        newt = unicode(t)
        if newt != origt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
def process_text_on_page(index, pagename, text):
    """Replace {{head|la|POS}} with the template given by `pos_to_template`.

    The new template gets 1=PAGENAME and FIXME=1. Templates with an unknown
    part of speech, or with 3= or head= set, are left alone with a warning.
    Returns a tuple of the rewritten page text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")

    notes = []

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        if tname(t) != "head" or getparam(t, "1") != "la":
            continue
        pos = getparam(t, "2")
        if pos not in pos_to_template:
            pagemsg("WARNING: Saw unrecognized part of speech %s: %s" %
                    (pos, unicode(t)))
            continue
        if getparam(t, "3") or getparam(t, "head"):
            pagemsg("WARNING: Saw 3= or head=: %s" % unicode(t))
            continue
        origt = unicode(t)
        # 1= (previously the language code) now holds the page name.
        t.add("1", pagename)
        blib.set_template_name(t, pos_to_template[pos])
        rmparam(t, "2")
        t.add("FIXME", "1")
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        new_name = tname(t)
        notes.append("replace {{head|la|%s}} with {{%s}}" % (pos, new_name))

    return unicode(parsed), notes
Пример #15
0
def investigate_possible_adj(index, adj_pagename, adv, adv_defns):
    """Print adverb/adjective definition pairs for a candidate adjective page.

    For every {{la-adj}} / {{la-part}} template in the Latin section of
    `adj_pagename`, one line "ADV /// ADJ /// ADV-DEFNS /// ADJ-DEFNS" is
    emitted via `msg`. Nothing is returned.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, adj_pagename, txt))

    pagemsg("Trying for adverb %s" % adv)
    page = pywikibot.Page(site, adj_pagename)
    if not page.exists():
        pagemsg("Doesn't exist for adverb %s" % adv)
        return

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    # Bodies of the subsections sit at even indices >= 2 of the split result.
    subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

    for body in subsections[2::2]:
        for t in blib.parse_text(body).filter_templates():
            if tname(t) not in ["la-adj", "la-part"]:
                continue
            headwords = lalib.la_get_headword_from_template(
                t, adj_pagename, pagemsg)
            adj = headwords[0]
            adj_defns = lalib.find_defns(body)
            fields = (adv, adj, ";".join(adv_defns), ";".join(adj_defns))
            msg("%s /// %s /// %s /// %s" % fields)
Пример #16
0
def process_text_on_page(index, pagetitle, text):
    """Remove a redundant 1=PAGETITLE from {{es-IPA}}, {{fr-IPA}}, {{it-IPA}}.

    Templates that have any of 2= through 10= set are left untouched. Returns
    None if the text mentions none of the three templates; otherwise a tuple
    of the rewritten page text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    ipa_templates = ["es-IPA", "fr-IPA", "it-IPA"]

    # Cheap substring check before parsing.
    if not any(name in text for name in ipa_templates):
        return

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn not in ipa_templates:
            continue
        for i in range(2, 11):
            if getparam(t, str(i)):
                pagemsg("Template has %s=, not touching: %s" % (i, origt))
                break
        else:  # no break: none of 2= .. 10= is set
            par1 = getparam(t, "1")
            if par1 == pagetitle:
                rmparam(t, "1")
                notes.append("remove redundant 1=%s from {{%s}}" % (par1, tn))
            newt = unicode(t)
            if newt != origt:
                pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Пример #17
0
def process_text_on_page(index, pagetitle, text):
  """Rewrite old gender codes in Hindi form-of headword templates.

  In {{hi-noun form}}, {{hi-verb form}} and {{hi-adj form}}, g=ms/fs/mp/fp
  becomes g=m-s/f-s/m-p/f-p. Any other (or missing) g= value is left alone.
  Returns a tuple of the rewritten page text and the list of change notes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  gender_fixes = {"ms": "m-s", "fs": "f-s", "mp": "m-p", "fp": "f-p"}

  notes = []

  parsed = blib.parse_text(text)

  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn in ["hi-noun form", "hi-verb form", "hi-adj form"]:
      g = getparam(t, "g")
      newg = gender_fixes.get(g)
      # Only touch the template when g= is one of the old codes; otherwise
      # newg is None and must not be written into the template.
      if newg is not None:
        t.add("g", newg)
        notes.append("fix gender in {{%s}}" % tn)
      if unicode(t) != origt:
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  return unicode(parsed), notes
Пример #18
0
def process_text_on_page_for_single_word(index, pagename, text, spec):
  """Add a conjugation spec to {{es-verb}} templates flagged with attn=.

  attn= is removed. If `spec` contains "<", 1= is set to PAGENAME+spec; if
  `spec` is "*", nothing further is added; otherwise pres= is set to `spec`.
  {{es-verb}} templates without attn= are logged and skipped. Returns a tuple
  of the rewritten page text and the list of change notes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))

  pagemsg("Processing")

  notes = []

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    if tname(t) != "es-verb":
      continue
    origt = unicode(t)
    if not getparam(t, "attn"):
      pagemsg("Didn't see attn=1: %s" % origt)
      continue
    rmparam(t, "attn")
    if "<" in spec:
      full_spec = "%s%s" % (pagename, spec)
      t.add("1", full_spec)
      note = "add conjugation %s to Spanish verb" % full_spec
    elif spec == "*":
      note = "add conjugation (default) to Spanish verb"
    else:
      t.add("pres", spec)
      note = "add conjugation pres=%s to Spanish verb" % spec
    notes.append(note)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Пример #19
0
def process_text_on_page_for_full_conj(index, pagename, text, verbs):
  """Add the conjugation entry from `verbs` to Spanish verb headwords.

  The entry for `pagename` is blanked when it equals the default form
  "FIRST<> [[WORD]] [[WORD]]...", and has its links removed when it differs
  from the default only in what is inside <...>. {{es-verb}} templates with
  attn= get the entry as 1= (and lose attn=); {{head|es|verb}} is converted
  to {{es-verb}} with the entry as 1=.

  Returns None if `pagename` is not in `verbs`; otherwise a tuple of the
  rewritten page text and the list of change notes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))

  pagemsg("Processing")

  notes = []

  if pagename not in verbs:
    pagemsg("WARNING: Can't find entry, skipping")
    return

  origentry = verbs[pagename]
  entry = origentry
  # Default spec: first word conjugated, each remaining word linked.
  first, rest = pagename.split(" ", 1)
  linked_rest = " ".join("[[%s]]" % word for word in rest.split(" "))
  def_link = "%s<> %s" % (first, linked_rest)
  if entry == def_link:
    pagemsg("Replacing entry '%s' with a blank entry because it's the default" % entry)
    entry = ""
  elif re.sub("<.*?>", "<>", entry) == def_link:
    unlinked = blib.remove_links(entry)
    pagemsg("Replacing entry '%s' with entry without links '%s'" % (entry, unlinked))
    entry = unlinked

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "es-verb":
      if not getparam(t, "attn"):
        pagemsg("Didn't see attn=1: %s" % origt)
        continue
      rmparam(t, "attn")
      if entry:
        t.add("1", entry)
        notes.append("add conjugation '%s' to Spanish verb" % entry)
      else:
        notes.append("add conjugation (default) to Spanish verb")
    elif tn == "head" and getparam(t, "1") == "es" and getparam(t, "2") == "verb":
      head = getparam(t, "head")
      if head:
        pagemsg("WARNING: Removing head=%s compared with entry '%s', original entry '%s': %s" %
            (head, entry, origentry, unicode(t)))
        rmparam(t, "head")
      for numbered in ["2", "1"]:
        rmparam(t, numbered)
      blib.set_template_name(t, "es-verb")
      if entry:
        t.add("1", entry)
        notes.append("convert {{head|es|verb}} to {{es-verb|%s}}" % entry)
      else:
        notes.append("convert {{head|es|verb}} to {{es-verb}}")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Пример #20
0
def process_page_for_fix(page, index, parsed):
  """Convert raw links to {{l|kmr|...}} and language code ku to kmr.

  Every [[...]] in the page text becomes {{l|kmr|...}}; then {{l|ku|...}} and
  {{rhymes nav|ku|...}} get 1=kmr. Other templates with 1=ku are only warned
  about. Returns a tuple of the rewritten page text and the change notes. The
  incoming `parsed` argument is not used; the page text is re-parsed.
  """
  pagename = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))

  pagemsg("Processing")

  notes = []

  original_text = unicode(page.text)
  text = re.sub(r"\[\[(.*?)\]\]", r"{{l|kmr|\1}}", original_text)
  if text != original_text:
    notes.append("convert raw links to {{l|kmr|...}}")

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if getparam(t, "1") == "ku":
      if tn in ["l", "rhymes nav"]:
        t.add("1", "kmr")
        notes.append("convert {{%s|ku}} to {{%s|kmr}}" % (tn, tn))
      else:
        pagemsg("WARNING: Kurdish-language template of unrecognized name: %s" % origt)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Пример #21
0
def process_text_on_page(index, pagetitle, text):
  """Log the verb class of every {{de-conj}} template on the page.

  Each {{de-conj}} call is re-expanded as
  {{User:Benwing2/de-generate-verb-props}} and the "class" entry of the
  split result is logged. Returns a tuple of the page text (as re-serialized
  from the parse tree) and the (empty) list of change notes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  pagemsg("Processing")

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if tname(t) == "de-conj":
      # Swap only the template name, keeping all arguments.
      generate_template = re.sub(
        r"^\{\{de-conj(?=[|}])", "{{User:Benwing2/de-generate-verb-props", origt)
      result = expand_text(generate_template)
      if not result:
        continue
      forms = blib.split_generate_args(result)
      pagemsg("For %s, class=%s" % (unicode(t), forms["class"]))

    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, newt))

  return unicode(parsed), notes
Пример #22
0
def process_text_on_page(index, pagetitle, text):
    """Warn about Latin headwords that don't match the page title.

    Each head (links and macrons removed) of every template in
    `lalib.la_headword_templates` is compared with the page title; for
    "Reconstruction..." pages the title is "*" plus the part after the last
    slash. Only logs; always returns (None, None).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval

    # The expected head depends only on the page title, so compute it once.
    if pagetitle.startswith("Reconstruction"):
        expected_head = "*" + re.sub(".*/", "", pagetitle)
    else:
        expected_head = pagetitle

    for t in blib.parse_text(secbody).filter_templates():
        if tname(t) not in lalib.la_headword_templates:
            continue
        heads = lalib.la_get_headword_from_template(t, pagetitle, pagemsg)
        for head in heads:
            plain_head = remove_macrons(blib.remove_links(head))
            if plain_head != expected_head:
                pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
    return None, None
Пример #23
0
def process_text_on_page(index, pagetitle, text):
    """Add old=1 to {{de-noun}} / {{de-proper noun}} lacking certain params.

    A template is left alone if any of old=, 2=, 3=, 4=, g1-g3=, gen1-gen3=
    or pl1-pl3= is set. Returns a tuple of the rewritten page text and the
    list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    # Presence of any of these params means old=1 is not added.
    blocking_params = [
        "old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2",
        "gen3", "pl1", "pl2", "pl3"
    ]

    notes = []

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn not in ["de-noun", "de-proper noun"]:
            continue
        if any(getparam(t, param) for param in blocking_params):
            continue
        t.add("old", "1")
        notes.append(
            "add old=1 to {{%s}} because compatible with new signature"
            % tn)

    return unicode(parsed), notes
 def replace_spenser_fq(m):
     """Regex-substitution callback rewriting an {{RQ:Spenser FQ}} quotation.

     The match's two groups are the template and the quoted text. 2= and 1=
     are converted with `arabic_to_roman` into canto= and book=; the text
     (with <br> turned into " / " and any {{quote|en|...}} wrapper removed)
     becomes passage=; and the template is renamed to
     {{RQ:Spenser Faerie Queene}}. If a number can't be converted, the matched
     text is returned unchanged.
     """
     template, text = m.groups()
     parsed = blib.parse_text(template)
     t = list(parsed.filter_templates())[0]
     # Canto (2=) is handled before book (1=).
     for number_param, named_param in [("2", "canto"), ("1", "book")]:
         arabic = getparam(t, number_param)
         if not arabic:
             continue
         roman = arabic_to_roman(arabic)
         if not roman:
             return m.group(0)
         t.add(named_param, roman, before=number_param)
         rmparam(t, number_param)
     passage = re.sub(r"\s*<br */?>\s*", " / ", text)
     passage = re.sub(r"^\{\{quote\|en\|(.*)\}\}$", r"\1", passage)
     t.add("passage", passage)
     blib.set_template_name(t, "RQ:Spenser Faerie Queene")
     notes.append(
         "reformat {{RQ:Spenser FQ}} into {{RQ:Spenser Faerie Queene}}")
     return unicode(t) + "\n"
Пример #25
0
def process_page(index, page, save, verbose):
  """Remove sort= from templates in `fr_head_templates`, optionally saving.

  Pages with a colon in the title are skipped. If the text changed, the page
  is saved with the joined notes as the comment when `save` is true;
  otherwise the would-be comment is only logged. Nothing is returned.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    name = unicode(t.name)
    if name not in fr_head_templates:
      continue
    before = unicode(t)
    rmparam(t, "sort")
    after = unicode(t)
    if before != after:
      pagemsg("Replacing %s with %s" % (before, after))
      notes.append("remove sort= from {{%s}}" % name)

  newtext = unicode(parsed)
  if newtext == text:
    return
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  page.save(comment=comment)
Пример #26
0
 def undo_one_page_greek_removal(page, index, text):
   """Put `removed_param` back into a template on one page.

   The template in `template_text` is rendered in three forms: as given, with
   sc=polytonic removed (the "to" form), and additionally with
   `removed_param` removed (the "from" form). Every occurrence of the "from"
   form in `text` is replaced with the "to" form.

   Returns a tuple of the new text and a changelog string, which is empty
   when nothing was replaced.
   """
   def pagemsg(txt):
     msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

   template = blib.parse_text(template_text).filter_templates()[0]
   orig_template = unicode(template)
   if getparam(template, "sc") == "polytonic":
     template.remove("sc")
   # Snapshot with the param still present: this is what we restore to.
   to_template = unicode(template)
   param_value = getparam(template, removed_param)
   template.remove(removed_param)
   # Snapshot without the param: this is what we look for in the page.
   from_template = unicode(template)

   text = unicode(text)
   found_orig_template = orig_template in text
   newtext = text.replace(from_template, to_template)

   if newtext == text:
     if found_orig_template:
       pagemsg("Original template found, taking no action")
     else:
       pagemsg("WARNING: Unable to locate 'from' template when undoing Greek param removal: %s"
           % from_template)
     return newtext, ""

   if found_orig_template:
     pagemsg("WARNING: Undid removal, but original template %s already present!" %
         orig_template)
   # A single replacement changes the length by exactly the size difference
   # of the two template forms.
   expected_growth = len(to_template) - len(from_template)
   if len(newtext) - len(text) != expected_growth:
     pagemsg("WARNING: Length mismatch when undoing Greek param removal, may have matched multiple templates: from=%s, to=%s" % (
       from_template, to_template))
   changelog = "Undid removal of %s=%s in %s" % (removed_param,
       param_value, to_template)
   pagemsg("Change log = %s" % changelog)
   return newtext, changelog
def find_head_comp_sup(pagetitle, pagemsg):
    """Fetch head, comparative and superlative from a page's {{la-adv}}.

    Only the first {{la-adv}} template on the page is examined. Missing
    comparative/superlative values are filled in with defaults derived from
    the headword's ending; if the ending isn't recognized a warning is
    issued and the values are returned as found.

    Returns (HEAD, COMP, SUP), or (None, None, None) if the page has no
    {{la-adv}} template.
    """
    adverb_endings = ["iter", "nter", "ter", "er", u"iē", u"ē", "im", u"ō"]
    page = pywikibot.Page(site, pagetitle)
    text = unicode(page.text)
    for t in blib.parse_text(text).filter_templates():
        if tname(t) != "la-adv":
            continue
        head = getparam(t, "1")
        comp = getparam(t, "comp") or getparam(t, "2")
        sup = getparam(t, "sup") or getparam(t, "3")
        if comp and sup:
            return head, comp, sup

        # At least one form is missing: derive the stem from the ending.
        stem = None
        for ending in adverb_endings:
            m = re.search("^(.*?)%s$" % ending, head)
            if not m:
                continue
            stem = m.group(1)
            if ending == "nter":
                # The n and t belong to the stem (-nter -> -ntius).
                stem += "nt"
            break
        if stem is None:
            pagemsg(
                "WARNING: Didn't recognize ending of adverb headword %s"
                % head)
            return head, comp, sup
        return head, comp or stem + "ius", sup or stem + u"issimē"
    return None, None, None
def process_page(page, index, parsed):
    """Move head= to 1= in {{ang-adj}} templates on a page.

    The incoming PARSED argument is ignored; the page text is re-parsed.
    Warnings are logged for {{head|ang|adjective(s)}} and for {{ang-adj}}
    templates that already have 1=.

    Returns (PARSED, NOTES) where PARSED is the possibly modified parse tree
    and NOTES is a list of change notes, one per template actually changed.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn == "head" and getparam(t, "1") == "ang" and getparam(
                t, "2") in ["adjective", "adjectives"]:
            pagemsg("WARNING: {{head}} for adjectives, should not occur: %s" %
                    unicode(t))
        elif tn == "ang-adj":
            if getparam(t, "1"):
                pagemsg("WARNING: 1= in ang-adj, should not occur: %s" %
                        unicode(t))
            else:
                head = getparam(t, "head")
                rmparam(t, "head")
                if head:
                    t.add("1", head)
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            # Only record a note when the template really changed, so that
            # {{ang-adj}} calls without head= don't produce spurious notes.
            notes.append("move head= to 1= in {{ang-adj}}")
    return parsed, notes
Пример #29
0
def process_text_on_page(index, pagetitle, text):
    """Log the rhymes categories produced by rhymes templates in TEXT.

    The template names come from args.rhymes_templates; templates are
    filtered by language code (param 1=) using args.include_langs and
    args.skip_langs, all comma-separated UTF-8 strings. Each matching
    template is expanded and every [[Category:Rhymes:...]] link found in the
    expansion is logged.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    parsed = blib.parse_text(text)
    rhymes_templates = args.rhymes_templates.decode("utf-8").split(",")
    skip_lang_codes = (
        args.skip_langs.decode("utf-8").split(",") if args.skip_langs else [])
    include_lang_codes = (
        args.include_langs.decode("utf-8").split(",")
        if args.include_langs else [])

    for t in parsed.filter_templates():
        if tname(t) not in rhymes_templates:
            continue
        langcode = getparam(t, "1")
        # An empty include list means "include everything".
        if include_lang_codes and langcode not in include_lang_codes:
            continue
        if skip_lang_codes and langcode in skip_lang_codes:
            continue
        expanded = expand_text(unicode(t))
        if not expanded:
            continue
        categories = re.findall(r"\[\[Category:Rhymes:.*?\]\]", expanded)
        for cattext in categories:
            # Strip the surrounding [[ and ]].
            pagemsg("Found rhymes category: %s" % cattext[2:-2])
def process_page(index, page):
  """Warn if the Russian section has an Adjective header but no headword.

  A headword template is either {{ru-adj}} or {{head|ru|adjective form}}.
  Pages with more than one ==Russian== section are skipped with a warning.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def is_adj_headword(t):
    name = unicode(t.name)
    if name == "ru-adj":
      return True
    return name == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "adjective form"

  pagemsg("Processing")

  text = unicode(page.text)

  # Splitting with a capturing group leaves the headers at odd indices and
  # the section bodies at even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  foundrussian = False
  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    found_headword_template = False
    for t in blib.parse_text(sections[j]).filter_templates():
      if is_adj_headword(t):
        found_headword_template = True
    if "===Adjective===" in sections[j] and not found_headword_template:
      pagemsg("WARNING: Missing adj headword template")
Пример #31
0
def get_pl_p_property(index, pagetitle):
  """Classify a page by its {{pl-p}}/{{pl-pronunciation}} usage.

  Returns one of
    ("pl-p-respelling", RESPELLINGS)  -- distinct non-empty values of 1= .. 10=
    ("pl-p-no-respelling", None)      -- template present, no respellings
    ("no-pl-p", None)                 -- no such template on the page
  Results are memoized per page title in pages_with_pl_p.
  """
  if pagetitle in pages_with_pl_p:
    return pages_with_pl_p[pagetitle]

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  page = pywikibot.Page(site, pagetitle)
  pagetext = blib.safe_page_text(page, pagemsg)

  saw_pl_p = False
  respellings = []
  for t in blib.parse_text(pagetext).filter_templates():
    if tname(t) not in ["pl-p", "pl-pronunciation"]:
      continue
    saw_pl_p = True
    for pno in range(1, 11):
      respelling = getparam(t, str(pno))
      # Keep first-seen order, skipping blanks and duplicates.
      if not respelling or respelling in respellings:
        continue
      respellings.append(respelling)

  if respellings:
    retval = ("pl-p-respelling", respellings)
  elif saw_pl_p:
    retval = ("pl-p-no-respelling", None)
  else:
    retval = ("no-pl-p", None)
  pages_with_pl_p[pagetitle] = retval
  return retval
Пример #32
0
def find_noun(pagename, pagemsg, errandpagemsg, expand_text):
  """Return the noun lemma declared by {{ru-noun+}} on a page.

  Returns None if there is no Russian section, no {{ru-noun+}}, the noun
  forms can't be generated, or the lemma has multiple comma-separated forms;
  returns -1 if the Russian section contains an Etymology header. If several
  distinct lemmas are found, a warning is issued and the first is returned.
  """
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  if "==Etymology" in section:
    return -1

  lemmas = []
  for t in blib.parse_text(section).filter_templates():
    if unicode(t.name) != "ru-noun+":
      continue
    # Same arguments, but passed to the form-generating template.
    generate_template = re.sub(r"^\{\{ru-noun\+",
        "{{ru-generate-noun-forms", unicode(t))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun forms")
      return None
    forms = blib.split_generate_args(generate_result)
    if "nom_sg" in forms:
      lemma = forms["nom_sg"]
    else:
      lemma = forms["nom_pl"]
    if "," in lemma:
      pagemsg("WARNING: Lemma has multiple forms: %s" % lemma)
      return None
    if lemma not in lemmas:
      lemmas.append(lemma)

  if len(lemmas) > 1:
    pagemsg("WARNING: Multiple lemmas for noun: %s" % ",".join(lemmas))
  if lemmas:
    return lemmas[0]
  return None
 def replace_trans(m, newlangcode, newlangname):
     """Rewrite 'ku' translation templates in a matched translation entry.

     M is a regex match with two groups: a prefix and the translation text.
     Templates listed in `trans_templates` with 1=ku get their language code
     replaced with NEWLANGCODE and sc= removed. {{t-simple|ku}} templates
     get NEWLANGCODE and langname=NEWLANGNAME, but only when their current
     langname= is "Kurdish"; otherwise a warning is issued and the template
     is left alone. `trans_templates`, `pagemsg` and `notes` are not defined
     in this function.

     Returns the prefix followed by the (possibly modified) translation text.
     """
     prefix, transtext = m.groups()
     parsed = blib.parse_text(transtext)
     for t in parsed.filter_templates():
         origt = unicode(t)
         tn = tname(t)
         if tn in trans_templates:
             if getparam(t, "1") == "ku":
                 t.add("1", newlangcode)
                 rmparam(t, "sc")
                 pagemsg(
                     "Replaced %s with %s based on language prefix of translation entry"
                     % (origt, unicode(t)))
                 notes.append(
                     "{{%s|ku}} -> {{%s|%s}} based on language prefix of translation entry"
                     % (tn, tn, newlangcode))
         elif tn == "t-simple":
             if getparam(t, "1") == "ku":
                 # Compare the *value* of langname= against "Kurdish". The
                 # comparison used to sit inside the getparam() call, which
                 # looked up a parameter named True instead.
                 if getparam(t, "langname") != "Kurdish":
                     pagemsg(
                         "WARNING: Something wrong, t-simple|ku without langname=Kurdish: %s"
                         % unicode(t))
                 else:
                     t.add("1", newlangcode)
                     t.add("langname", newlangname)
                     pagemsg("Replaced %s with %s based on prefix" %
                             (origt, unicode(t)))
                     notes.append(
                         "{{t-simple|ku|langname=Kurdish}} -> {{t-simple|%s|langname=%s}} based on language prefix"
                         % (newlangcode, newlangname))
     transtext = unicode(parsed)
     return prefix + transtext
Пример #34
0
def test_infer():
  """Run infer_one_page_decls on each entry of test_templates and log the result.

  A minimal stand-in page object is used whose title is "test_infer".
  """
  class Page:
    def title(self):
      return "test_infer"

  for pagetext in test_templates:
    parsed_text = blib.parse_text(pagetext)
    newtext, comment = infer_one_page_decls(Page(), 1, parsed_text)
    msg("newtext = %s" % unicode(newtext))
    msg("comment = %s" % comment)
Пример #35
0
 def get_form_class(k):
   """Return the verb form class (1=) found in etymology section K.

   Looks at {{ar-verb}} and {{ar-verb-form}} templates in etymologies[k]
   and returns the value of 1= from the last one seen, or None if there is
   no such template. A warning is issued if two templates in the section
   disagree. `etymologies` and `pagemsg` are not defined in this function.
   """
   formclass = None
   # Index by the argument K; the free variable j was used here before,
   # which ignored the section the caller asked about.
   parsed = blib.parse_text(etymologies[k])
   for t in parsed.filter_templates():
     if unicode(t.name) in ["ar-verb", "ar-verb-form"]:
       newformclass = getparam(t, "1")
       if formclass and newformclass and formclass != newformclass:
         pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" % (formclass, newformclass))
       formclass = newformclass
   return formclass
Пример #36
0
def process_page(index, page, save, verbose):
  """Add lang= to {{audio}} templates that lack it.

  The language code is derived from the name in the enclosing ==Language==
  header, using the langs_to_codes cache or, failing that, by expanding a
  call to the languages module. Pages with a colon in the title are
  skipped. The page is saved only if SAVE is set; otherwise the intended
  change is just logged.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Headers land at odd indices, section bodies at even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    header_match = re.search("^==(.*?)==\n", sections[j-1])
    lang = header_match.group(1)
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if unicode(t.name) != "audio" or getparam(t, "lang"):
        continue
      origt = unicode(t)
      if lang not in langs_to_codes:
        # Not cached yet: ask the wiki for the code and remember it.
        looked_up = expand_text("{{#invoke:languages/templates|getByCanonicalName|%s|getCode}}" % lang)
        if not looked_up:
          pagemsg("WARNING: Unable to find code for lang %s" % lang)
          continue
        langs_to_codes[lang] = looked_up
      langcode = langs_to_codes[lang]
      t.add("lang", langcode)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
    sections[j] = unicode(parsed)

  new_text = "".join(sections)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))

  comment = "add lang code to audio templates"
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Strip gender params from Russian adjective-form headwords.

  In the ==Russian== section, g=, g2=, g3= and g4= are removed from every
  {{head|ru|adjective form}}. Pages with a colon in the title or with more
  than one Russian section are skipped. The page is saved only if SAVE is
  set; otherwise the intended change is just logged.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def is_adj_form_head(t):
    return (unicode(t.name) == "head" and getparam(t, "1") == "ru" and
        getparam(t, "2") == "adjective form")

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Headers land at odd indices, section bodies at even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  foundrussian = False
  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    # Remove gender from adjective forms
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_adj_form_head(t):
        continue
      origt = unicode(t)
      for gender_param in ["g", "g2", "g3", "g4"]:
        rmparam(t, gender_param)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("remove gender from adjective forms")
    sections[j] = unicode(parsed)

  new_text = "".join(sections)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Пример #38
0
 def combine_verbs(m):
   """Regex-substitution callback merging two adjacent verb conjugations.

   M has the first conjugation template in group 1, any text between the two
   in group 2 and the second template in group 3. On success the result is a
   single template holding the first template's numbered params, "or", and
   the second template's numbered params minus its verb type. The original
   matched text is returned unchanged if there is intervening text, if
   either template has non-numeric params, if the second template has fewer
   than two numbered params, or if the verb types (1=) differ.
   `pagemsg`, `notes` and `fetch_numbered_params` are not defined in this
   function.
   """
   unchanged = m.group(0)
   verb1, between, verb2 = m.group(1), m.group(2), m.group(3)
   if between:
     pagemsg("WARNING: Would combine verbs but found text '%s' needing to go into a note, skipping: %s and %s" %
         (between, verb1, verb2))
     return unchanged

   t1 = blib.parse_text(verb1).filter_templates()[0]
   t2 = blib.parse_text(verb2).filter_templates()[0]
   for t in [t1, t2]:
     for param in t.params:
       if re.search("^[0-9]+$", unicode(param.name)):
         continue
       pagemsg("Verb conjugation has non-numeric args, skipping: %s" %
           unicode(t))
       return unchanged

   params = fetch_numbered_params(t1)
   params.append("or")
   newparams = fetch_numbered_params(t2)
   if len(newparams) < 2:
     pagemsg("WARNING: Something wrong, no verb type in ru-conj: %s" %
         unicode(t2))
     return unchanged
   if getparam(t1, "1") != getparam(t2, "1"):
     pagemsg("WARNING: Can't combine verbs of different verb types: %s and %s" %
         (verb1, verb2))
     return unchanged

   # The verb type is shared, so drop it from the second set of params.
   params.extend(newparams[1:])
   blib.set_param_chain(t1, params, "1", "")
   combined = unicode(t1)
   pagemsg("Combining verb conjugations %s and %s" % (
     getparam(t1, "1"), getparam(t2, "1")))
   pagemsg("Replaced %s with %s" % (unchanged.replace("\n", r"\n"), combined))
   notes.append("combined verb conjugations %s and %s" % (
     getparam(t1, "1"), getparam(t2, "1")))
   return combined
Пример #39
0
def find_old_template_props(template, pagemsg, verbose):
  """Extract verb-form properties from the definition of an old template.

  TEMPLATE is a call to an old-style conjugation template. The wikitext of
  "Template:NAME" is fetched (and cached in cached_template_calls), and the
  first {{fr-conj}} or {{#invoke:fr-conj|frconj}} call in it is examined.
  For each of its params whose name (with "." replaced by "_") is in
  all_verb_props, the value and the value of the matching ".alt" param are
  collected, with {{{1}}}, {{{2}}} and {{{pp|...}}} replaced by the
  corresponding params of TEMPLATE.

  Returns a dict mapping canonical property name to a comma-joined value
  string, or None if the template page doesn't exist or contains no
  fr-conj call.
  """
  name = unicode(template.name)
  if name in cached_template_calls:
    template_text = cached_template_calls[name]
  else:
    template_page = pywikibot.Page(site, "Template:%s" % name)
    # Check the template page just constructed; `page` is not defined in
    # this function.
    if not template_page.exists():
      pagemsg("WARNING: Can't locate template 'Template:%s'" % name)
      return None
    template_text = unicode(template_page.text)
    cached_template_calls[name] = template_text
  if verbose:
    pagemsg("Found template text: %s" % template_text)

  def sub_template(val):
    # Function replacements insert the values literally; a plain string
    # replacement would have backslashes in it interpreted by re.sub().
    val = re.sub(r"\{\{\{1\|?\}\}\}", lambda m: getparam(template, "1"), val)
    val = re.sub(r"\{\{\{2\|?\}\}\}", lambda m: getparam(template, "2"), val)
    val = re.sub(r"\{\{\{pp\|(.*?)\}\}\}", lambda m:getparam(template, "pp") or m.group(1), val)
    return val

  for t in blib.parse_text(template_text).filter_templates():
    tn = unicode(t.name).strip() # template name may have spaces
    if tn == "fr-conj" or (
        tn == "#invoke:fr-conj" and getparam(t, "1").strip() == "frconj"):
      args = {}
      # Yuck. Template param names sometimes have spaces in them; must strip.
      tparams = [(unicode(param.name.strip()), unicode(param.value.strip())) for param in t.params]
      tparamdict = dict(tparams)
      debug_args = []
      for propname, pval in tparams:
        canonpname = re.sub(r"\.", "_", propname)
        if canonpname in all_verb_props:
          pval = sub_template(pval)
          pvalalt = sub_template(tparamdict.get(propname + ".alt", ""))
          # "N/A" and "-" mark a form as missing.
          if pval in ["N/A", "-"]:
            pval = ""
          if pvalalt in ["N/A", "-"]:
            pvalalt = ""
          vals = [x for x in [pval, pvalalt] if x]
          pval = ",".join(vals)
          if pval and not re.search(r"—", pval):
            debug_args.append("%s=%s" % (canonpname, pval))
            args[canonpname] = pval
      pagemsg("Found args: %s" % "|".join(debug_args))
      return args
  pagemsg("WARNING: Can't find {{fr-conj}} in template definition for %s" %
      unicode(template))
  return None
Пример #40
0
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given).
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).

    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    canon_infl = ru.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      lemma_arg = 0
      if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
      if (not arglemma or arglemma.lower() == infl.lower() or
          ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() ==
          ru.remove_accents(infl).lower() or
          ispl and ru.remove_accents(arglemma).lower() == lemma.lower()
          ):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
          wordlink, arg_set[lemma_arg], ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          if not m:
            pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                (decl_gender, wordind, lemma, infl))
          else:
            assert gender == "f"
            pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                  (found_gender, wordind, lemma, infl))
            else:
              pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                  (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
            (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        pass
      else:
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          else:
            pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                (wordind, val, lemma, infl))
          pname += str(wordind)
          params.append((pname, val))
Пример #41
0
      if plval and plval != "-":
        if overall_num != "both":
          pagemsg("WARNING: Proper noun is apparently sg/pl but main noun not, skipping: %s" %
              headword)
          return
      elif overall_num == "both":
        pagemsg("WARNING: Proper noun has sg/pl main noun underlying it, assuming singular: %s" %
            headword)
        overall_num = None
      elif overall_num == "sg":
        overall_num = None
    if overall_num:
      params.append(("n", overall_num))

  generate_template = (
      blib.parse_text("{{ru-generate-noun-args}}").filter_templates()[0])
  for name, value in params:
    generate_template.add(name, value)
  proposed_template_text = unicode(generate_template)
  if headword_is_proper:
    proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args",
        "{{ru-proper noun+", proposed_template_text)
  else:
    proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args",
        "{{ru-noun+", proposed_template_text)
  proposed_decl = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in generate_template.params:
    proposed_decl.add(param.name, param.value)

  def pagemsg_with_proposed(text):
    pagemsg("Proposed new template (WARNING, omits explicit gender and params to preserve from old template): %s" % proposed_template_text)
def process_page_section(index, page, section, verbose):
  """Reconcile the new-style headword of one section with its declension table.

  The section must contain exactly one {{ru-noun-table}} and exactly one
  {{ru-noun+}} or {{ru-proper noun+}} (and at most one {{ru-noun-old}}).
  g=/m=/f= params (and their numbered variants) are stripped from the
  declension table. The headword, ignoring g=/m=/f=/notrcat=, is then
  compared with the table: if they differ only in that the headword has
  links, the headword params are copied into the table; otherwise the
  headword is rewritten from the table (keeping its own g=/m=/f=/notrcat=),
  but only when the global `lemmas` is empty or contains the page title.

  index, page -- page index (used in log messages) and page object
  section -- wikitext of the section to process
  verbose -- if true, log the templates that were found

  Returns None if the section must be skipped; otherwise the 5-tuple
  (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
  ru_noun_changed, ru_proper_noun_changed), where the last four are 0 or 1.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  def templates_named(names):
    # All templates in the section whose name is one of `names`.
    return [t for t in parsed.filter_templates() if unicode(t.name) in names]

  if templates_named(["ru-decl-noun-see"]):
    pagemsg("Found ru-decl-noun-see, skipping")
    return None

  noun_table_templates = templates_named(["ru-noun-table"])
  noun_old_templates = templates_named(["ru-noun-old"])

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if not noun_table_templates:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  # Old-style headwords are not handled by this function.
  if templates_named(["ru-noun", "ru-proper noun"]):
    pagemsg("Found ru-noun or ru-proper noun, skipping")
    return None

  headword_templates = templates_named(["ru-noun+", "ru-proper noun+"])
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if not headword_templates:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if noun_old_templates else None
  headword_template = headword_templates[0]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Params that belong only to the headword; remembered so they can be put
  # back if the headword is rebuilt from the declension table.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  gender_param_re = "^[gmf][0-9]*$"

  # Copy of the headword without the headword-only params, always named
  # ru-noun+ so that it can be compared textually with the declension.
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for param in headword_template.params:
    name = unicode(param.name)
    if not re.search(gender_param_re, name) and name != "notrcat":
      filtered_headword_template.add(param.name, param.value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Strip g=/m=/f= from the declension table by rebuilding its param list.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search(gender_param_re, name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Scratch copy of the declension params; this is what the headword would
  # become, possibly with n= adjusted below.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Append ndef=sg before the template's final "}}" only; replacing
      # every "}}" would also inject it into any nested template.
      generate_template_with_ndef = re.sub(r"\}\}\Z", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        # A link copy is reported instead of, not in addition to, a cleanup.
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)

  final_noun_table_text = unicode(noun_table_template)
  if final_noun_table_text != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      final_noun_table_text))

  final_headword_text = unicode(headword_template)
  if final_headword_text != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      final_headword_text))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
Пример #43
0
  def find_decl_args(lemma, infl, wordind):
    # Look up the declension of one word of a multi-word term by loading the
    # page named `lemma` and inspecting the templates on it.
    #
    # lemma   -- page title of the word's lemma
    # infl    -- the inflected/accented form to display; linked to `lemma`
    # wordind -- number of the word, used only in log messages here
    #
    # Every non-None return visible in this excerpt is a 4-tuple
    # ([("1", link), ("2", decl-code)], flag, None, None); the flag is True on
    # the adjectival paths and False on the indeclinable path. None is
    # returned when the declension cannot be determined.
    # NOTE(review): the excerpt ends right after the ru-decl-adj branch, so
    # the handling of ordinary ru-noun-table declensions is not visible here;
    # as shown, such a template falls off the end and yields None.
    # Relies on names from the enclosing scope (site, pagemsg, pagetitle,
    # subpagetitle, is_short_adj, use_given_decl, use_given_page_decl).
    declpage = pywikibot.Page(site, lemma)
    # Use a plain link when the inflected form differs from the lemma only by
    # accents, otherwise a piped link.
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      # No page to consult: guess adjectival from the explicit list or from
      # an adjective-like ending, otherwise give up.
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    parsed = blib.parse_text(declpage.text)
    # Sort the page's templates into declension, old-style headword and
    # z-style declension templates.
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      # Fallbacks when there is no regular declension template: convert a
      # single z-decl template, or treat the word as indeclinable.
      if decl_z_templates:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          # The headword template is passed to the converter only when
          # there is exactly one; otherwise conversion proceeds without it.
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      # Indeclinable: either the page is categorized as such or an old-style
      # headword has "-" as its third param.
      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates
      # Prefer an adjectival declension when the lemma has an adjective-like
      # ending; the for/else falls through to the manual override tables
      # only if no such template was found.
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          # Override keyed by lemma alone.
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          # Override keyed by the page being processed, then by lemma.
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      # Adjectival declension: "+ь" if param 2 contains ь as a separate
      # word, plain "+" otherwise.
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None
Пример #44
0
def process_page_section(index, page, section, verbose):
  """Convert the old-style noun headword of one section to the new style.

  The section must contain exactly one {{ru-noun-table}}, at most one
  {{ru-noun-old}}, and exactly one {{ru-noun}} or {{ru-proper noun}}. The
  steps, in order:

  1. A manual transliteration (tr=) on the headword is appended to the
     lemma of each declension template as "lemma//translit".
  2. a=bi (or b/bian/both) in a declension template becomes a=ia or a=ai,
     depending on whether "in" or "an" comes first in the headword genders.
  3. The declension is expanded through {{ru-generate-noun-args}}, and
     runoun.check_old_noun_headword_forms() and
     runoun.fix_old_headword_params() are applied to the headword.
  4. For proper nouns n= is added or removed as needed, then the headword
     template is renamed to {{ru-noun+}} or {{ru-proper noun+}}.

  index, page -- page index (used in log messages) and page object
  section -- wikitext of the section to process
  verbose -- if true, log the templates and translit that were found

  Returns None if the section must be skipped; otherwise the 5-tuple
  (new section text, ru_noun_changed, ru_proper_noun_changed,
  bian_replaced, frobbed_manual_translit), where the three middle values
  are 0 or 1 and the last is a list holding the transferred translit (empty
  if none was transferred).
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  def templates_named(names):
    # All templates in the section whose name is one of `names`.
    return [t for t in parsed.filter_templates() if unicode(t.name) in names]

  if templates_named(["ru-decl-noun-see"]):
    pagemsg("Found ru-decl-noun-see, skipping")
    return None

  noun_table_templates = templates_named(["ru-noun-table"])
  noun_old_templates = templates_named(["ru-noun-old"])

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if not noun_table_templates:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, []

  # Sections that already use a new-style headword are left alone.
  if templates_named(["ru-noun+", "ru-proper noun+"]):
    pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
    return None

  headword_templates = templates_named(["ru-noun", "ru-proper noun"])
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if not headword_templates:
    return unicode(parsed), 0, 0, 0, []

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if noun_old_templates else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    for decl_template in decl_templates:
      # Punt if multi-arg-set, can't handle yet
      for param in decl_template.params:
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
      # Only a single translit (tr=) can be transferred; tr2..tr9 block it.
      for i in xrange(2, 10):
        if getparam(headword_template, "tr%s" % i):
          pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
          return None
      # The lemma is param 2 when param 1 holds the stress pattern.
      if runoun.arg1_is_stress(getparam(decl_template, "1")):
        lemma_arg = "2"
      else:
        lemma_arg = "1"
      lemmaval = getparam(decl_template, lemma_arg)
      if not lemmaval:
        lemmaval = subpagetitle
      if "//" in lemmaval:
        # The decl already carries a translit; it must agree with the headword.
        m = re.search("^(.*?)//(.*)$", lemmaval)
        if m.group(2) != headword_tr:
          pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
          return None
        else:
          pagemsg("Already found manual translit in decl template %s" %
              lemmaval)
      else:
        lemmaval += "//" + headword_tr
        orig_decl_template = unicode(decl_template)
        decl_template.add(lemma_arg, lemmaval)
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
        frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0

  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      # Index of the first gender mentioning "in" / "an"; -1 if none does.
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))

  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)

  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders is None:
    return None

  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
  if params_to_preserve is None:
    return None

  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Append ndef=sg before the template's final "}}" only; replacing
      # every "}}" would also inject it into any nested template.
      generate_template_with_ndef = re.sub(r"\}\}\Z", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1

  pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
def process_page(index, page, verbose):
  """Record which lemmas the new-style noun headwords on a page refer to.

  Every {{ru-noun+}} / {{ru-proper noun+}} on the page is split into its
  argument sets, and each word of each lemma is counted in the global
  `lemma_count`. A word that is not in the global `lemmas` is looked up on
  the wiki the first time it is seen; the outcome is stored in
  `nonexistent_lemmas`, and the referencing page titles in
  `nonexistent_lemmas_refs`.

  index, page -- page index (used in log messages) and page object
  verbose -- accepted for interface compatibility; not used here

  Returns None; results are reported through the globals and the log.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  parsed = blib.parse_text(page.text)

  def check_lemma(lemma):
    # Count one reference to `lemma`; classify it on first sight if it is
    # not a known lemma.
    if lemma in lemma_count:
      lemma_count[lemma] += 1
      if lemma in nonexistent_lemmas:
        nonexistent_lemmas_refs[lemma].append(pagetitle)
      return
    lemma_count[lemma] = 1
    if lemma in lemmas:
      return
    lemma_page = pywikibot.Page(site, lemma)
    try:
      exists = lemma_page.exists()
    except pywikibot.exceptions.InvalidTitle:
      pagemsg("WARNING: Invalid title: %s" % lemma)
      traceback.print_exc(file=sys.stdout)
      exists = False
    if exists:
      if re.search("#redirect", unicode(lemma_page.text), re.I):
        nonexistent_msg = "exists as redirect"
      elif re.search(r"\{\{superlative of", unicode(lemma_page.text)):
        nonexistent_msg = "exists as superlative"
      else:
        nonexistent_msg = "exists as non-lemma"
    else:
      nonexistent_msg = "does not exist"
    pagemsg("Referenced lemma %s: %s" % (lemma, nonexistent_msg))
    nonexistent_lemmas[lemma] = nonexistent_msg
    nonexistent_lemmas_refs[lemma] = [pagetitle]

  def process_arg_set(arg_set):
    # Check every word of the lemma in one argument set.
    if not arg_set:
      return
    # Skip a leading stress-pattern argument such as "b" or "a,d'".
    offset = 0
    if re.search(r"^[a-f]'*(,[a-f]'*)*$", arg_set[offset]):
      offset = 1
    if len(arg_set) <= offset:
      return
    # Remove * meaning non-stressed
    lemma = re.sub(r"^\*", "", arg_set[offset])
    # Remove translit
    lemma = re.sub("//.*$", "", lemma)
    if not lemma:
      return
    # Splitting on a captured group yields separator, word, separator, ...,
    # so words sit at the odd indices and the list has odd length.
    headwords_separators = re.split(r"(\[\[.*?\]\]|[^ \-]+)", lemma)
    if headwords_separators[0] != "" or headwords_separators[-1] != "":
      pagemsg("WARNING: Found junk at beginning or end of headword, skipping: %s" % lemma)
      return
    last_word_index = len(headwords_separators) - 2
    # wordind is the 1-based position of the word, so the number in the
    # warning stays right even after an earlier word was skipped.
    for wordind, i in enumerate(xrange(1, len(headwords_separators), 2), 1):
      hword = headwords_separators[i]
      separator = headwords_separators[i+1]
      if i < last_word_index and separator != " " and separator != "-":
        pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" %
            (wordind, hword, separator))
        continue
      hword = hword.replace("#Russian", "")
      hword = rulib.remove_accents(blib.remove_right_side_links(hword))
      check_lemma(hword)

  def process_new_style_headword(htemp):
    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).

    highest_numbered_param = 0
    for p in htemp.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now split based on arg sets. The loop runs one position past the last
    # numbered param so that the final arg set is flushed too.
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      val = getparam(htemp, str(i))
      end_arg_set = (i == highest_numbered_param + 1 or
          val in ["or", "_", "-"] or re.search("^join:", val))
      if end_arg_set:
        process_arg_set(arg_set)
        arg_set = []
      else:
        arg_set.append(val)

  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-decl-noun-see":
      pagemsg("WARNING: Skipping ru-decl-noun-see, can't handle yet: %s" % unicode(t))
    elif tname in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found %s" % unicode(t))
      process_new_style_headword(t)
    elif tname in ["ru-noun", "ru-proper noun"]:
      pagemsg("WARNING: Skipping ru-noun or ru-proper noun, can't handle yet: %s" % unicode(t))
def process_page(index, page, save, verbose):
  """Canonicalize Russian {{inflection of}} templates on one page.

  Works only on the page's single ==Russian== section (a page with more
  than one is skipped) and makes three passes over it:

  1. Drop blank form codes, drop nocat=, and reorder the params as lang=,
     1=, tr=, 2=, form codes from 3= upward, then the remaining named params.
  2. Abbreviate long form codes ("nominative" -> "nom", "plural" -> "p", ...).
  3. Swap a number-then-case pair in 3=/4= (e.g. s|gen) to case-then-number.

  index, page -- page index (used in log messages) and page object
  save -- if true, save the changed page; otherwise only log the comment
  verbose -- if true, log the full old and new page text on change

  Returns None.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def is_russian_inflection(t):
    return unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru"

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  form_code_abbrevs = [
    ("nominative", "nom"), ("accusative", "acc"),
    ("genitive", "gen"), ("dative", "dat"),
    ("instrumental", "ins"),
    ("prep", "pre"), ("prepositional", "pre"),
    ("vocative", "voc"), ("locative", "loc"), ("partitive", "par"),
    ("singular", "s"), ("(singular)", "s"),
    ("plural", "p"), ("(plural)", "p"),
    ("inanimate", "in"), ("animate", "an"),
  ]
  case_codes = ["nom", "gen", "dat", "acc", "ins", "pre", "voc", "loc", "par"]

  foundrussian = False
  # The capturing group keeps the headers: odd indices are "==Lang==\n"
  # lines, and each following even index is that section's body.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    # Remove blank form codes and canonicalize position of lang=, tr=
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection(t):
        continue
      origt = unicode(t)
      # Collect the non-numbered params except lang=, nocat= and tr=, and
      # note any positional param numbered 20 or above so that it is
      # carried over rather than lost when the params are erased below.
      positions = set(xrange(3, 20))
      non_numbered_params = []
      for param in t.params:
        pname = unicode(param.name)
        if re.search(r"^[0-9]+$", pname):
          if int(pname) >= 3:
            positions.add(int(pname))
        elif pname not in ["lang", "nocat", "tr"]:
          non_numbered_params.append((pname, param.value))
      # Fetch the numbered params starting with 3, skipping blank ones
      numbered_params = []
      for pos in sorted(positions):
        val = getparam(t, str(pos))
        if val:
          numbered_params.append(val)
      param1 = getparam(t, "1")
      param2 = getparam(t, "2")
      tr = getparam(t, "tr")
      nocat = getparam(t, "nocat")
      # Erase all params.
      del t.params[:]
      # Put back lang, param 1, tr, param 2, then the replacements for the
      # higher numbered params, then the non-numbered params.
      t.add("lang", "ru")
      t.add("1", param1)
      if tr:
        t.add("tr", tr)
      t.add("2", param2)
      for i, param in enumerate(numbered_params):
        t.add(str(i+3), param)
      for name, value in non_numbered_params:
        t.add(name, value)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
        if nocat:
          notes.append("removed nocat=")
    sections[j] = unicode(parsed)

    # Convert 'prep' to 'pre', etc.
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection(t):
        continue
      # One abbreviation at a time, so each conversion gets its own log
      # line and its own note.
      for frm, to in form_code_abbrevs:
        origt = unicode(t)
        for i in xrange(3,20):
          val = getparam(t, str(i))
          if val == frm:
            t.add(str(i), to)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("converted '%s' form code to '%s'" % (frm, to))
    sections[j] = unicode(parsed)

    # Rearrange order of s|gen, p|nom etc. to gen|s, nom|p etc.
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection(t):
        continue
      # Only a bare number+case pair is swapped; anything with a fifth
      # param is left as is.
      if (getparam(t, "3") in ["s", "p"] and
          getparam(t, "4") in case_codes and
          not getparam(t, "5")):
        origt = unicode(t)
        number = getparam(t, "3")
        case = getparam(t, "4")
        t.add("3", case)
        t.add("4", number)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("converted '%s|%s' to '%s|%s'" %
              (number, case, case, number))
    sections[j] = unicode(parsed)

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Every change made above records a note, so a changed page without
    # notes indicates an internal error.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Пример #47
0
def process_page(index, page, save, verbose, fix_missing_plurals):
  """Convert French {{head|fr|...}} templates on a page to fr-* templates.

  Nouns become {{fr-noun}}, proper nouns {{fr-proper noun}}, invariable
  adjectives {{fr-adj|inv=y}}, and a handful of other headword types get
  their dedicated fr-* template. Templates carrying parameters that are not
  understood are left alone and a warning is logged.

  index -- page counter, used only in log messages
  page -- page object providing title(), text and save()
  save -- if true, save the page; otherwise only log the would-be edit
  verbose -- accepted for interface compatibility; not consulted here
  fix_missing_plurals -- if true, nouns lacking an explicit plural are still
    converted (and flagged as needing review) instead of being skipped
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def has_unknown_params(template, allowed):
    # True as soon as one param name is outside the allowed names.
    return any(unicode(p.name) not in allowed for p in template.params)

  def strip_params(template, names):
    # Remove the named params, in the order given.
    for paramname in names:
      rmparam(template, paramname)

  # Headword types that map one-to-one onto a dedicated template.
  simple_headtypes = {
    "adjective form": "fr-adj-form",
    "verb form": "fr-verb-form",
    "verb forms": "fr-verb-form",
    "interjection": "fr-intj",
    "preposition": "fr-prep",
    "prefix": "fr-prefix",
    "prefixes": "fr-prefix",
    "suffix": "fr-suffix",
    "suffixes": "fr-suffix",
  }

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) != "head" or getparam(t, "1") != "fr":
      continue

    headtype = getparam(t, "2")
    fixed_plural_warning = False

    if headtype == "noun":
      head = getparam(t, "head")
      g = getparam(t, "g")
      g2 = getparam(t, "g2")
      plural = getparam(t, "4") if getparam(t, "3") == "plural" else ""
      allowed = ["1", "2", "head", "g", "g2", "sort"]
      if plural:
        # 3=plural|4=<form> is only understood when a plural form is given.
        allowed += ["3", "4"]
      if has_unknown_params(t, allowed):
        pagemsg("WARNING: Unrecognized parameters in %s, skipping"
            % unicode(t))
        continue
      if not g:
        pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
        continue
      # A plain feminine noun with no plural is accepted if the page also
      # has a French {{feminine noun of}} template.
      found_feminine_noun = (g == "f" and not g2 and not plural and any(
        unicode(other.name) == "feminine noun of" and
        getparam(other, "lang") == "fr"
        for other in parsed.filter_templates()))
      if found_feminine_noun:
        pagemsg("Found 'feminine noun of', assuming countable")
      elif g not in ["m-p", "f-p"] and not plural:
        if not fix_missing_plurals:
          pagemsg("WARNING: No plural given in %s, skipping" % unicode(t))
          continue
        pagemsg("WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW"
            % unicode(t))
        fixed_plural_warning = True
      strip_params(t, ["4", "3", "2", "1", "head", "g", "g2", "sort"])
      t.name = "fr-noun"
      if head:
        t.add("head", head)
      t.add("1", g)
      if g2:
        t.add("g2", g2)
      if plural:
        t.add("2", plural)

    elif headtype in ["proper noun", "proper nouns"]:
      head = getparam(t, "head")
      g = getparam(t, "g")
      g2 = getparam(t, "g2")
      # The gender may have been given positionally in 3= instead of g=.
      remove_3 = not g and getparam(t, "3") in ["m", "f", "m-p", "f-p"]
      if remove_3:
        g = getparam(t, "3")
      allowed = ["1", "2", "head", "g", "g2", "sort"]
      if remove_3:
        allowed.append("3")
      if has_unknown_params(t, allowed):
        pagemsg("WARNING: Unrecognized parameters in %s, skipping"
            % unicode(t))
        continue
      if not g:
        pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
        continue
      strip_params(t, ["3", "2", "1", "head", "g", "g2", "sort"])
      t.name = "fr-proper noun"
      if head:
        t.add("head", head)
      t.add("1", g)
      if g2:
        t.add("g2", g2)

    elif headtype in ["adjective", "adjectives"]:
      convertible = False
      if getparam(t, "3") in ["invariable", "invariant"]:
        leftover = {unicode(p.name): unicode(p.value) for p in t.params}
        for key in ("1", "2", "3"):
          del leftover[key]
        if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
          del leftover["g"]
          del leftover["g2"]
        # Only convert when nothing besides the understood params remains.
        convertible = not leftover
      if convertible:
        strip_params(t, ["g2", "g", "3", "2", "1"])
        t.name = "fr-adj"
        t.add("inv", "y")
      else:
        pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
            unicode(t))

    elif headtype in simple_headtypes:
      head = getparam(t, "head")
      allowed = ["1", "2", "head", "sort"]
      if headtype in ["adjective form", "suffix", "suffixes"]:
        # These headword types additionally tolerate g=.
        allowed.append("g")
      if has_unknown_params(t, allowed):
        pagemsg("WARNING: Unrecognized parameters in %s, skipping"
            % unicode(t))
        continue
      strip_params(t, ["sort", "head", "2", "1"])
      t.name = simple_headtypes[headtype]
      if head:
        t.add("head", head)

    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))
      review_suffix = " (NEEDS REVIEW)" if fixed_plural_warning else ""
      notes.append("replaced {{head|fr|%s}} with {{%s}}%s" % (headtype,
        unicode(t.name), review_suffix))

  newtext = unicode(parsed)
  if newtext == text:
    return

  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg,
                "save page")
Пример #48
0
def process_page(index, page, save, verbose):
  """Canonicalize Russian short-form adjective 'inflection of' usage.

  Within the (single) ==Russian== section of the page:
  1. existing {{inflection of|lang=ru|...}} templates whose numbered params
     describe a short form are rewritten to short|GENDER|s or short|p;
  2. raw-text definitions such as "masculine short form of [[lemma]]" are
     replaced by an {{inflection of}} call;
  3. anything else mentioning "short" triggers a warning.

  index -- page counter, used only in log messages
  page -- page object providing title(), text and save()
  save -- if true, save the page; otherwise only log the would-be edit
  verbose -- if true, log the full before/after text of a changed page
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def first_match(patterns, value):
    # Return the match of the first pattern that matches, else None.
    for pattern in patterns:
      found = re.search(pattern, value)
      if found:
        return found
    return None

  # Accepted orderings of gender / singular / short; group 1 is the gender.
  singular_short_patterns = [
    r"^([mfn])/(?:s|\(singular\))/short(?: form|)$",
    r"^(?:s|\(singular\))/([mfn])/short(?: form|)$",
    r"^short(?: form|)/([mfn])/(?:s|\(singular\))$",
    r"^short(?: form|)/(?:s|\(singular\))/([mfn])$",
    r"^([mfn])/short(?: form|)/(?:s|\(singular\))$",
    r"^(?:s|\(singular\))/short(?: form|)/([mfn])$",
    r"^([mfn])/short(?: form|)$",
    r"^short(?: form|)/([mfn])$",
  ]
  plural_short_patterns = [
    r"^(?:p|\(plural\))/short(?: form|)$",
    r"^short(?: form|)/(?:p|\(plural\))$",
  ]
  gender_codes = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
      "neuter":"n", "neutral":"n"}

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []
  already_canonicalized = False
  found_short_inflection_of = False
  warned_about_short = False

  def add_sing_inflection_of(m):
    # Replacement callback: build the template call for a raw definition.
    prefix = m.group(1)
    gender = gender_codes[m.group(2).lower()]
    lemma = m.group(3)
    retval = prefix + "{{inflection of|lang=ru|%s||short|%s|s}}" % (lemma, gender)
    pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
    notes.append("converted raw to 'inflection of' for short/%s/s" % gender)
    return retval

  foundrussian = False
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  # Odd indices hold the level-2 headers, even indices the section bodies.
  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    # Try to canonicalize existing 'inflection of'
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not (unicode(t.name) == "inflection of" and
          getparam(t, "lang") == "ru"):
        continue
      # Fetch the numbered params starting with 3, dropping trailing blanks
      numbered_params = [getparam(t, str(i)) for i in xrange(3, 20)]
      while numbered_params and not numbered_params[-1]:
        numbered_params.pop()
      # Now canonicalize
      numparamstr = "/".join(numbered_params)
      canon_params = []
      singular = first_match(singular_short_patterns, numparamstr)
      if singular:
        found_short_inflection_of = True
        canon_params = ["short", singular.group(1), "s"]
      elif first_match(plural_short_patterns, numparamstr):
        found_short_inflection_of = True
        canon_params = ["short", "p"]
      elif "short" in numbered_params or "short form" in numbered_params:
        found_short_inflection_of = True
        warned_about_short = True
        pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
            unicode(t))
      if not canon_params:
        continue

      origt = unicode(t)
      # Fetch param 1 and param 2. Erase all numbered params.
      # Put back param 1 and param 2 (this will put them after lang=ru),
      # then the replacements for the higher params.
      param1 = getparam(t, "1")
      param2 = getparam(t, "2")
      for i in xrange(19, 0, -1):
        rmparam(t, str(i))
      t.add("1", param1)
      t.add("2", param2)
      for position, value in enumerate(canon_params, 3):
        t.add(str(position), value)
      newt = unicode(t)
      if origt == newt:
        pagemsg("Apparently already canonicalized: %s" % newt)
        already_canonicalized = True
      else:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
    sections[j] = unicode(parsed)

    # Try to add 'inflection of' to raw-specified singular inflection
    newsec = re.sub(r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*", add_sing_inflection_of,
        sections[j], 0, re.I)
    if newsec != sections[j]:
      found_short_inflection_of = True
    sections[j] = newsec

    if "short" in sections[j] and not found_short_inflection_of:
      shortline = re.search("^(.*short.*)$", sections[j], re.M)
      warned_about_short = True
      pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
          (shortline.group(1) if shortline else "Can't get line?"))

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)

  if not notes and not already_canonicalized:
    warning_status = " (warning issued)" if warned_about_short else " (no warning)"
    pagemsg("Skipping, no short form found%s" % warning_status)
def process_page(index, page, save, verbose):
  """Remove head= / 1= from French headword templates when redundant.

  A head value is redundant when link_text() of it equals link_text() of the
  page title. Non-redundant values are kept, and a warning is logged if they
  do not match the page title even after normalizing apostrophes and
  removing links.

  index -- page counter, used only in log messages
  page -- page object providing title(), text and save()
  save -- if true, save the page; otherwise only log the would-be edit
  verbose -- accepted for interface compatibility; not consulted here
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)
  notes = []

  def check_bad_head(text, arg):
    # Compare head and page title with the apostrophe variants unified.
    canontext = re.sub(u"[׳’]", "'", blib.remove_links(text))
    canonpagetitle = re.sub(u"[׳’]", "'", pagetitle)
    if canontext == canonpagetitle:
      return
    pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
        (arg, canontext, canonpagetitle, arg, text))

  def drop_if_redundant(template, template_name, param):
    # Remove `param` from the template if it merely repeats the page title.
    head = getparam(template, param)
    if not head:
      return
    if link_text(pagetitle) == link_text(head):
      pagemsg("Removing redundant %s=%s" % (param, head))
      rmparam(template, param)
      notes.append("remove redundant %s= from {{%s}}" % (param, template_name))
    else:
      pagemsg("Not removing non-redundant %s=%s" % (param, head))
      check_bad_head(head, param)

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    name = unicode(t.name)
    # A template may be in both lists, so the two checks are independent.
    if name in fr_head_templates:
      drop_if_redundant(t, name, "head")
    if name in fr_head_or_1_templates:
      drop_if_redundant(t, name, "1")

    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))

  newtext = unicode(parsed)
  if newtext == text:
    return

  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  page.save(comment=comment)
Пример #50
0
def convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg,
    headword_template=None):
  """Build a {{ru-noun-table}} template equivalent to a z-decl template.

  decl_z_template -- the parsed {{ru-decl-noun-z}} template to convert; it is
    only read, never modified
  subpagetitle -- page title used to decide whether the lemma is redundant
  pagemsg -- one-argument logging callback
  headword_template -- optional headword template; when given, the animacy
    spec extracted from it is cross-checked against the z-decl animacy

  Returns the newly built template, or None (after logging a warning) when
  the gender/animacy spec is not recognized or the lemma is empty.
  """
  zdecl = unicode(decl_z_template)
  # Private copy from which every consumed param is removed, so that whatever
  # is left at the end can be reported as extraneous.
  zdeclcopy = blib.parse_text(zdecl).filter_templates()[0]
  decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
  # {{ru-decl-noun-z|ёж|m-inan|b}}
  def getp(param):
    # Mark the param as consumed in the copy, then return its stripped value
    # from the original template.
    rmparam(zdeclcopy, param)
    return getparam(decl_z_template, param).strip()
  zlemma = getp("1")
  zgender_anim = getp("2")
  zstress = getp("3")
  # Each ё in the special field is rewritten as ;ё.
  zspecial = re.sub(u"ё", u";ё", getp("4"))
  m = re.search(r"^([mfn])-(an|in|inan)$", zgender_anim)
  if not m:
    pagemsg("WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s" %
        zgender_anim)
    return None
  zgender, zanim = m.groups()

  if not zlemma:
    pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
    return None

  # Remove unnecessary gender
  # The gender is always kept for lemmas ending in и/ы or ь, for neuters
  # ending in я/а, and for masculines ending in я/а carrying the (1) special.
  need_gender = (re.search(u"[иы]́?$", zlemma) or
      zgender == "n" and re.search(u"[яа]́?$", zlemma) or
      zgender == "m" and re.search(u"[яа]́?$", zlemma) and "(1)" in zspecial or
      zlemma.endswith(u"ь"))
  if not need_gender:
    # Gender suggested by the ending alone: о/е/ё -> n, а/я -> f, else m.
    normal_gender = (re.search(u"[оеё]́?$", zlemma) and "n" or
        re.search(u"[ая]́?$", zlemma) and "f" or "m")
    if normal_gender != zgender:
      pagemsg("WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender" %
          (normal_gender, zgender))
      need_gender = True
  if need_gender:
    pagemsg("Preserving gender in z-decl: %s" % zdecl)
    # The gender letter is prepended to the special field.
    zspecial = zgender + zspecial
  else:
    pagemsg("Not preserving gender in z-decl: %s" % zdecl)

  # Remove unnecessary stress
  stressed_lemma = ru.try_to_stress(zlemma)
  def check_defstress(defstr, reason):
    # Log when the explicit stress equals the default; always returns the
    # default so the caller can compare against it.
    if defstr == zstress:
      pagemsg("Removing stress %s as default because %s: stressed_lemma=%s, template=%s" %
          (defstr, reason, stressed_lemma, zdecl))
    return defstr
  if ru.is_nonsyllabic(stressed_lemma):
    default_stress = check_defstress("b", "nonsyllabic lemma")
  elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
    default_stress = check_defstress("b", "ending-accented lemma")
  # No need for special-casing for ёнок or а́нин, as they are considered
  # accent a by ru-decl-noun-z
  else:
    default_stress = check_defstress("a", "stem-accented lemma")
  if default_stress == zstress:
    zstress = ""
  else:
    pagemsg("Not removing stress %s: %s" % (zstress, zdecl))

  # Remove unnecessary lemma
  if ru.try_to_stress(subpagetitle) == stressed_lemma:
    pagemsg(u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s" %
        (zlemma, subpagetitle, zdecl))
    zlemma = ""

  # Positional layout is [stress,] lemma, special: the stress occupies 1= only
  # when it is non-default, shifting the other two by one.
  if zstress:
    decl_template.add("1", zstress)
    offset = 1
  else:
    offset = 0
  decl_template.add(str(1 + offset), zlemma)
  decl_template.add(str(2 + offset), zspecial)
  # Trim trailing empty positional params, highest first; a param is only
  # removed if everything after it was removed too.
  if not getparam(decl_template, "3"):
    rmparam(decl_template, "3")
    if not getparam(decl_template, "2"):
      rmparam(decl_template, "2")
      if not getparam(decl_template, "1"):
        rmparam(decl_template, "1")

  # None/falsy when no headword template was supplied.
  headword_anim_spec = headword_template and extract_headword_anim_spec(headword_template)
  def anim_mismatch(zdecl_an, allowed_headword_ans):
    # Warn when a headword animacy is known and is not among the allowed ones.
    if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
      pagemsg("WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s" %
          (zdecl_an, headword_anim_spec, ",".join(allowed_headword_ans),
            zdecl, unicode(headword_template)))

  if zanim == "an":
    anim_mismatch(zanim, ["an"])
    pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
    decl_template.add("a", "an")
  elif zanim == "inan":
    anim_mismatch(zanim, ["ai", "ia"])
    # -inan is resolved to ai/ia using the headword; without that
    # information it falls back to a=bi.
    if headword_anim_spec in ["ai", "ia"]:
      pagemsg("Converting z-decl -inan to a=%s: %s" %
          (headword_anim_spec, zdecl))
      decl_template.add("a", headword_anim_spec)
    else:
      pagemsg("WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s" %
          (zdecl, unicode(headword_template or "(no headword)")))
      decl_template.add("a", "bi")
  else:
    assert(zanim == "in")
    anim_mismatch(zanim, ["in"])
    pagemsg("Dropping z-decl -in as default: %s" % zdecl)

  znum = getp("n")
  if znum:
    if znum == "pl":
      pagemsg("WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s" %
          zdecl)
    pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
    decl_template.add("n", znum)

  # Per-form overrides that are copied across (some under a new name).
  preserve_params = [
    'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg',
    'nom_pl', 'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl',
    'voc'
  ]
  renamed_params = {'prp_sg':'pre_sg', 'prp_pl':'pre_pl'}

  for param in preserve_params:
    val = getp(param)
    if not val:
      continue
    newval = fixup_link(val)
    # Normalize the comma-separated list and unwrap values that consist of
    # nothing but a single plain [[link]].
    newvals = re.split(r"\s*,\s*", newval)
    newvals = [re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", x) for x in newvals]
    newval= ",".join(newvals)
    newparam = renamed_params.get(param, param)
    pagemsg("Preserving z-decl override %s=%s%s%s: %s" % (
      newparam, newval,
      "" if newparam == param else "; renamed from %s" % param,
      "" if newval == val else "; canonicalized from %s=%s" % (param, val),
      zdecl))
    decl_template.add(newparam, newval)
  loc = getp("loc")
  if loc:
    # loc=в and loc=на map to "в +" / "на +"; any other value maps to
    # "в/на +".
    if loc == u"в":
      newloc = u"в +"
    elif loc == u"на":
      newloc = u"на +"
    else:
      newloc = u"в/на +"
    pagemsg("Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s" %
        (newloc, loc, zdecl))
    decl_template.add("loc", newloc)
  par = getp("par")
  if par:
    # Any non-empty partitive value is replaced by "+".
    newpar="+"
    pagemsg("Preserving z-decl partitive par=%s (canonicalized from par=%s): %s" %
        (newpar, par, zdecl))
    decl_template.add('par', newpar)
  notes = getp("note")
  if notes:
    pagemsg("WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s" %
        (notes, zdecl))
    decl_template.add('notes', notes)

  # Anything still present in the copy was never consumed by getp() above.
  if zdeclcopy.params:
    pagemsg("WARNING: Extraneous params in z-decl: %s" % unicode(zdeclcopy))

  #pagemsg("Replacing z-decl %s with regular decl %s" %
  #    (zdecl, unicode(decl_template)))
  return decl_template
Пример #51
0
def split_one_page_etymologies(page, index, pagetext, verbose):
  """Regroup the Arabic section of a page into per-lemma etymology sections.

  The first ==Arabic== section is split into level-3 subsections, which are
  grouped by the lemma of their headword template (and, for verb forms, by
  the lemma of their 'inflection of' template). Adjacent groups whose verb
  form class is I are merged. The section is then rebuilt with numbered
  ===Etymology N=== headers when more than one group remains, or as a single
  group otherwise.

  page -- page object; only title() is used
  index -- page counter, used only in log messages
  pagetext -- current text of the page
  verbose -- if true, log the full before/after text on change

  Returns (pagetext, comment): the possibly rewritten text and the change
  comment, which is None when nothing other than trailing whitespace changed.
  """

  # Fetch pagename, create pagemsg() fn to output msg with page name included
  pagename = page.title()
  pagetext = unicode(pagetext)
  def pagemsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  comment = None
  notes = []

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n)(\n*(\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      pagetext, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = pagetext
    pagetail = ""

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Arabic section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("WARNING: Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
      if mm:
        # Safe although we are iterating by index: we break out of the loop
        # at the end of this branch.
        sections[i:i+1] = [mm.group(1), mm.group(2)]
      elif i < len(sections) - 1:
        pagemsg("WARNING: Arabic language section %s is non-final and missing trailing separator" % i)

      # The header text must be allowed to span more than one character,
      # otherwise practically no header is ever checked.
      for mm in re.finditer("^(==+)[^=\n]+(==+)$", sections[i], re.M):
        if mm.group(1) != mm.group(2):
          pagemsg("WARNING: Malconstructed header: %s" % mm.group(0))

      subsections = re.split("(^===[^=\n]+=+\n)", sections[i], 0, re.M)
      if len(subsections) < 2:
        pagemsg("WARNING: Page missing any entries")

      etymologies = []
      etymsections = []
      sechead = subsections[0]
      if "\n===Etymology 1=" in sections[i]:
        etyms_were_separate = True
        for j in xrange(1, len(subsections), 2):
          if not re.match("^===Etymology [0-9]+=", subsections[j]):
            pagemsg("WARNING: Non-etymology level-3 header when split etymologies: %s" % subsections[j][0:-1])
        etymsections = [subsections[j] for j in xrange(2, len(subsections), 2)]
        # Reduce indent by one. We will increase it again when we split
        # etymologies.
        # NOTE(review): nothing in this function increases the header level
        # again when emitting multiple etymologies -- confirm whether that
        # step is still missing or is done elsewhere.
        for j in xrange(len(etymsections)):
          etymsections[j] = re.sub("^==", "=", etymsections[j], 0, re.M)
      else:
        etyms_were_separate = False
        # Must be a one-element list: iterating a bare string below would
        # walk it character by character.
        etymsections = [''.join(subsections[1:])]

      for etymsection in etymsections:
        subsections = re.split("(^===[^=\n]+=+\n)", etymsection, 0, re.M)
        if len(subsections) < 2:
          pagemsg("WARNING: Section missing any entries")
        split_sections = []
        next_split_section = 0
        def append_section(k):
          # Append header k and its body to the current split section,
          # creating empty split sections up to that index as needed.
          while len(split_sections) <= next_split_section:
            split_sections.append("")
          split_sections[next_split_section] += \
              subsections[k] + subsections[k + 1]

        last_lemma = None
        last_inflection_of_lemma = None
        for j in xrange(1, len(subsections), 2):
          if re.match("^===+(References|Related|See)", subsections[j]):
            pagemsg("Found level-3 section that should maybe be at higher level: %s" % subsections[j][0:-1])
            append_section(j)
          elif re.match("^===+(Alternative|Etymology)", subsections[j]):
            append_section(j)
          else:
            parsed = blib.parse_text(subsections[j + 1])
            lemma = None
            inflection_of_lemma = None
            for t in parsed.filter_templates():
              if t.name in arabic_all_headword_templates:
                if lemma:
                  if t.name not in ["ar-nisba", "ar-noun-nisba", "ar-verb",
                      "ar-verb-form"]:
                    pagemsg("Found multiple headword templates in section %s: %s" % (j, subsections[j][0:-1]))
                # Note: For verbs this is the form class, which we match on
                lemma = reorder_shadda(remove_links(getparam(t, "1")))
              if t.name == "inflection of":
                if inflection_of_lemma:
                  pagemsg("Found multiple 'inflection of' templates in section %s: %s" % (j, subsections[j][0:-1]))
                inflection_of_lemma = remove_diacritics(
                    remove_links(getparam(t, "1")))
            if not lemma:
              pagemsg("Warning: No headword template in section %s: %s" % (j, subsections[j][0:-1]))
              append_section(j)
            else:
              # NOTE(review): the very first headword also starts a new split
              # section (last_lemma is None), separating it from any
              # preceding Etymology/Alternative forms text -- confirm that
              # this is intended.
              if lemma != last_lemma:
                next_split_section += 1
              elif (inflection_of_lemma and last_inflection_of_lemma and
                  inflection_of_lemma != last_inflection_of_lemma):
                pagemsg("Verb forms have different inflection-of lemmas %s and %s, splitting etym" % (
                  last_inflection_of_lemma, inflection_of_lemma))
                next_split_section += 1
              last_lemma = lemma
              last_inflection_of_lemma = inflection_of_lemma
              append_section(j)
        etymologies += split_sections

      def get_form_class(k):
        # Return the verb form class (param 1 of ar-verb / ar-verb-form) found
        # in etymology number k, or None if there is none.
        formclass = None
        parsed = blib.parse_text(etymologies[k])
        for t in parsed.filter_templates():
          if t.name in ["ar-verb", "ar-verb-form"]:
            newformclass = getparam(t, "1")
            if formclass and newformclass and formclass != newformclass:
              pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" % (formclass, newformclass))
            formclass = newformclass
        return formclass

      # Combine adjacent etymologies with same verb form class I.
      # FIXME: We might not want to do this; the etymologies might be
      # legitimately split. Need to check each case.
      j = 0
      while j < len(etymologies) - 1:
        formclassj = get_form_class(j)
        formclassj1 = get_form_class(j + 1)
        if formclassj == "I" and formclassj1 == "I":
          if not etymologies[j + 1].startswith("="):
            pagemsg("WARNING: Can't combine etymologies with same verb form class because second has etymology text")
          else:
            pagemsg("Combining etymologies with same verb form class I")
            etymologies[j] = etymologies[j].rstrip() + "\n\n" + etymologies[j + 1]
            # Drop the etymology just merged in; otherwise it would be
            # emitted twice and this loop would never advance.
            del etymologies[j + 1]
            notes.append("combine adjacent etymologies with same verb form class I")
            # Cancel out effect of incrementing j below since we combined
            # the following etymology into this one
            j -= 1
        j += 1

      if len(etymologies) > 1:
        for j in xrange(len(etymologies)):
          # Stuff like "===Alternative forms===" that goes before the
          # etymology section should be moved after.
          newetymj = re.sub(r"^(.*?\n)(===Etymology===\n(\n|[^=\n].*?\n)*)",
              r"\2\1", etymologies[j], 0, re.S)
          if newetymj != etymologies[j]:
            pagemsg("Moved ===Alternative forms=== and such after Etymology")
            etymologies[j] = newetymj
          # Remove ===Etymology=== from beginning
          etymologies[j] = re.sub("^===Etymology===\n", "", etymologies[j])
          # Fix up newlines around etymology section
          etymologies[j] = etymologies[j].strip() + "\n\n"
          if etymologies[j].startswith("="):
            etymologies[j] = "\n" + etymologies[j]
        sections[i] = (sechead +
            ''.join(["===Etymology %s===\n" % (j + 1) + etymologies[j]
              for j in xrange(len(etymologies))]))
        notes.append("split Arabic section into %s etymology sections" %
            len(etymologies))
      elif len(etymologies) == 1:
        if etyms_were_separate:
          # We might need to add an Etymology header at the beginning.
          pagemsg("Combined formerly separate etymologies")
          notes.append("combine formerly separate etymologies")
          if not re.match(r"^(=|\{\{wikipedia|\[\[File:)",
              etymologies[0].strip()):
            etymologies[0] = "===Etymology===\n" + etymologies[0]
            pagemsg("Added Etymology header when previously separate etymologies combined")
          # Put Alternative forms section before Etymology.
          newetym0 = re.sub(r"^((?:\n|[^=\n].*?\n)*)(===Etymology===\n(?:\n|[^=\n].*?\n)*)(===(Alternative.*?)===\n(?:\n|[^=\n].*?\n)*)",
              r"\1\3\2", etymologies[0], 0, re.S)
          if newetym0 != etymologies[0]:
            pagemsg("Moved ===Alternative forms=== and such before Etymology")
            etymologies[0] = newetym0

        sections[i] = sechead + etymologies[0]
      else:
        sections[i] = sechead

      # Only the first Arabic section is processed.
      break

  # End of loop over sections in existing page; rejoin sections
  newtext = pagehead + ''.join(sections) + pagetail

  # Don't signal a save if only differences are whitespace at end,
  # since it appears that newlines at end get stripped when saving.
  if pagetext.rstrip() == newtext.rstrip():
    pagemsg("No change in text")
  else:
    if verbose:
      pagemsg("Replacing [[%s]] with [[%s]]" % (pagetext, newtext))
    else:
      pagemsg("Text has changed")
    pagetext = newtext

    # Construct and output comment.
    notestext = '; '.join(notes)
    if notestext:
      if comment:
        comment += " (%s)" % notestext
      else:
        comment = notestext
    # Every rewrite path above records a note, so a change always has one.
    assert(comment)
    pagemsg("comment = %s" % comment)

  return pagetext, comment
Пример #52
0
def process_page(index, page, save, verbose):
  """Replace old French conjugation templates with {{fr-conj-auto}}.

  For every template named in templates_to_change or
  refl_templates_to_change, compare_conjugation() is asked for the
  differences between the old and new conjugations. The template is rewritten
  only when that comparison comes back empty; a None result is silently
  skipped and a non-empty one is logged as a warning. A non-"avoir"
  auxiliary and reflexivity are carried over as aux= and refl=yes.

  index -- page counter, used in log messages and passed on
  page -- page object providing title(), text and save()
  save -- if true, save the page; otherwise only log the would-be edit
  verbose -- passed on to the expansion and comparison helpers
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    name = unicode(t.name)
    refl = name in refl_templates_to_change
    if not refl and name not in templates_to_change:
      continue

    difvals = compare_conjugation(index, page, t, refl, pagemsg, expand_text, verbose)
    if difvals is None:
      continue
    if difvals:
      difprops = ["%s=%s vs. %s" % (prop, oldval or "(missing)", newval or "(missing)")
          for prop, (oldval, newval) in difvals]
      pagemsg("WARNING: Different conjugation when changing template %s to {{fr-conj-auto}}: %s" %
          (unicode(t), "; ".join(difprops)))
      continue

    # Which param holds the auxiliary depends only on the template name.
    if name in ["fr-conj-e-er", "fr-conj-ir (s)"]:
      auxpname = "3"
    elif name in ["fr-conj-xx-er", "fr-conj-é-er"]:
      auxpname = "aux"
    else:
      auxpname = "2"

    aux = ""
    for param in t.params:
      paramname = unicode(param.name)
      pval = unicode(param.value)
      if not pval.strip():
        continue
      is_known = paramname in ["1", "2", "3", "aux", "sort", "cat"]
      is_bad_third = (paramname == "3" and
          pval not in ["avoir", "être", "avoir or être"])
      if not is_known or is_bad_third:
        pagemsg("WARNING: Found extra param %s=%s in %s" %
            (paramname, pval, unicode(t)))
      if paramname == "aux" and pval != "avoir":
        aux = pval
        pagemsg("Found non-avoir auxiliary aux=%s in %s" % (
          pval, unicode(t)))
      if paramname == auxpname and pval != "avoir":
        aux = pval
        pagemsg("Found non-avoir auxiliary %s=%s in %s" % (
          paramname, pval, unicode(t)))

    oldt = unicode(t)
    # Start over with an empty param list under the new template name.
    del t.params[:]
    t.name = "fr-conj-auto"
    if refl:
      t.add("refl", "yes")
    if aux:
      t.add("aux", aux)
    newt = unicode(t)
    pagemsg("Replacing %s with %s" % (oldt, newt))
    notes.append("replaced {{%s}} with %s" % (name, newt))

  newtext = unicode(parsed)
  if newtext == text:
    return

  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  page.save(comment=comment)
Пример #53
0
def process_page(index, page, save, verbose):
  """Canonicalize Russian noun-form headword templates on a single page.

  Every {{head|ru|noun form}} template is renamed to {{ru-noun form}}, and
  every existing {{ru-noun form}} is rewritten, so that the params appear in
  the order 1= (head), head2=, 2= (gender), g2=, g3=, tr=, tr2=. Params with
  a blank value are dropped, except that a blank 1= is kept whenever a
  gender is present.

  Nothing is saved (the function returns early) if the page title contains
  a colon, if a {{head|ru|noun form}} has param 3 set, if a {{ru-noun form}}
  has both 1= and head= or both 2= and g= filled in, or if a template carries
  any param other than the ones listed above.

  index -- page index; only used in log messages.
  page -- page object; its title(), text and save() are used.
  save -- if true, save the modified text; otherwise only log the comment.
  verbose -- if true, also log the complete old and new page text.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def rebuild_noun_form(t, head_params, gender_params):
    # Strip all recognized params from T and put them back in canonical
    # order under the name 'ru-noun form'. HEAD_PARAMS and GENDER_PARAMS list
    # the param names that may hold the head and the gender. Every listed
    # name is removed -- not just the first non-blank one -- so that a blank
    # duplicate (e.g. head= next to a filled-in 1=) doesn't linger and get
    # reported as an extra param; the first non-blank value wins.
    # Return False (after logging a warning) if unrecognized params remain.
    head = ""
    for pn in head_params:
      val = getrmparam(t, pn)
      head = head or val
    head2 = getrmparam(t, "head2")
    tr = getrmparam(t, "tr")
    tr2 = getrmparam(t, "tr2")
    g = ""
    for pn in gender_params:
      val = getrmparam(t, pn)
      g = g or val
    g2 = getrmparam(t, "g2")
    g3 = getrmparam(t, "g3")
    if len(t.params) > 0:
      pagemsg("WARNING: Extra params in noun form template: %s" %
          unicode(t))
      return False
    t.name = "ru-noun form"
    # A blank 1= is still needed as a placeholder when only a gender is given.
    if head or g:
      t.add("1", head)
    if head2:
      t.add("head2", head2)
    if g:
      t.add("2", g)
    if g2:
      t.add("g2", g2)
    if g3:
      t.add("g3", g3)
    if tr:
      t.add("tr", tr)
    if tr2:
      t.add("tr2", tr2)
    return True

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    template_name = unicode(t.name)
    if (template_name == "head" and getparam(t, "1") == "ru" and
        getparam(t, "2") == "noun form"):
      if getparam(t, "3"):
        pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s" %
            unicode(t))
        return
      # Drop the language and part of speech; the new template name
      # implies both.
      rmparam(t, "1")
      rmparam(t, "2")
      if not rebuild_noun_form(t, ["head"], ["g"]):
        return
      note = "convert {{head|ru|noun form}} to {{ru-noun form}}"
    elif template_name == "ru-noun form":
      if getparam(t, "head") and getparam(t, "1"):
        pagemsg("WARNING: ru-noun form has both params 1= and head=: %s" %
            unicode(t))
        return
      if getparam(t, "g") and getparam(t, "2"):
        pagemsg("WARNING: ru-noun form has both params 2= and g=: %s" %
            unicode(t))
        return
      if not rebuild_noun_form(t, ["1", "head"], ["2", "g"]):
        return
      note = "canonicalize ru-noun form"
    else:
      continue
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
      notes.append(note)

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Every change made above is accompanied by a note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Пример #54
0
def process_page(index, page, save, verbose):
  """Remove redundant params from {{fr-adj}} templates on a single page.

  For each {{fr-adj}} the expected masculine plural, feminine, feminine
  plural and (for 1=mf adjectives) plural are derived from the page title;
  an explicit mp=, f=, fp=, p= or 1=mf that merely repeats the derived value
  is removed. A value of 1= other than "mf" is removed as bogus. Templates
  with inv= or with any of f2=, mp2=, fp2=, p2= are not simplified further,
  and various suspicious combinations are only logged.

  Pages whose title contains a colon are skipped.

  index -- page index; only used in log messages.
  page -- page object; its title(), text and save() are used.
  save -- if true, save the modified text; otherwise only log the comment.
  verbose -- not used by this function.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []

  def default_plural():
    # Plural derived from the page title: unchanged after -s/-x,
    # -al -> -aux, otherwise add -s.
    if re.search("[sx]$", pagetitle):
      return pagetitle
    if pagetitle.endswith("al"):
      return re.sub("al$", "aux", pagetitle)
    return pagetitle + "s"

  def default_feminine():
    # Feminine derived from the page title. The order of the tests matters:
    # -ieur and -teur must be checked before the more general -eur.
    if pagetitle.endswith("e"):
      return pagetitle
    if pagetitle.endswith("en"):
      return pagetitle + "ne"
    if pagetitle.endswith("er"):
      return re.sub("er$", u"ère", pagetitle)
    if pagetitle.endswith("el"):
      return pagetitle + "le"
    if pagetitle.endswith("on"):
      return pagetitle + "ne"
    if pagetitle.endswith("et"):
      return pagetitle + "te"
    if pagetitle.endswith("ieur"):
      return pagetitle + "e"
    if pagetitle.endswith("teur"):
      return re.sub("teur$", "trice", pagetitle)
    if pagetitle.endswith("eur"):
      return re.sub("eur$", "euse", pagetitle)
    if pagetitle.endswith("eux"):
      return re.sub("eux$", "euse", pagetitle)
    if pagetitle.endswith("if"):
      return re.sub("if$", "ive", pagetitle)
    if pagetitle.endswith("c"):
      return re.sub("c$", "que", pagetitle)
    return pagetitle + "e"

  def simplify_fr_adj(t):
    # Remove redundant params from a single {{fr-adj}} template in place,
    # recording a note for each removal. Returns early (leaving whatever
    # has been removed so far) when the template can't be analyzed further.
    g = getparam(t, "1")
    if g and g != "mf":
      pagemsg("WARNING: Strange value 1=%s, removing: %s" % (g, unicode(t)))
      rmparam(t, "1")
      notes.append("remove bogus 1=%s" % g)
      g = None
    inv = getparam(t, "inv")
    if inv:
      if inv not in ["y", "yes", "1"]:
        pagemsg("WARNING: Strange value inv=%s: %s" % (inv, unicode(t)))
      if (getparam(t, "1") or getparam(t, "f") or
          getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
        pagemsg("WARNING: Found extraneous params with inv=: %s" %
            unicode(t))
      return
    if (getparam(t, "f2") or getparam(t, "mp2") or getparam(t, "fp2")
        or getparam(t, "p2")):
      pagemsg("Skipping multiple feminines or plurals: %s" % unicode(t))
      return
    # The same derived plural serves for mp= and, for 1=mf adjectives, p=.
    expected_plural = default_plural()
    if getparam(t, "mp") == expected_plural:
      rmparam(t, "mp")
      notes.append("remove redundant mp=")
    expected_fem = default_feminine()
    if re.search("(el|on|et|[^i]eur|eux|if|c)$", pagetitle) and not getparam(t, "f") and g != "mf":
      pagemsg("WARNING: Found suffix -el/-on/-et/-[^i]eur/-eux/-if/-c and no f= or 1=mf: %s" % unicode(t))
    if getparam(t, "f") == expected_fem:
      rmparam(t, "f")
      notes.append("remove redundant f=")
    fem = getparam(t, "f") or expected_fem
    if not fem.endswith("e"):
      if not getparam(t, "fp"):
        pagemsg("WARNING: Found f=%s not ending with -e and no fp=: %s" %
            (fem, unicode(t)))
      return
    expected_fp = fem + "s"
    if getparam(t, "fp") == expected_fp:
      rmparam(t, "fp")
      notes.append("remove redundant fp=")
    if getparam(t, "fp") and not getparam(t, "f"):
      pagemsg("WARNING: Found fp=%s and no f=: %s" % (getparam(t, "fp"),
        unicode(t)))
      return
    if getparam(t, "fp") == fem:
      pagemsg("WARNING: Found fp=%s same as fem=%s: %s" % (getparam(t, "fp"),
        fem, unicode(t)))
      return
    # A title in -e with no explicit feminine forms is treated as 1=mf,
    # which makes an explicit 1=mf redundant.
    if pagetitle.endswith("e") and not getparam(t, "f") and not getparam(t, "fp"):
      if g == "mf":
        rmparam(t, "1")
        notes.append("remove redundant 1=mf")
      g = "mf"
    if g == "mf":
      f = getparam(t, "f")
      if f:
        pagemsg("WARNING: Found f=%s and 1=mf: %s" % (f, unicode(t)))
      mp = getparam(t, "mp")
      if mp:
        pagemsg("WARNING: Found mp=%s and 1=mf: %s" % (mp, unicode(t)))
      fp = getparam(t, "fp")
      if fp:
        pagemsg("WARNING: Found fp=%s and 1=mf: %s" % (fp, unicode(t)))
      if f or mp or fp:
        return
      if getparam(t, "p") == expected_plural:
        rmparam(t, "p")
        notes.append("remove redundant p=")
    elif getparam(t, "p"):
      pagemsg("WARNING: Found unexpected p=%s: %s" % (getparam(t, "p"),
        unicode(t)))
    if not re.search("[ -]", pagetitle) and (getparam(t, "f") or
        getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
      pagemsg("Found remaining explicit feminine or plural in single-word base form: %s"
          % unicode(t))

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "fr-adj":
      simplify_fr_adj(t)
    # Log the replacement even when simplify_fr_adj() bailed out early,
    # since params may already have been removed by then and will be saved.
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))

  newtext = unicode(parsed)
  if newtext != text:
    # Every param removal above is accompanied by a note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Пример #55
0
def process_page(index, page, save, verbose):
  """Canonicalize verb-form definitions in the ==Russian== section of a page.

  The following passes are run, in order, over the Russian section:

  1. Rename {{conjugation of|lang=ru|...}} to {{inflection of|...}}.
  2. Split a definition line using the form code 'present or future' into
     two lines, one with 'pres' and one with 'fut'.
  3. Replace long-form (and parenthesized) form codes in params 3-19 of
     {{inflection of|lang=ru}} with their abbreviations.
  4. Drop blank form codes, remove nocat= and put the params in the order
     lang=, 1, 2, tr=, form codes, remaining named params.
  5. Put imperative, past and present/future form codes in canonical order,
     adding the implied '2' (imperative) or 'ind' (past, present, future).
  6. Convert raw-text definitions of participles, past-tense forms,
     imperatives and present/future forms into {{inflection of}}.

  Nothing is saved if the page title contains a colon or if the page has
  more than one ==Russian== section.

  index -- page index; only used in log messages.
  page -- page object; its title(), text and save() are used.
  save -- if true, save the modified text; otherwise only log the comment.
  verbose -- if true, also log the complete old and new page text.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  def is_ru_inflection_of(t):
    return unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru"

  def fetch_form_codes(t):
    # Return the non-blank values of numbered params 3 through 19.
    codes = []
    for i in xrange(3,20):
      val = getparam(t, str(i))
      if val:
        codes.append(val)
    return codes

  # Long-form and parenthesized form codes, with their abbreviations.
  form_code_abbrevs = [
      ("indc", "ind"), ("indicative", "ind"),
      ("futr", "fut"), ("future", "fut"),
      ("impr", "imp"), ("imperative", "imp"),
      ("perfective", "pfv"), ("(perfective)", "pfv"),
      ("imperfective", "impfv"), ("(imperfective)", "impfv"),
      ("singular", "s"), ("(singular)", "s"),
      ("plural", "p"), ("(plural)", "p"),
      ("masculine", "m"), ("(masculine)", "m"),
      ("feminine", "f"), ("(feminine)", "f"),
      ("neuter", "n"), ("(neuter)", "n"), ("neutral", "n"), ("(neutral)", "n"),
      ]

  # Canonical form-code sequences for imperatives and past-tense forms,
  # keyed by the (unordered) set of form codes found in the template.
  canon_by_code_set = {
      frozenset(['s', 'pfv', 'imp']): ['2', 's', 'pfv', 'imp'],
      frozenset(['s', 'impfv', 'imp']): ['2', 's', 'impfv', 'imp'],
      frozenset(['s', 'imp']): ['2', 's', 'imp'],
      frozenset(['p', 'pfv', 'imp']): ['2', 'p', 'pfv', 'imp'],
      frozenset(['p', 'impfv', 'imp']): ['2', 'p', 'impfv', 'imp'],
      frozenset(['p', 'imp']): ['2', 'p', 'imp'],
      frozenset(['m', 's', 'past']): ['m', 's', 'past', 'ind'],
      frozenset(['f', 's', 'past']): ['f', 's', 'past', 'ind'],
      frozenset(['n', 's', 'past']): ['n', 's', 'past', 'ind'],
      frozenset(['p', 'past']): ['p', 'past', 'ind'],
      }

  # Fragments shared by all the raw-definition regexes used in pass 6.
  # RAW_START matches the start of a definition line ("# ") or an open
  # paren, followed by optional apostrophes (wiki italic/bold markup).
  # RAW_OF_LEMMA matches "of" followed by the lemma written as [[...]],
  # {{l|ru|...}}, {{m|ru|...}} or {{term|...|lang=ru}}, with optional
  # apostrophes around it; the lemma is the last capture group.
  raw_start = r"(# |\()'*"
  raw_of_lemma = r"of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*"

  # Replacement function for raw-specified participial inflections
  def add_participle_inflection_of(m):
    prefix = m.group(1)
    tense = m.group(2).lower()
    if tense == "present":
      tense = "pres"
    voice = m.group(3).lower()
    if voice == "active":
      voice = "act"
    elif voice == "passive":
      voice = "pass"
    elif voice == "adverbial":
      voice = "adv"
    lemma = m.group(4)
    retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
    pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
    notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
    return retval

  # Replacement function for raw-specified past inflections
  def add_past_inflection_of(m):
    prefix = m.group(1)
    gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
        "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
    lemma = m.group(3)
    # The plural has no gender, so it takes no separate 's' code.
    retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender, gender != "p" and "|s" or "")
    pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
    notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender, gender != "p" and "/s" or ""))
    return retval

  # Replacement function for raw-specified imperative inflections
  def add_imper_inflection_of(m):
    prefix = m.group(1)
    number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
    lemma = m.group(3)
    retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
    pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
    notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
    return retval

  # Replacement function for raw-specified finite pres/fut inflections
  def add_pres_fut_inflection_of(m):
    prefix = m.group(1)
    # First character of "1st"/"2nd"/"3rd" is the person code.
    person = m.group(2)[0]
    number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
    tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
    lemma = m.group(5)
    retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
    pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
    notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
    return retval

  # Raw-definition patterns (the part between RAW_START and RAW_OF_LEMMA)
  # paired with the function that builds the replacement; applied in order.
  raw_conversions = [
      (r"(present|past) participle (active|passive|adverbial) ",
        add_participle_inflection_of),
      (r"(present|past) (active|passive|adverbial) participle ",
        add_participle_inflection_of),
      (r"(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)",
        add_past_inflection_of),
      (r"past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)",
        add_past_inflection_of),
      (r"(singular|plural) imperative (?:form |)",
        add_imper_inflection_of),
      (r"imperative (singular|plural) (?:form |)",
        add_imper_inflection_of),
      (r"(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)",
        add_pres_fut_inflection_of),
      ]

  foundrussian = False
  # Odd indices hold the level-2 headers, even indices the section bodies.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    # Try to canonicalize existing 'conjugation of'
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
        origt = unicode(t)
        t.name = "inflection of"
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("converted 'conjugation of' to 'inflection of'")
    sections[j] = unicode(parsed)

    # Try to split 'inflection of' containing 'present or future' into two
    # defns
    newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
        r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
        sections[j], 0, re.M)
    if newsec != sections[j]:
      notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
      sections[j] = newsec

    # Abbreviate long-form and parenthesized form codes (tense, mood,
    # aspect, number, gender) as listed in form_code_abbrevs
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_ru_inflection_of(t):
        continue
      for frm, to in form_code_abbrevs:
        origt = unicode(t)
        for i in xrange(3,20):
          val = getparam(t, str(i))
          if val == frm:
            t.add(str(i), to)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("converted '%s' form code to '%s'" % (frm, to))
    sections[j] = unicode(parsed)

    # Remove blank form codes and canonicalize position of lang=, tr=
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_ru_inflection_of(t):
        continue
      origt = unicode(t)
      # Fetch the numbered params starting with 3, skipping blank ones
      numbered_params = fetch_form_codes(t)
      # Fetch param 1 and param 2, and non-numbered params except lang=,
      # nocat= and tr= (lang= and tr= are put back explicitly below;
      # nocat= is dropped).
      param1 = getparam(t, "1")
      param2 = getparam(t, "2")
      tr = getparam(t, "tr")
      nocat = getparam(t, "nocat")
      non_numbered_params = []
      for param in t.params:
        param_name = unicode(param.name)
        if not re.search(r"^[0-9]+$", param_name) and param_name not in ["lang", "nocat", "tr"]:
          non_numbered_params.append((param_name, param.value))
      # Erase all params.
      del t.params[:]
      # Put back lang, param 1, param 2, tr, then the replacements for the
      # higher numbered params, then the non-numbered params.
      t.add("lang", "ru")
      t.add("1", param1)
      t.add("2", param2)
      if tr:
        t.add("tr", tr)
      for i, param in enumerate(numbered_params):
        t.add(str(i+3), param)
      for name, value in non_numbered_params:
        t.add(name, value)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
        if nocat:
          notes.append("removed nocat=")
    sections[j] = unicode(parsed)

    # Try to canonicalize 'inflection of' involving the imperative,
    # past, present, future
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_ru_inflection_of(t):
        continue
      # Fetch the (non-blank) numbered params starting with 3
      numbered_params = fetch_form_codes(t)
      # Imperative and past are recognized regardless of the order of the
      # codes; present and future only in the order person/number/tense.
      canon_params = canon_by_code_set.get(frozenset(numbered_params), [])
      if not canon_params:
        m = re.search(r"^([123])/([sp])/(pres|fut)$", "/".join(numbered_params))
        if m:
          canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
      if canon_params:
        origt = unicode(t)
        # Fetch param 1 and param 2. Erase all numbered params.
        # Put back param 1 and param 2 (this will put them after lang=ru),
        # then the replacements for the higher params.
        param1 = getparam(t, "1")
        param2 = getparam(t, "2")
        for i in xrange(19,0,-1):
          rmparam(t, str(i))
        t.add("1", param1)
        t.add("2", param2)
        for i, param in enumerate(canon_params):
          t.add(str(i+3), param)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
        else:
          pagemsg("Apparently already canonicalized: %s" % newt)
    sections[j] = unicode(parsed)

    # Try to add 'inflection of' to raw-specified participial, past,
    # imperative and finite pres/fut inflections
    for pattern, replacement in raw_conversions:
      sections[j] = re.sub(raw_start + pattern + raw_of_lemma, replacement,
          sections[j], 0, re.I)

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Every change made above is accompanied by a note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Пример #56
0
def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  pagename = page.title()
  comments = []

  def pgmsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)

  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""

  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
        or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
        or text)

  # Remove definite article al- from transliterated text
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text) or
        sub_if("^a?l-", "", text) or
        text)

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      page.text, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""

  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)

  # Extract off head and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""

      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []

        def add_note(note):
          if note not in notes:
            notes.append(note)

        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in correct order but we don't mess with other text to
          # avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))

          def pagemsg(text):
            pgmsg("%s: [[%s]]" % (text, subsections[j]))

          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table

          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates() if temp.name in
                ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue

          # Retrieve headword_template, make sure exactly one and it is the right type
          headword_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["ar-noun", "ar-proper noun", "ar-coll-noun", "ar-sing-noun",
                "ar-noun-pl", "ar-noun-dual", "ar-adj-fem", "ar-adj-pl",
                "ar-noun-inf-cons", "ar-adj-inf-def",
                "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
                "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" % (tempname, headword_template.name))
            continue
          def getp(param):
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)
          head = getp("1")
          orighead = head

          # Check for declension already present
          if (j + 1 < len(subsections) and
              re.match("^===+Declension===+\n", subsections[j + 1])
              or j + 3 < len(subsections) and
              re.match("^===+Usage", subsections[j + 1]) and
              re.match("^===+Declension===+\n", subsections[j + 3])
              ):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue

          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue

          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
            head = pagename
            putp("1", head)

          # Try to handle cases with a modifier; we can't handle all of them yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert(len(words) == 2)

            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue

            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival

            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext):
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word

            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")

            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")

            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")

            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")

            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                  continue

            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)

            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")

            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert(len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod")))

            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert(len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, either from construct
              # state of from removal of i3rab in a feminine noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")

          else:
            # no space in head, not dealing with a modifier

            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)

            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")
              # Also remove al- from remaining head and pl params
              def check_for_al(param):
                # Rewrite the headword-template param named `param` with its
                # value passed through remove_al(), first stripping wikilinks
                # from the value if it contains any of '[', ']' or '|'.
                # Does nothing when the param is absent or empty. (The param
                # name itself is also run through remove_links().)
                param = remove_links(param)
                value = getparam(headword_template, param)
                if not value:
                  return
                if any(ch in value for ch in "[]|"):
                  pagemsg("Param %s value %s has link in it" % (param, value))
                  add_note("removed links from %s" % param)
                  value = remove_links(value)
                putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))
              # Also remove al- from transliteration
              def check_for_al_tr(param):
                # Rewrite the transliteration param named `param` with its
                # value passed through remove_al_tr(). Does nothing when the
                # param is absent or empty.
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              # Transliteration params must go through check_for_al_tr():
              # check_for_al() applies remove_al(), the helper used for the
              # Arabic-script params, whereas translit values elsewhere in
              # this function are handled with remove_al_tr().
              check_for_al_tr("tr")
              for param in params_to_check:
                check_for_al_tr("%str" % param)
              for i in xrange(2, 10):
                check_for_al_tr("tr%s" % i)
                for param in params_to_check:
                  check_for_al_tr("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue

            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this case

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")

          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.

          def param_should_be_removed(param):
            # Return True if `param` must not be copied from the headword
            # template into the declension template:
            #   * sc=Arab;
            #   * any param whose name ends in "tr" (transliterations);
            #   * any name listed in `removeparams`; an all-lowercase entry
            #     also covers its numbered variants (e.g. "foo" matches
            #     "foo", "foo2", "foo10").
            key = unicode(param.name)
            if key == "sc" and unicode(param.value) == "Arab":
              return True
            if key.endswith("tr"):
              return True

            def covered_by(removable):
              if key == removable:
                return True
              if not re.match("^[a-z]+$", removable):
                return False
              return bool(re.match("^%s([0-9]+)?$" % removable, key))

            return any(covered_by(removable) for removable in removeparams)

          def remove_i3rab(param):
            # Return the text of `param` with a trailing UN removed. When the
            # text ends in UN, the removal is logged and recorded as a note.
            # NOTE(review): `pgmsg` is not defined in the visible part of this
            # function (the surrounding code logs via `pagemsg`) -- confirm
            # that it exists.
            raw = unicode(param)
            ends_in_un = raw.endswith(UN)
            if ends_in_un:
              pgmsg("Removing i3rab from %s: %s" % (raw,
                unicode(headword_template)))
              add_note("removing i3rab")
            stripped = re.sub(UN + "$", "", raw)
            return stripped

          def trparam(name):
            # Map a headword-template param name to the name of the param
            # holding its transliteration:
            #   "1" -> "tr"; "headN" -> "trN"; anything else -> name + "tr".
            if name == "1":
              return "tr"
            if not name.startswith("head"):
              return name + "tr"
            return name.replace("head", "tr")

          def process_param(param):
            # Convert one headword-template param into the text used for it
            # in the declension template: strip i3rab via remove_i3rab(),
            # rewrite the special values below, and (outside modifier
            # constructions) append any manual translit as "/translit".
            arabic = remove_i3rab(param)
            special_values = (
              # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
              # to signal the strong plural.
              ("=+", "=sp"),
              # Value of - is used in ar-adj-in to signal an unknown
              # feminine plural.
              ("=-", "=?"),
            )
            for marker, replacement in special_values:
              if arabic.endswith(marker):
                converted = arabic[:-len(marker)] + replacement
                pgmsg("Converting %s to %s: %s" % (arabic,
                  converted, unicode(headword_template)))
                arabic = converted
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if headspace:
              return arabic
            tr = getparam(headword_template, trparam(unicode(param.name)))
            if not tr:
              return arabic
            return arabic + "/" + tr

          params = '|'.join([process_param(param) for param in headword_template.params if not param_should_be_removed(param)])
          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'

          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          body += (subsections[j - 1].replace(pos, "=Declension=") +
              "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail
          comment = "added declension for %s %s" % (tempname,
            remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
          sections[seci] = ''.join(subsections) + sectail
  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  assert((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      page.save(comment = comment)
Пример #57
0
def process_page(index, page, save, verbose):
  """Replace manual past passive participles in {{ru-conj}} by a variant code.

  For each {{ru-conj}} template that specifies past passive participles
  (PPP's) by hand, try appending a "+p..." variant to param 2, regenerate the
  forms through {{ru-generate-verb-forms}} and, if the generated PPP's are
  exactly the manual ones, drop the manual params in favor of the variant.

  index -- page counter, used only as a prefix in log messages
  page -- page object; its title(), text and save() are used
  save -- if true, store the new text and save the page; else just log
  verbose -- passed to blib.expand_text(); if true, also log full old/new text

  Returns None. Raises AssertionError if the text changed with no note
  recorded.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  manual_ppp_forms = ["past_pasv_part", "past_pasv_part2", "past_pasv_part3",
    "past_pasv_part4", "ppp", "ppp2", "ppp3", "ppp4"]
  notes = []

  def switch_to_auto_ppp(conj):
    # Handle one {{ru-conj}} template; `conj` is modified in place only when
    # some variant reproduces the manual PPP's exactly.
    manual_ppps = [value
      for value in (getparam(conj, form) for form in manual_ppp_forms)
      if value and value != "-"]
    if not manual_ppps:
      return
    if any(unicode(p.value) == "or" for p in conj.params):
      pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(conj))
      return
    curvariant = getparam(conj, "2")
    if any(marker in curvariant for marker in ("+p", "(7)", "(8)")):
      pagemsg("WARNING: Found both manual PPP and PPP variant, something wrong: %s" %
          unicode(conj))
      return

    # Experiment on a copy with the manual PPP params removed, so the real
    # template stays untouched unless a variant is accepted.
    scratch = blib.parse_text(unicode(conj)).filter_templates()[0]
    for form in manual_ppp_forms:
      rmparam(scratch, form)

    first_ppp = manual_ppps[0]
    variants_to_try = ["+p"]
    if u"ё" in re.sub(u"ённый$", "", first_ppp):
      variants_to_try.append(u"+pё")
    if u"жденный" in first_ppp or u"ждённый" in first_ppp:
      variants_to_try.append(u"+pжд")

    mismatch_warnings = []
    for variant in variants_to_try:
      scratch.add("2", curvariant + variant)
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(scratch))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = rulib.split_generate_args(result)
      if "past_pasv_part" not in args:
        pagemsg("WARNING: Something wrong, no past passive participle generated: %s" % unicode(conj))
        continue
      auto_ppps = [ppp
        for form in manual_ppp_forms if form in args
        for ppp in re.split(",", args[form])
        if ppp and ppp != "-"]
      if manual_ppps != auto_ppps:
        mismatch_warnings.append("WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s" %
          (",".join(manual_ppps), ",".join(auto_ppps), unicode(conj)))
        continue
      pagemsg("Manual PPP's %s same as auto-generated PPP's, switching to auto"
          % ",".join(manual_ppps))
      for form in manual_ppp_forms:
        rmparam(conj, form)
      conj.add("2", curvariant + variant)
      notes.append("replaced manual PPP's with variant %s" % variant)
      return

    # No variant matched: only now report the collected mismatches.
    for warning in mismatch_warnings:
      pagemsg(warning)

  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    before = unicode(t)
    if unicode(t.name) == "ru-conj":
      switch_to_auto_ppp(t)
    after = unicode(t)
    if before != after:
      pagemsg("Replaced %s with %s" % (before, after))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Пример #58
0
def process_page(index, page, save, verbose):
  """Modernize citation/quotation templates on one wiki page.

  * ``reference-journal``/``reference-news``/``reference-book`` become
    ``cite-journal``/``cite-book`` inside a References section or a ``<ref>``
    tag, and ``quote-journal``/``quote-book`` everywhere else.
  * Templates listed in the externally defined ``simple_replace`` pairs are
    renamed.
  * ``cite-usenet`` and ``quote-usenet`` become ``quote-newsgroup``.
  * Related parameters are renamed or cleaned up along the way.

  index -- page counter, used only as a prefix in log messages
  page -- page object; its title(), exists(), text and save() are used
  save -- if true, store the new text and save the page; else just log
  verbose -- if true, also log the full old and new page text

  Returns None. Raises AssertionError if the text changed with no note
  recorded.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Split into alternating [body, header, body, header, ...] pieces; the
  # capturing group keeps the "==...==" header lines at the odd indices.
  subsections = re.split("(^==.*==\n)", text, 0, re.M)

  def move_param(t, fr, to, frob_from=None):
    """Rename param `fr` of template `t` to `to`.

    A blank `fr` is simply removed. If `frob_from` is given, it is called on
    the old value to produce the new one; a false or blank result leaves the
    template untouched. If `to` already has a non-blank value, nothing is
    changed and a warning is logged.
    """
    if t.has(fr):
      oldval = getparam(t, fr)
      if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return
      if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
          return
      else:
        newval = oldval

      if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
            % (fr, to, to, unicode(t)))
      elif oldval != newval:
        rmparam(t, to) # in case of blank param
        # If either old or new name is a number, use remove/add to automatically set the
        # showkey value properly; else it's safe to just change the name of the param,
        # which will preserve its location.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          tfr = t.get(fr)
          tfr.name = to
          tfr.value = newval
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
          newval.replace("\n", r"\n")))
      else:
        rmparam(t, to) # in case of blank param
        # See comment above.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          t.get(fr).name = to
        pagemsg("%s -> %s" % (fr, to))

  def fix_page_params(t):
    """Clean up page=/pages= in `t`; return True if the template changed."""
    origt = unicode(t)
    for param in ["page", "pages"]:
      pageval = getparam(t, param)
      if re.search(r"^\s*pp?\.\s*", pageval):
        # Drop a leading "p." / "pp." but keep any leading whitespace.
        pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
        t.add(param, pageval)
        notes.append("remove p(p). from %s=" % param)
        pagemsg("remove p(p). from %s=" % param)
    # A single number belongs in page=.
    if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
      move_param(t, "pages", "page")
    # NOTE(review): this only matches a number followed by a trailing dash
    # (e.g. "12-"), not a full range such as "12-15" -- confirm that this is
    # the intended condition for moving page= to pages=.
    if re.search(r"^[0-9]+[-–—]$", getparam(t, "page").strip()):
      move_param(t, "page", "pages")
    return origt != unicode(t)

  def fix_cite_book_params(t):
    """Fix book-citation params in `t`; return True if the template changed."""
    origt = unicode(t)
    if getparam(t, "origyear").strip() and getparam(t, "year").strip():
      # Strip before testing, as move_param() does, so that a
      # whitespace-only year_published= counts as blank instead of
      # triggering the warning.
      if getparam(t, "year_published").strip():
        pagemsg("WARNING: Would set year_published= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "year_published") # in case of blank param
        t.get("year").name = "year_published"
        t.get("origyear").name = "year"
        pagemsg("year -> year_published, origyear -> year")
    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    def frob_isbn(idval):
      # Return `idval` without a leading "ISBN"-style label, `idval` itself
      # if it already starts with a digit, or None (with a warning) if it
      # doesn't look like an ISBN.
      isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
      if re.search(isbn_re, idval, re.I):
        return re.sub(isbn_re, r"\1", idval, 0, re.I)
      elif re.search(r"^[0-9]", idval.strip()):
        return idval
      else:
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
            idval.replace("\n", r"\n"))
        return None
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)
    return origt != unicode(t)

  def fix_cite_usenet_params(t):
    """Rename cite-usenet params in `t`; return True if the template changed."""
    origt = unicode(t)
    move_param(t, "group", "newsgroup")
    move_param(t, "link", "url")
    return origt != unicode(t)

  def fix_quote_usenet_params(t):
    """Rename quote-usenet params in `t`; return True if the template changed.

    monthday= and year= are merged into date=, and the numbered params 1-6
    are converted to named ones.
    """
    origt = unicode(t)
    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
      # Strip before testing, as move_param() does, so that a
      # whitespace-only date= counts as blank.
      if getparam(t, "date").strip():
        pagemsg("WARNING: Would set date= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "date") # in case of blank param
        param = t.get("monthday")
        param.name = "date"
        if re.search("^[0-9]+/[0-9]+$", monthday):
          param.value = "%s/%s" % (monthday, year)
        else:
          param.value = "%s %s" % (monthday, year)
        rmparam(t, "year")
        pagemsg("monthday/year -> date")
    move_param(t, "group", "newsgroup")
    move_param(t, "text", "passage")
    # Numbered params are converted from the highest number downwards.
    move_param(t, "6", "passage")
    move_param(t, "5", "url")
    move_param(t, "4", "newsgroup")
    move_param(t, "3", "title")
    move_param(t, "2", "author")
    move_param(t, "1", "date")
    return origt != unicode(t)

  def replace_in_reference(parsed, in_what):
    """Convert reference-* templates in `parsed` to their cite-* equivalents.

    `in_what` describes the context and is only used in log messages.
    """
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "cite-journal", tname)
        pagemsg("%s -> cite-journal" % tname.strip())
        notes.append("%s -> cite-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))
      if tname.strip() == "reference-book":
        set_template_name(t, "cite-book", tname)
        pagemsg("reference-book -> cite-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> cite-book%s" % (
          " and fix book cite params" if fixed_params else ""))
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))

  # Even indices hold section bodies; the header of body j is at j - 1.
  for j in xrange(0, len(subsections), 2):
    parsed = blib.parse_text(subsections[j])
    # First pass: citation contexts (References sections and <ref> tags) get
    # cite-* templates. subsections[j] is written back once, after the
    # second pass below.
    if j > 0 and re.search(r"^===*References===*\n", subsections[j-1]):
      replace_in_reference(parsed, "==References== section")
    else:
      for t in parsed.filter_tags():
        if unicode(t.tag) == "ref":
          tagparsed = mw.wikicode.Wikicode([t])
          replace_in_reference(tagparsed, "<ref>")
    # Second pass: every reference-* template still left is outside a
    # citation context and becomes a quote-* template.
    need_to_replace_double_quote_prefixes = False
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      for fr, to in simple_replace:
        if tname.strip() == fr:
          set_template_name(t, to, tname)
          pagemsg("%s -> %s" % (fr, to))
          notes.append("%s -> %s" % (fr, to))
          fix_page_params(t)
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "quote-journal", tname)
        pagemsg("%s -> quote-journal" % tname.strip())
        notes.append("%s -> quote-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() == "reference-book":
        set_template_name(t, "quote-book", tname)
        # The log line and edit summary must name the template actually
        # used here (quote-book, not cite-book).
        pagemsg("reference-book -> quote-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> quote-book%s" % (
          " and fix book cite params" if fixed_params else ""))
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() in ["cite-usenet", "quote-usenet"]:
        if tname.strip() == "cite-usenet":
          fixed_params = fix_cite_usenet_params(t)
        else:
          fixed_params = fix_quote_usenet_params(t)
        set_template_name(t, "quote-newsgroup", tname)
        pagemsg("%s -> quote-newsgroup" % tname.strip())
        prefix = getparam(t, "prefix").strip()
        removed_prefix = False
        if prefix:
          if prefix in ["#", "#*"]:
            # Replace prefix= by literal "#* " text before the template.
            parsed.insert_before(t, "#* ")
            rmparam(t, "prefix")
            pagemsg("remove prefix=%s, insert #* before template" % prefix)
            need_to_replace_double_quote_prefixes = True
            removed_prefix = True
          else:
            pagemsg("WARNING: Found prefix=%s, not # or #*: %s" %
                (prefix, unicode(t)))
        if removed_prefix:
          prefix_note = ", remove prefix=%s, insert #* before template" % prefix
        else:
          prefix_note = ""
        params_note = ", fix params" if fixed_params else ""
        notes.append("%s -> quote-newsgroup%s%s" % (tname.strip(),
          prefix_note, params_note))
        pagemsg("Replacing %s with %s" % (origt, unicode(t)))
    subsections[j] = unicode(parsed)
    if need_to_replace_double_quote_prefixes:
      # The inserted "#* " may follow a "#* " already present in the text;
      # collapse such doubled prefixes at the start of a line.
      newval = re.sub(r"^#\* #\* ", "#* ", subsections[j], 0, re.M)
      if newval != subsections[j]:
        notes.append("remove double #* prefix")
        pagemsg("Removed double #* prefix")
      subsections[j] = newval
  newtext = "".join(subsections)

  if text != newtext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, newtext))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)