Пример #1
0
def process_page(page, index, parsed):
  """Add an explicit trailing period after qualifying {{place}} templates.

  A {{place}} template qualifies when it has none of the params t, t1, t2, t3.
  Each qualifying template gets a "." appended; if that produces a doubled
  punctuation mark ("..", ".,") the added period is removed again.

  Parameters:
    page: page object; page.title() and page.text are read.
    index: page index, used only in log messages.
    parsed: parsed wikicode of the page; parsed.filter_templates() is iterated.

  Returns:
    None for pages whose title starts with "Module:", otherwise a tuple
    (text, notes) with the possibly modified text and the list of change notes.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(page.text)
  if pagetitle.startswith("Module:"):
    return

  pagemsg("Processing")
  notes = []

  # WARNING: Not idempotent.

  # Collect the source text of every {{place}} template without a translation param.
  to_add_period = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "place" and not any(t.has(param) for param in ["t", "t1", "t2", "t3"]):
      to_add_period.append(unicode(t))

  for curr_template in to_add_period:
    repl_template = curr_template + "."
    newtext, did_replace = blib.replace_in_text(text, curr_template, repl_template, pagemsg)
    if did_replace:
      # Drop the period we just added when the template was already followed by
      # "." or ",". A callable replacement is used so that backslashes in the
      # template text are inserted literally instead of being interpreted as
      # regex replacement escapes.
      newtext = re.sub(re.escape(curr_template) + r"\.([.,])",
          lambda m, tmpl=curr_template: tmpl + m.group(1), newtext)
      if newtext != text:
        notes.append("add period to {{place}} template (formerly automatically added)")
        text = newtext

  return text, notes
Пример #2
0
    def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
        lemma = getparam(comparative_superlative_t, "2")
        if check_if_lemma_and_ending_match_pagetitle(lemma,
                                                     ending,
                                                     pagetitle,
                                                     allow_umlaut=False):
            form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
            newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
            secbody, replaced = blib.replace_in_text(
                secbody,
                unicode(comparative_superlative_t),
                unicode(comparative_superlative_t) + newsec,
                pagemsg,
                abort_if_warning=True)
            if not replaced:
                pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
                        (ending, unicode(comparative_of_t)))
                return secbody, False
            notes.append("add {{de-adj form of}} for %s" % pos)
        else:
            pagemsg(
                "WARNING: Lemma %s + %s ending %s doesn't match pagetitle" %
                (lemma, pos, ending))
        return secbody, True
Пример #3
0
def process_text_on_page(index, pagetitle, text):
    """Replace {{desc|ku|-}} anchor lines with raw 'Kurdish:' and propagate bor=.

    Lines are scanned one at a time. A bullet line (leading "*", optionally
    followed by ":") containing {{desc|ku|-}} or {{desctree|ku|-}} has that
    template replaced by the literal text "Kurdish:". While more deeply indented
    bullet lines follow, and the anchor template had a non-empty bor= param,
    bor=1 is added to each {{desc}}/{{desctree}} template on those lines.

    Parameters:
        index: page index, used only in log messages.
        pagetitle: page title, used only in log messages.
        text: full page text.

    Returns:
        (newtext, notes): the rewritten text and the list of change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []
    newlines = []
    # Indent depth of the current 'Kurdish:' anchor line, or None when not
    # inside a Kurdish descendant subtree.
    kurdish_indent = None
    # Value of bor= on the anchor template (truthy means mark children bor=1).
    kurdish_borrowing = None
    for line in text.split("\n"):
        m = re.search("^([*]+:*)", line)
        if not m:
            # Any non-bullet line ends the current Kurdish subtree.
            kurdish_indent = None
            newlines.append(line)
            continue
        thisline_indent = len(m.group(1))
        if kurdish_indent and thisline_indent <= kurdish_indent:
            # Back at or above the anchor's level: the subtree is finished.
            kurdish_indent = None
        if "{{desc|" in line or "{{desctree|" in line:
            parsed = blib.parse_text(line)
            for t in parsed.filter_templates():
                if tname(t) not in ["desc", "desctree"]:
                    continue
                thisline_lang = getparam(t, "1")
                if thisline_lang == "ku":
                    if getparam(t, "2") != "-":
                        pagemsg(
                            "WARNING: Saw real 'Kurdish' descendant rather than anchoring line: %s"
                            % unicode(t))
                        continue
                    kurdish_indent = thisline_indent
                    kurdish_borrowing = getparam(t, "bor")
                    line, did_replace = blib.replace_in_text(
                        line, unicode(t), "Kurdish:", pagemsg)
                    # Only claim the change when the replacement really happened.
                    if did_replace:
                        notes.append(
                            "replace {{desc|ku}} with raw 'Kurdish:'")
                elif kurdish_indent and thisline_indent > kurdish_indent and kurdish_borrowing:
                    t.add("bor", "1")
                    line = unicode(parsed)
                    notes.append(
                        "add bor=1 to Kurdish-language (%s) descendant"
                        % thisline_lang)
        newlines.append(line)
    newtext = "\n".join(newlines)
    return newtext, notes
Пример #4
0
def process_text_on_page(index, pagetitle, text):
    """Convert German adjective-form entries to {{de-adj form of}}.

    Only pages containing a {{head|de|...}} adjective/participle form headword
    are processed. Within the German section:

    * {{inflection of|de|...}} / {{infl of|de|...}} is rewritten in place to
      {{de-adj form of|LEMMA}} (plus an explicit ending when it cannot be
      inferred from the page title), and the head template's part of speech
      is adjusted for comparative/superlative-degree tags.
    * {{comparative of}} / {{superlative of}} cause the head template to be
      changed to "comparative adjective" / "superlative adjective"; when no
      {{inflection of}} is present, an extra ===Adjective=== section with
      {{de-adj form of}} is appended after them.
    * When an {{inflection of}} with a superlative ending exists but no
      {{superlative of}}, a {{superlative of}} section is inserted before it.

    Parameters:
        index: page index, used only in log messages.
        pagetitle: page title; also checked against expected endings.
        text: full page text.

    Returns:
        None when the page is skipped (a message is logged in most cases),
        otherwise (text, notes) with the rewritten text and change notes.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not re.search(
            r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form",
            text):
        return

    pagemsg("Processing")

    notes = []

    retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find German section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    if re.search("== *Etymology 1 *==", secbody):
        pagemsg("WARNING: Multiple etymology sections, skipping")
        return

    parsed = blib.parse_text(secbody)

    # Templates seen so far; at most one of each is allowed.
    headt = None
    comparative_of_t = None
    superlative_of_t = None
    inflection_of_t = None
    # Lemma taken from an {{inflection of}} whose ending is superlative.
    need_superlative_of_t_lemma = None
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)

        def do_comparative_superlative_of(pos, existing_t, should_end):
            """Validate the current {{POS of}} template and fix the head POS.

            Returns the current template `t` on success, False on any warning.
            """
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{%s of}}, skipping: %s" %
                    (pos, origt))
                return False
            if existing_t:
                pagemsg(
                    "WARNING: Saw two {{%s of}} templates, skipping: %s and %s"
                    % (pos, unicode(existing_t), origt))
                return False
            if not headt:
                pagemsg(
                    "WARNING: Saw {{%s of}} without head template, skipping: %s"
                    % (pos, origt))
                return False
            if not pagetitle.endswith(should_end):
                pagemsg(
                    "WARNING: Incorrect ending for %s, should be -%s, skipping"
                    % (pos, should_end))
                return False
            param2 = getparam(headt, "2")
            if param2 != "%s adjective" % pos:
                headt.add("2", "%s adjective" % pos)
                notes.append(
                    "convert {{head|de|%s}} to {{head|de|%s adjective}}" %
                    (param2, pos))
            return t

        if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
                "adjective form", "adjective comparative form",
                "adjective superlative form", "participle form"
        ]:
            if headt:
                pagemsg(
                    "WARNING: Saw two head templates, skipping: %s and %s" %
                    (unicode(headt), origt))
                return
            headt = t
        elif tn == "head" and getparam(t, "1") == "de" and getparam(
                t, "2") == "verb form":
            pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
        elif tn == "head":
            pagemsg("WARNING: Saw unrecognized head template, skipping: %s" %
                    origt)
            return
        elif tn == "comparative of":
            comparative_of_t = do_comparative_superlative_of(
                "comparative", comparative_of_t, "er")
            if not comparative_of_t:
                return
        elif tn == "superlative of":
            superlative_of_t = do_comparative_superlative_of(
                "superlative", superlative_of_t, "sten")
            if not superlative_of_t:
                return
        elif tn == "de-adj form of":
            pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" %
                    origt)
            return
        elif tn in ["inflection of", "infl of"]:
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{inflection of}}, skipping: %s"
                    % origt)
                return
            if not headt:
                pagemsg(
                    "WARNING: Saw {{inflection of}} without head template, skipping: %s"
                    % origt)
                return
            if inflection_of_t:
                pagemsg(
                    "WARNING: Saw {{inflection of}} twice, skipping: %s and %s"
                    % (unicode(inflection_of_t), origt))
                return
            inflection_of_t = t
            lemma = getparam(t, "2")
            if getparam(t, "3"):
                pagemsg(
                    "WARNING: Saw alt form in {{inflection of}}, skipping: %s"
                    % origt)
                return
            # Numbered params from 4 onwards are the inflection tags; any
            # named param is unsupported.
            infl_tags = []
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn):
                    pagemsg(
                        "WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s"
                        % (pn, pv, origt))
                    return
                if int(pn) >= 4:
                    infl_tags.append(pv)
            tags = "|".join(infl_tags)
            if tags not in tags_to_ending:
                pagemsg(
                    "WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s"
                    % origt)
                return
            # Rebuild the template from scratch as {{de-adj form of|LEMMA}}.
            del t.params[:]
            ending = tags_to_ending[tags]
            if ending in ["sten", "esten"]:
                need_superlative_of_t_lemma = lemma
            blib.set_template_name(t, "de-adj form of")
            t.add("1", lemma)

            no_explicit = check_if_lemma_and_ending_match_pagetitle(
                lemma, ending, pagetitle, allow_umlaut=True)
            if not no_explicit:
                pagemsg("WARNING: Explicit ending %s required for lemma %s" %
                        (ending, lemma))
                t.add("2", ending)
            notes.append(
                "convert {{inflection of|de|...}} to {{de-adj form of}}")
            if "comd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "comparative adjective form":
                    headt.add("2", "comparative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|comparative adjective form}}"
                        % param2)
            elif "supd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "superlative adjective form":
                    headt.add("2", "superlative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|superlative adjective form}}"
                        % param2)

    secbody = unicode(parsed)

    def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
        """Append an ===Adjective=== section with {{de-adj form of}} after a template.

        Returns (secbody, ok); ok is False only when the text replacement failed.
        """
        lemma = getparam(comparative_superlative_t, "2")
        if check_if_lemma_and_ending_match_pagetitle(lemma,
                                                     ending,
                                                     pagetitle,
                                                     allow_umlaut=False):
            form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
            newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
            secbody, replaced = blib.replace_in_text(
                secbody,
                unicode(comparative_superlative_t),
                unicode(comparative_superlative_t) + newsec,
                pagemsg,
                abort_if_warning=True)
            if not replaced:
                # Log the template passed in, which may be the superlative one,
                # rather than always the comparative template.
                pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
                        (ending, unicode(comparative_superlative_t)))
                return secbody, False
            notes.append("add {{de-adj form of}} for %s" % pos)
        else:
            pagemsg(
                "WARNING: Lemma %s + %s ending %s doesn't match pagetitle" %
                (lemma, pos, ending))
        return secbody, True

    if comparative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t,
                                      "er")
        if not ok:
            return

    if superlative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t,
                                      "sten")
        if not ok:
            return

    if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
        # Insert a lemma-level {{superlative of}} section in front of the
        # already-converted inflected-form section.
        cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
        newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
        secbody, replaced = blib.replace_in_text(secbody,
                                                 cursec,
                                                 newsec + cursec,
                                                 pagemsg,
                                                 abort_if_warning=True)
        if not replaced:
            pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" %
                    unicode(inflection_of_t))
            return
        notes.append("add {{superlative of|de|...}}")

    sections[j] = secbody + sectail
    text = "".join(sections)

    if not notes:
        pagemsg("WARNING: Couldn't convert page")

    return text, notes
Пример #5
0
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head, subsection_with_declts, pagemsg):
  """Convert an old-style German noun headword (and its declension templates) to new format.

  Builds a new {{de-noun|SPEC}} / {{de-proper noun|SPEC}} headword from the
  params of `headt`, and, when `declts` is non-empty, a {{de-ndecl|SPEC}}
  replacing all declension templates. The replacement text is written back into
  `subsections` (modified in place).

  Parameters:
    headt: the headword template to convert.
    declts: list of declension templates (may be empty).
    pagetitle: page title (lemma).
    subsections: list of subsection texts; entries at the two indices below are
      rewritten.
    subsection_with_head: index in `subsections` holding `headt`.
    subsection_with_declts: index in `subsections` holding the declension templates.
    pagemsg: callable taking a message string, used for logging.

  Returns:
    The list of change notes on success (empty when the headword is already
    new-style), or None when a warning caused the conversion to be abandoned.
  """
  notes = []

  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
    """Derive a declension spec from old-style declension templates.

    Returns (declspec, genders, genitives, plurals), or None after logging a
    warning when the templates cannot be handled or disagree with each other.
    """
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      # Gender is whatever follows the last hyphen in the template name.
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s" % (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          # Feminines have no genitive param, so the plural suffix is in 1=.
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s"
              % (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s"
              % (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))

    # Merge the per-template values, preserving first-seen order.
    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
      len(all_decl_gens) != len(first_decl_gens) or len(all_decl_pls) != len(first_decl_pls)
    ):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s"
          % declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s"
          % declts_to_unicode(declts))
      return None

    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)

    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
      """Append the genitive or plural portion of the spec to `declspec`."""
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        # Default forms: leave the slot empty.
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s"
                % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts), headword_part_forms, all_decl_part_forms,
                  declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing"
              % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec

    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    # `ss` comes from the enclosing function; it is assigned before any call here.
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls

  # A headword is old-style when it uses any of these params.
  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3", "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" % (unicode(headt),
      declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes

  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s" % declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
  if ss:
    if not pagetitle.endswith(u"ß"):
      pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s" % declts_to_unicode(declts))
      return
    # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we add .ss to the
    # headword and declension specs.
    pagetitle = re.sub(u"ß$", "ss", pagetitle)

  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    # head= is tolerated only for adjectival declensions. Compare by equality:
    # a substring test against "head" would also accept names like "h" or "ea".
    if pn not in ["1", "2", "3", "4", "m", "f", "old"] and not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and (
        not adjectival or pn != "head"):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return

  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s"
        % declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        # Build an equivalent plain noun declension template for the noun part
        # so it can be analyzed with analyze_declts().
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (gender, "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full, "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]
        def analyze_headword_parts_for_noun(parts, desc):
          """Return the noun word of each two-word 'ADJ NOUN' part, or [] on failure."""
          noun_headword_parts = []
          for part in parts:
            m = re.search("^([^ ]+) ([^ ]+)$", part.strip())
            if not m:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s"
                  % (desc, part, unicode(headt), unicode(declt)))
              return []
            part_adj, part_noun = m.groups()
            noun_headword_parts.append(part_noun)
          return noun_headword_parts
        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")

        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen) for expected_gen in expected_gens for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s"
            % (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
              % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent", "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than the above. Often head= is wrong.
        # Try to update adjective and noun links from head= if given.
        #head = getparam(headt, "head")
        #if head:
        #  m = re.search("^([^ ]*) ([^ ]*)$", head)
        #  if not m:
        #    pagemsg("WARNING: Can't parse head=%s for adjective-noun combination, continuing: head=%s, decl=%s"
        #        % (head, unicode(headt), unicode(declt)))
        #  else:
        #    head_adj_link, head_noun_link = m.groups()
        #    m = re.search(r"\[\[([^][]*)\|([^][]*)\]\]$", head_adj_link)
        #    if m:
        #      adj_link_lemma, adj_link_form = m.groups()
        #      if adj_link_form.startswith(adj_link_lemma):
        #        head_adj_link = "[[%s]]%s" % (adj_link_lemma, adj_link_form[len(adj_link_lemma):])
        #    if head_adj_link != adj_link:
        #      pagemsg("NOTE: Head-derived adjective link %s not same as decl-template-derived adjective link %s, using the former: head=%s, decl=%s"
        #          % (head_adj_link, adj_link, unicode(headt), unicode(declt)))
        #      adj_link = head_adj_link
        #    if head_noun_link != noun_link:
        #      pagemsg("NOTE: Head-derived noun link %s not same as decl-template-derived noun link %s, using the former: head=%s, decl=%s"
        #          % (head_noun_link, noun_link, unicode(headt), unicode(declt)))
        #      noun_link = head_noun_link
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return

  else: # not adjectival
    # Always bind default_equiv so the f=/m= handling below cannot hit an
    # unbound name when there is no single masculine/feminine gender.
    default_equiv = None
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        # Weak noun: a single genitive in -(e)n(s) and a single plural that is
        # either "-" or in -(e)n.
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"])
          and len(pls) == 1 and (pls[0] == "-" or any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"

  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")

  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    if headspec != declspec:
      # The messages below report all declension templates explicitly instead of
      # relying on a loop variable leaked from an earlier list comprehension.
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), unicode(headt), declts_to_unicode(declts)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), declts_to_unicode(declts)))

          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
              ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), declts_to_unicode(declts)))
        return

  if is_proper:
    # Proper-noun headwords never carry .sg; .both is added instead when the
    # is_both flag was set above.
    headspec = headspec.replace(".sg", "")
    if is_both:
      if ".ss" in headspec:
        headspec = headspec.replace(".ss", ".both.ss")
      else:
        headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)

  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head], unicode(headt), newheadt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    # All declension templates are expected to be adjacent, one per line.
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts], declts_existing, newdeclt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext

  return notes
def process_page(page, index, line, respelling, orig_template, repl_template,
                 args):
    """Swap one pronunciation template on a page for a {{fr-IPA}} call.

    `orig_template` is the literal template text to look for in the page.
    `respelling` selects what the new {{fr-IPA}} call carries: "-" or ""
    means skip the page, "y" means emit {{fr-IPA}} with no respelling
    arguments, anything else is a comma-separated list of respellings that
    become positional arguments. Only a `|pos=...` argument is taken over
    from `repl_template`. `line` is used solely in log messages and `args`
    is not read here.

    When explicit respellings are given, every {{fr-conj-auto}} template on
    the page that has no pron= yet also receives a pron= parameter built
    from the respellings (a blank respelling falls back to the page title).

    Returns (new_text, notes), or None when the page is skipped.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    # Guard clauses: nothing to do for "-" or blank respellings.
    if respelling == "-":
        pagemsg("Skipping line with respelling '-': %s" % line)
        return
    if respelling == "":
        pagemsg("WARNING: Skipping blank respelling: %s" % line)
        return

    text = unicode(page.text)
    if orig_template not in text:
        pagemsg("WARNING: Can't find original template %s in text" %
                orig_template)
        return

    notes = []

    # Grab the whole line holding the template so that an accent qualifier
    # on the same line can be reported below.
    line_match = re.search("^.*?%s.*$" % re.escape(orig_template), text, re.M)
    if line_match:
        textline = line_match.group(0)
    else:
        pagemsg("WARNING: Couldn't find template %s in page text" %
                orig_template)
        textline = "(unknown)"

    # Carry over only the |pos= argument of the suggested replacement.
    pos_match = re.search(r"(\|pos=[a-z]+)", repl_template)
    posarg = pos_match.group(1) if pos_match else ""

    use_default_pron = respelling == "y"
    # Comma-separated respellings become pipe-separated positional args.
    respellingarg = (
        "" if use_default_pron else "|" + respelling.replace(",", "|"))
    real_repl = "{{fr-IPA%s%s}}" % (respellingarg, posarg)

    if "{{a|" in textline:
        pagemsg(
            "WARNING: Replacing %s with %s and saw accent spec on line: %s" %
            (orig_template, real_repl, textline))

    text, did_replace = blib.replace_in_text(text, orig_template, real_repl,
                                             pagemsg)
    if did_replace:
        notes.append("semi-manually replace %s with %s" %
                     (orig_template, real_repl))

    if not use_default_pron:
        # The same value is added to every eligible {{fr-conj-auto}}.
        pronarg = ",".join(pron or pagetitle
                           for pron in respelling.split(","))
        parsed = blib.parse_text(text)
        first_conj = False
        for t in parsed.filter_templates():
            if tname(t) != "fr-conj-auto":
                continue
            before = unicode(t)
            if first_conj:
                pagemsg(
                    "WARNING: Saw {{fr-conj-auto}} twice, first=%s, second=%s"
                    % (first_conj, before))
            first_conj = before
            if getparam(t, "pron"):
                pagemsg("WARNING: Already saw pron= param: %s" % before)
                continue
            t.add("pron", pronarg)
            pagemsg("Replaced %s with %s" % (before, unicode(t)))
            notes.append("add pron=%s to {{fr-conj-auto}}" % pronarg)
        text = unicode(parsed)

    return text, notes
Пример #7
0
def process_page(page, index, parsed):
    """Move certain headword-template parameters out into separate templates.

    Walks the templates in `parsed` and, for each one whose name is in one
    of the module-level template sets (and also in `all_templates`), removes
    the affected parameter(s) and builds replacement wikitext that places an
    equivalent standalone template before or after the original one:

    * "plural of"            -> {{sv-obs verb pl|...}} before the template
    * "obsoleted by"         -> {{sv-obs noun form|...}} before the template
    * "val" / "val2"         -> {{ca-val|...}} after the template
    * "comp-of" / "sup-of"   -> {{nc comp of|nl|...}} / {{nc sup of|nl|...}}
                                appended after the template
    * "active" / "ta"        -> {{nc pass of|el|...}} appended after the
                                template
    * "nodot"                -> parameter removed; when it was absent or
                                blank an explicit "." is appended instead

    The replacements are then applied to the raw page text with
    blib.replace_in_text(), and a note is recorded for each replacement that
    succeeded.

    Returns (text, notes), or (None, None) when the page title contains a
    colon but does not start with one of the recognized namespace prefixes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    text = unicode(page.text)

    if ":" in pagetitle and not re.search(
            "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):",
            pagetitle):
        pagemsg(
            "WARNING: Colon in page title and not a recognized namespace to include, skipping page"
        )
        return None, None

    # Collected as (original wikitext, replacement wikitext, note) and
    # applied to `text` only after the whole template walk is done.
    templates_to_replace = []

    for t in parsed.filter_templates():
        tn = tname(t)

        if tn in sv_verb_templates_with_plural_of and tn in all_templates:
            plural_of = getparam(t, "plural of")
            if plural_of:
                # Capture the original text before mutating the template.
                origt = unicode(t)
                rmparam(t, "plural of")
                newt = "{{sv-obs verb pl|%s}} %s" % (plural_of, unicode(t))
                templates_to_replace.append((
                    origt, newt,
                    "move plural of= in {{%s}} to {{sv-obs verb pl}} outside of template"
                    % tn))

        if tn in sv_noun_templates_with_obsoleted_by and tn in all_templates:
            obsoleted_by = getparam(t, "obsoleted by")
            if obsoleted_by:
                origt = unicode(t)
                rmparam(t, "obsoleted by")
                newt = "{{sv-obs noun form|%s}} %s" % (obsoleted_by,
                                                       unicode(t))
                # The note names the parameter actually moved here
                # ("obsoleted by"), not "plural of" as in the verb branch.
                templates_to_replace.append((
                    origt, newt,
                    "move obsoleted by= in {{%s}} to {{sv-obs noun form}} outside of template"
                    % tn))

        if tn in ca_templates_with_val and tn in all_templates:
            val = getparam(t, "val")
            val2 = getparam(t, "val2")
            # val2 is only carried over when val itself is present.
            if val:
                origt = unicode(t)
                rmparam(t, "val")
                rmparam(t, "val2")
                newt = "%s {{ca-val|%s%s}}" % (unicode(t), val,
                                               "|" + val2 if val2 else "")
                templates_to_replace.append(
                    (origt, newt,
                     "move val= in {{%s}} to {{ca-val}} outside of template" %
                     tn))

        if tn in nl_templates_with_comp_of_sup_of and tn in all_templates:
            comp_of = getparam(t, "comp-of")
            sup_of = getparam(t, "sup-of")
            # Turn each non-empty value into the suffix text that will follow
            # the headword template; empty values stay empty strings.
            if comp_of:
                comp_of = ", the {{nc comp of|nl|%s}}" % comp_of
            if sup_of:
                sup_of = ", the {{nc sup of|nl|%s}}" % sup_of
            if comp_of or sup_of:
                origt = unicode(t)
                rmparam(t, "comp-of")
                rmparam(t, "sup-of")
                newt = "%s%s%s" % (unicode(t), comp_of, sup_of)
                templates_to_replace.append((
                    origt, newt,
                    "move comp-of=/sup-of= in {{%s}} to {{nc comp of}}/{{nc sup of}} outside of template"
                    % tn))

        if tn in el_templates_with_active and tn in all_templates:
            active = getparam(t, "active")
            ta = getparam(t, "ta")
            # ta is only carried over (as t=) when active is present.
            if active:
                origt = unicode(t)
                rmparam(t, "active")
                rmparam(t, "ta")
                newt = "%s, {{nc pass of|el|%s%s}}" % (
                    unicode(t), active, "|t=" + ta if ta else "")
                templates_to_replace.append((
                    origt, newt,
                    "move active= in {{%s}} to {{nc pass of}} outside of template"
                    % tn))

        if tn in el_templates_to_move_dot and tn in all_templates:
            origt = unicode(t)
            nodot = getparam(t, "nodot")
            rmparam(t, "nodot")  # in case it's blank
            if nodot:
                templates_to_replace.append(
                    (origt, unicode(t),
                     "remove nodot= from {{%s}}, with changed semantics" % tn))
            else:
                newt = "%s." % unicode(t)
                templates_to_replace.append((
                    origt, newt,
                    "add explicit final period to {{%s}} when nodot= not specified, due to change in semantics"
                    % tn))

    for curr_template, repl_template, note in templates_to_replace:
        text, replaced = blib.replace_in_text(text, curr_template,
                                              repl_template, pagemsg)
        if replaced:
            notes.append(note)

    return text, notes
Пример #8
0
def process_text_on_page(index, pagetitle, text):
    """Strip categories whose full name matches args.regex from page text.

    Two sources of categories are handled:

    * category-adding templates (names listed in the module-level
      topics_templates, catlangname_templates and categorize_templates),
      whose positional parameters 2..29 are read as category names; matching
      ones are dropped, and a template left with none is deleted entirely;
    * raw [[Category:...]] / [[category:...]] / [[CAT:...]] links.

    Afterwards runs of blank lines are collapsed to a single blank line.

    Returns (new_text, notes), or None if an unrecognized language code is
    seen in a catlangname-style template or if removing a piece of text
    fails.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    #if ":" in pagetitle and not re.search("^(Appendix|Reconstruction|Citations):", pagetitle):
    #  return

    def should_remove_cat(cat):
        # Underscores and spaces are treated alike; the whole name must match.
        return re.match(args.regex + "$", cat.replace("_", " "))

    origtext = text
    notes = []
    removed_cats = []
    text_to_remove = []

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        if (tn not in topics_templates and tn not in catlangname_templates
                and tn not in categorize_templates):
            continue

        lang = getparam(t, "1").strip()
        param_values = [
            getparam(t, str(paramno)).strip() for paramno in xrange(2, 30)
        ]
        cats = [value for value in param_values if value]

        kept_cats = []
        for cat in cats:
            # Work out the full category name this template entry produces.
            if tn in topics_templates:
                full_cat = "%s:%s" % (lang, cat)
            elif tn in categorize_templates:
                full_cat = cat
            elif lang in blib.languages_byCode:
                full_cat = "%s %s" % (
                    blib.languages_byCode[lang]["canonicalName"], cat)
            else:
                pagemsg(
                    "WARNING: Saw unrecognized language code '%s'" %
                    lang)
                return

            if not should_remove_cat(full_cat):
                kept_cats.append(cat)
            elif full_cat not in removed_cats:
                removed_cats.append(full_cat)

        # kept_cats is an in-order subset of cats, so equal length means
        # nothing was dropped from this template.
        if len(kept_cats) == len(cats):
            continue

        if not kept_cats:
            # Every category is gone; delete the template text later.
            text_to_remove.append(unicode(t))
            continue

        # Remember the non-numeric parameters so they survive the rebuild.
        named_params = []
        for param in t.params:
            pname = unicode(param.name).strip()
            if re.search("^[0-9]+$", pname):
                continue
            named_params.append(
                (pname, unicode(param.value).strip(), param.showkey))

        origt = unicode(t)
        # Erase all params.
        del t.params[:]
        # Put back new params.
        t.add("1", lang)
        for offset, cat in enumerate(kept_cats):
            t.add(str(offset + 2), cat)
        for pname, pval, showkey in named_params:
            t.add(pname, pval, showkey=showkey, preserve_spacing=False)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    text = unicode(parsed)

    for m in re.finditer(r"\[\[(?:Category|category|CAT):(.*?)\]\]\n?", text):
        raw_cat = m.group(1)
        # Ignore any sort key when testing the category name.
        if not should_remove_cat(re.sub(r"\|.*", "", raw_cat)):
            continue
        text_to_remove.append(m.group(0))
        if raw_cat not in removed_cats:
            removed_cats.append(raw_cat)

    for fragment in text_to_remove:
        text, did_replace = blib.replace_in_text(text,
                                                 fragment,
                                                 "",
                                                 pagemsg,
                                                 no_found_repl_check=True)
        if not did_replace:
            return
        pagemsg("Removed %s" % fragment.replace("\n", r"\n"))

    text = re.sub(r"\n\n+", "\n\n", text)
    if removed_cats:
        notes.append("remove categories: %s" % ",".join(removed_cats))
    elif text != origtext:
        # Only whitespace changed.
        notes.append("condense 3+ newlines")
    return text, notes