示例#1
0
def combine_template_chains(wc: Wikicode, new_template_name: str,
                            template_indices: List[int],
                            text_indices: List[int]) -> None:
    """
    Combine templates that are linked via free text into a single wrapper
    template.

    A chain is a run of templates located at positions ``i, i + 2, i + 4, ...``
    of ``wc.nodes`` in which every member except possibly the last is directly
    followed by a node whose index is listed in ``text_indices``. Each chain
    with at least two members is replaced: a new template named
    ``new_template_name`` is inserted before the first member, carrying the
    members as positional parameters ``1..n``, and the members themselves are
    removed from ``wc``. The linking text nodes are not removed here.

    :param wc: wikicode whose top-level nodes are modified in place.
    :param new_template_name: name given to every wrapper template.
    :param template_indices: indices into ``wc.nodes`` of candidate templates;
        presumably in ascending order -- verify against callers.
    :param text_indices: indices into ``wc.nodes`` of the linking text nodes.
    """
    linking_text = set(text_indices)
    chains = []
    chain = []

    for i in template_indices:
        # A pending chain always ends in a template that is followed by
        # linking text. ``i`` can only continue it when it sits right behind
        # that text; otherwise the pending chain is closed first so that
        # unrelated runs are never merged into one template.
        if chain and chain[-1] != i - 2:
            if len(chain) > 1:
                chains.append(chain)
            chain = []

        followed_by_link = i + 1 in linking_text
        if followed_by_link or chain:
            chain.append(i)

        if not followed_by_link:
            # No linking text after this template: the chain ends here.
            if len(chain) > 1:
                chains.append(chain)
            chain = []

    if len(chain) > 1:
        chains.append(chain)

    # Resolve indices to node objects before mutating ``wc``; the insertions
    # and removals below change node positions.
    node_chains = [[wc.nodes[i] for i in members] for members in chains]

    for members in node_chains:
        params = [
            Parameter(str(position), member, showkey=False)
            for position, member in enumerate(members, start=1)
        ]
        wrapper = Template(new_template_name, params=params)
        wc.insert_before(members[0], wrapper, recursive=False)
        for member in members:
            wc.remove(member, recursive=False)
示例#2
0
 def repl_conditional(self, arg: Template, code: Wikicode,
                      index: Union[str, int]):
     """
     Replace the template ``arg`` inside ``code`` with one of its parameters.

     When ``arg`` has a parameter ``index``, that parameter's value is first
     expanded via ``apply_wikitext`` and then substituted for the template:
     as a whitespace-stripped string if the parameter shows its key, or as
     the value object itself otherwise. When the parameter does not exist,
     the template is removed from ``code``.
     """
     if not arg.has(index):
         code.remove(arg)
         return

     chosen = arg.get(index)
     self.apply_wikitext(chosen.value)
     if chosen.showkey:
         replacement = str(chosen.value).strip()
     else:
         replacement = chosen.value
     code.replace(arg, replacement)
示例#3
0
def _leading_word(text: str) -> str:
    """Return the part of ``text`` before its first comma or space, ignoring surrounding whitespace."""
    # NOTE: the separator must not contain an empty alternative (as in
    # ",| |"): since Python 3.7 ``re.split`` also splits on empty matches,
    # which makes the first element always ''.
    return re.split("[, ]", text.strip(), maxsplit=1)[0]


def merge_etyl_templates(wc: Wikicode) -> Wikicode:
    """
    Merge each deprecated ``etyl`` template with the word that follows it.

    For every top-level ``etyl`` template that is not the last node, the next
    node is inspected. If it is free text, a wikilink, or one of the generic
    ``m``/``mention``/``m+``/``langname-mention``/``l``/``link`` templates
    with at least two parameters, the ``etyl`` template is replaced by a
    ``derived-parsed`` template with three positional parameters: the
    language (second ``etyl`` parameter, ``"en"`` when absent), the related
    language, and the word. ``derived-parsed`` means the same as ``derived``
    but is namespaced to differentiate it. When the following node is a
    mention-style template, its language is used as the related language and
    the mention template itself is removed.

    The ``etyl`` template is removed when it cannot be parsed this way,
    including when it (or the mention template) has no parameters at all.

    :param wc: wikicode to rewrite in place.
    :return: the same ``wc`` object.
    """
    mention_names = ("m", "mention", "m+", "langname-mention", "l", "link")

    last_index = len(wc.nodes) - 1
    etyl_indices = [
        i for i, node in enumerate(wc.nodes)
        if isinstance(node, Template) and node.name == "etyl"
        and i < last_index
    ]

    # Removals are deferred so the indices collected above stay valid while
    # iterating; replacing a node one-for-one does not shift positions.
    nodes_to_remove = []
    for i in etyl_indices:
        etyl: Template = wc.nodes[i]
        follower = wc.nodes[i + 1]
        make_new_template = False
        val = None

        # An ``etyl`` without parameters carries no language: unparseable.
        if etyl.params:
            related_language = etyl.params[0]
            language = etyl.params[1] if len(etyl.params) > 1 else "en"

            if isinstance(follower, Text):
                val = _leading_word(follower.value)
                make_new_template = bool(val)
            elif isinstance(follower, Wikilink):
                val = _leading_word(str(follower.text or follower.title))
                make_new_template = bool(val)
            elif (isinstance(follower, Template)
                  and follower.name in mention_names and follower.params):
                # Prefer the mention's own language over the etyl one.
                related_language = follower.params[0]
                if len(follower.params) > 1:
                    val = follower.params[1].value
                    make_new_template = True
                    nodes_to_remove.append(follower)

        if make_new_template:
            params = [
                Parameter(str(position), str(value), showkey=False)
                for position, value in enumerate(
                    [language, related_language, val], start=1)
            ]
            new_template = Template("derived-parsed", params=params)
            wc.replace(etyl, new_template, recursive=False)
        else:
            nodes_to_remove.append(etyl)

    for node in nodes_to_remove:
        wc.remove(node, recursive=False)
    return wc
示例#4
0
def flag_template(self: TemplateParser,
                  code: Wikicode,
                  template: Template,
                  flag,
                  index=None):
    """
    Record ``flag`` on the parser state and drop ``template`` from ``code``.

    When ``index`` is given and ``template`` has that parameter, the
    parameter's value is expanded via ``apply_wikitext`` and substituted for
    the template; otherwise the template is simply removed. In both cases
    ``flag`` is added to ``self.state.flags``.

    :param code: wikicode containing ``template``; modified in place.
    :param template: the template being flagged.
    :param flag: value added to ``self.state.flags``.
    :param index: optional name/position of the parameter to keep.
    """
    if index and template.has(index):
        param = template.get(index)
        self.apply_wikitext(param.value)
        # Substitute the parameter's *value*, not the Parameter object: the
        # object stringifies as "name=value" for keyed parameters and is not
        # a node, so it would be re-parsed piecemeal. Same rule as
        # repl_conditional: keyed values are whitespace-stripped.
        code.replace(
            template,
            str(param.value).strip() if param.showkey else param.value)
    else:
        code.remove(template)
    self.state.flags.add(flag)
示例#5
0
def clean_wikicode(wc: Wikicode):
    """
    Normalize an etymology section in place.

    First drops every top-level node that is neither text, a wikilink nor a
    template, as well as text nodes that contain only whitespace. Then runs
    the rewriting passes in order: etyl merging, plus-combos, comma-combos,
    from-chains and finally link removal.
    """

    def is_extraneous(node) -> bool:
        # Text is kept only when it has non-whitespace content; among the
        # remaining node kinds only wikilinks and templates are kept.
        if isinstance(node, Text):
            return not node.value.strip()
        return not isinstance(node, (Wikilink, Template))

    unwanted_nodes = wc.filter(recursive=False, matches=is_extraneous)
    for unwanted in unwanted_nodes:
        wc.remove(unwanted)

    merge_etyl_templates(wc)
    get_plus_combos(wc)
    get_comma_combos(wc)
    get_from_chains(wc)
    remove_links(wc)
示例#6
0
    def parse_section(self, code: Wikicode, section: Wikicode):
        """
        Walk the top-level nodes of ``section`` and record what they contain
        in ``self.state``.

        Templates whose (normalized) name matches the root-template or
        template-name patterns have their "well-known" parameters extracted
        and removed; root templates are then stored as results and removed
        from ``code``, while the others are expanded through ``apply_value``
        and the expansion is parsed recursively. Headings update the header
        stack in ``self.state.header``. Node types listed in ``ignore_types``
        are skipped; anything else produces a warning.

        :param code: wikicode from which consumed nodes are removed and which
            is handed to ``apply_value`` for in-place expansion.
        :param section: wikicode whose top-level nodes are iterated.
        :return: always ``None``; returns early when a template listed in
            ``ignore_pages_if_template`` is encountered.
        :raises ValueError: if a well-known parameter contains a node that is
            not text, a wikilink, a comment or a root template, or if it mixes
            root templates with text that ``re_allowed_extras`` rejects.
        """
        for arg in section.filter(recursive=False):
            typ = type(arg)
            if typ in ignore_types:
                continue
            elif typ == Template:
                # Expand the template name first, then check the ignore list
                # both before and after name normalization.
                self.apply_wikitext(arg.name)
                name = str(arg.name).strip()
                if name in self.state.page_parser.ignore_templates:
                    continue
                name = self.to_template_name(name)
                if name in self.state.page_parser.ignore_templates:
                    continue
                if name in self.state.page_parser.ignore_pages_if_template:
                    # NOTE(review): the recursive calls below discard this
                    # return value, so the early exit only stops the current
                    # level -- confirm that is intended.
                    return None  # ignore these pages

                root_match = self.state.page_parser.re_root_templates_full_str.match(
                    name)
                if root_match or (
                        self.state.page_parser.re_template_names and
                        self.state.page_parser.re_template_names.match(name)):
                    # Remove well-known params
                    # Iterate over a copy: matching params are removed from
                    # ``arg`` inside the loop.
                    for param in list(arg.params):
                        param_name = str(param.name)
                        for re_param in self.state.page_parser.re_well_known_parameters:
                            m = re_param.match(param_name)
                            if not m:
                                continue
                            # Collect the plain text of the value (text and
                            # wikilink labels) and note whether it contains
                            # any root template.
                            extras = ''
                            has_templates = False
                            for arg2 in param.value.filter(recursive=False):
                                arg2type = type(arg2)
                                if arg2type == Text:
                                    extras += arg2.value
                                elif arg2type == Template and str(
                                        arg2.name
                                ) in self.state.page_parser.root_templates:
                                    has_templates = True
                                elif arg2type == Wikilink:
                                    extras += str(
                                        arg2.text) if arg2.text else str(
                                            arg2.title)
                                elif arg2type != Comment:
                                    # Also reached for templates that are not
                                    # root templates.
                                    raise ValueError(
                                        f"cannot parse well known param {str(param).strip()}"
                                    )
                            extras = extras.strip()
                            # Text next to root templates is only tolerated
                            # when it matches ``re_allowed_extras``.
                            if has_templates and extras != '':
                                allowed_extras = self.state.page_parser.re_allowed_extras
                                if not allowed_extras or not allowed_extras.match(
                                        extras):
                                    raise ValueError(
                                        f"well known param '{str(param).strip()}' has text and templates"
                                    )
                            if has_templates:
                                # Nested root templates are parsed like a
                                # section of their own.
                                self.parse_section(code, param.value)
                            elif extras:
                                # Plain value: stored under '_' plus the first
                                # regex group of the parameter-name match.
                                self.state.add_result('_' + m.group(1),
                                                      param.value.strip())
                            # NOTE(review): there is no ``break`` after this
                            # removal; if a second pattern matched the same
                            # name, ``remove`` would run again -- confirm the
                            # patterns are mutually exclusive.
                            arg.remove(param)
                    if root_match:
                        # Root template: store its remaining params and drop
                        # it from the code.
                        self.state.add_result(name, params_to_dict(arg.params))
                        code.remove(arg)
                    else:
                        # Other known template: expand it and parse whatever
                        # the expansion produced.
                        new_arg = self.apply_value(code, arg)
                        if new_arg:
                            self.parse_section(code, new_arg)

                elif not self.state.page_parser.re_ignore_template_prefixes.match(
                        name):
                    self.warn(
                        f"{self.state.header} {self.word}: Unknown template {arg}, "
                        f"consider adding it to ignore_templates")
            elif typ == Heading:
                # Resize the header stack to exactly ``arg.level - 2`` entries
                # (padding with None or truncating); this heading is appended
                # below.
                if len(self.state.header) < arg.level - 2:
                    self.state.header += [None] * (arg.level - 2 -
                                                   len(self.state.header))
                else:
                    self.state.header = self.state.header[:arg.level - 2]
                self.apply_wikitext(arg.title)
                template = None
                templates = arg.title.filter_templates(recursive=False)
                # A title consisting of exactly one template listed in
                # ``meaning_headers`` is stored as {name: params} and the
                # template is removed from ``code``.
                if len(templates) == 1:
                    name = str(templates[0].name).strip()
                    if name in self.state.page_parser.meaning_headers:
                        template = {name: params_to_dict(templates[0].params)}
                        code.remove(templates[0])
                if templates and not template:
                    print(
                        f"{self.state.header} {self.word} unrecognized header template in {arg.title}"
                    )
                text = str(arg.title).strip()
                if template:
                    if text:
                        print(
                            f"{self.state.header} {self.word} has text '{text}' in addition to template {template}"
                        )
                        template['text'] = text
                    self.state.header.append(template)
                else:
                    self.state.header.append(text)
            else:
                self.warn(f"{self.state.header} {self.word}: Ha? {typ}  {arg}")
示例#7
0
 def apply_value(self, code: Wikicode, arg: Union[Node, Template]):
     """
     Expand a single node of ``code`` in place, dispatching on its exact type.

     * Text and external links are left untouched.
     * Arguments are replaced by the matching entry of ``self.arguments`` or,
       failing that, by their expanded default; arguments with neither are
       left as they are.
     * Templates: parser functions (``#if``, ``#ifeq``, ``#switch``,
       ``#ifexist``) are resolved through ``repl_conditional``; custom
       templates are delegated to ``custom_templates``; templates that must
       not be expanded are left alone; all others are expanded by running a
       nested ``TemplateParser`` over the template page.
     * Parameters, tags, wikilinks and headings have their parts expanded via
       ``apply_wikitext``; ``noinclude`` tags and comments are removed; HTML
       entities are replaced by their unescaped text.

     :param code: wikicode containing ``arg``; modified in place.
     :param arg: the node (or template parameter) to expand.
     :return: the parsed expansion when a template was expanded from its
         template page, otherwise ``None``.
     :raises ValueError: for an unhandled parser function or node type.
     """
     typ = type(arg)
     if typ == Text:
         return
     elif typ == Argument:
         self.apply_wikitext(arg.name)
         arg_name = str(arg.name)
         if arg_name in self.arguments:
             code.replace(arg, self.arguments[arg_name])
         elif arg.default is not None:
             self.apply_wikitext(arg.default)
             code.replace(arg, str(arg.default).strip())
     elif typ == Template:
         self.apply_wikitext(arg.name)
         name = self.to_template_name(str(arg.name).strip())
         if name == '':
             self.warn(f"Template name is blank in {arg}")
             code.remove(arg)
             return
         if name.startswith('safesubst:'):
             name = name[len('safesubst:'):].strip()
         if name.startswith('#'):
             # Parser functions: the text after the colon is part of the
             # template *name*, so positional parameter 1 is the first value
             # after the first pipe.
             if name.startswith('#if:'):
                 # The condition counts as empty when the name consists of a
                 # single node or its second node is blank.
                 # NOTE(review): a literal condition inside the first text
                 # node ("#if: x") is therefore treated as empty -- confirm.
                 self.repl_conditional(
                     arg, code, 2 if len(arg.name.nodes) == 1
                     or arg.name.get(1).strip() == '' else 1)
             elif name.startswith('#ifeq:'):
                 if not arg.has('1'):
                     code.remove(arg)
                 else:
                     val1 = name[len('#ifeq:'):].strip()
                     val2 = arg.get('1')
                     self.apply_wikitext(val2.value)
                     val2 = str(val2.value).strip()
                     # {{#ifeq: a | b | same | different}}: parameter 1 is
                     # the comparison value, 2 the result when both are
                     # equal, 3 the result when they differ.
                     self.repl_conditional(arg, code,
                                           2 if val1 == val2 else 3)
             elif name.startswith('#switch:'):
                 # Lookup order: the switch value itself, then '#default',
                 # then positional parameter 1.
                 key = name[len('#switch:'):].strip()
                 if not arg.has(key):
                     key = '#default'
                     if not arg.has(key):
                         key = '1'
                 self.repl_conditional(arg, code, key)
             elif name.startswith('#ifexist:'):
                 # "Exists" means: known template once the template
                 # namespace prefix is stripped.
                 key = name[len('#ifexist:'):].strip().replace(
                     self.state.page_parser.template_ns, '').strip()
                 self.repl_conditional(
                     arg, code, 1 if key
                     in self.state.page_parser.templates_no_ns else 2)
             else:
                 raise ValueError(f'Unhandled special {name}')
         else:
             # Expand the parameters before deciding how to treat the
             # template itself.
             for param in arg.params:
                 self.apply_value(code, param)
             if name in custom_templates:
                 custom_templates[name](self, code, arg)
             elif (
                 (name in self.state.page_parser.expand_template
                  and not self.state.page_parser.expand_template[name](arg))
                     or name in self.state.page_parser.ignore_templates):
                 # Template must not be expanded: leave it in place.
                 return
             else:
                 template_page = self.state.get_template(name)
                 if template_page:
                     sub_template_params = params_to_dict(arg.params)
                     self.state.add_result('_' + name, sub_template_params)
                     # Run a nested parser over the template page with this
                     # call's parameters, then splice the result in.
                     new_text = TemplateParser(
                         f'{self.template_name}.{name}', self.word,
                         template_page.content, sub_template_params,
                         self.state).run()
                     new_arg = mw_parse(str(new_text).strip())
                     code.replace(arg, new_arg)
                     return new_arg
                 else:
                     self.warn(f"Template {name} is not known")
     elif typ == Parameter:
         self.apply_wikitext(arg.name)
         self.apply_wikitext(arg.value)
     elif typ == Tag:
         if str(arg.tag).strip() == 'noinclude':
             code.remove(arg)
         else:
             self.apply_wikitext(arg.contents)
     elif typ == Wikilink:
         self.apply_wikitext(arg.title)
         self.apply_wikitext(arg.text)
     elif typ == Heading:
         self.apply_wikitext(arg.title)
     elif typ == HTMLEntity:
         code.replace(arg, unescape(str(arg)))
     elif typ == Comment:
         code.remove(arg)
     elif typ == ExternalLink:
         return
     else:
         raise ValueError(f'Unknown type {typ} in {arg}')