Exemplo n.º 1
0
def clean_wikicode(wc: Wikicode):
    """
    Strip extraneous top-level nodes from an etymology section, then run the
    natural-language passes that build new templates from what is left.

    Only ``Text``, ``Wikilink`` and ``Template`` nodes survive the first step,
    and ``Text`` nodes whose value is empty or whitespace-only are dropped too.
    The follow-up passes are called for their effect on ``wc`` (their return
    values are ignored) and always run in the same fixed order.
    """
    def is_extraneous(node):
        # Text is kept only when it has something besides whitespace in it.
        if isinstance(node, Text):
            return not node.value.strip()
        # Every other node kind is kept only if it is a link or a template.
        return not isinstance(node, (Wikilink, Template))

    for unwanted in wc.filter(recursive=False, matches=is_extraneous):
        wc.remove(unwanted)

    passes = (
        merge_etyl_templates,
        get_plus_combos,
        get_comma_combos,
        get_from_chains,
        remove_links,
    )
    for run_pass in passes:
        run_pass(wc)
Exemplo n.º 2
0
    def parse_section(self, code: Wikicode, section: Wikicode):
        """Process the direct child nodes of ``section`` and record results on ``self.state``.

        Node handling, by exact node type:

        * Types listed in ``ignore_types`` are skipped.
        * ``Template``: the template name is run through ``apply_wikitext`` and
          checked against ``ignore_templates`` both before and after
          ``to_template_name``. A name found in ``ignore_pages_if_template``
          ends processing immediately. Templates matching
          ``re_root_templates_full_str`` or ``re_template_names`` first have
          their well-known parameters extracted and removed; root templates are
          then stored with ``add_result`` and removed from ``code``, while the
          others are passed to ``apply_value`` and any truthy result is parsed
          recursively. Any other template triggers a warning unless its name
          matches ``re_ignore_template_prefixes``.
        * ``Heading``: ``self.state.header`` is padded with ``None`` or
          truncated to ``arg.level - 2`` entries, then the heading title (plain
          text, or a dict for a recognized ``meaning_headers`` template) is
          appended.
        * Anything else produces a warning.

        Args:
            code: Wikicode that handled templates are removed from and that is
                handed to ``apply_value``.
            section: Wikicode whose top-level nodes are iterated.

        Returns:
            Always ``None``; the explicit early ``return None`` only signals
            that a page-ignoring template was found.

        Raises:
            ValueError: if a well-known parameter contains a node type that is
                not Text, Template (root), Wikilink or Comment, or if it mixes
                root templates with text not accepted by ``re_allowed_extras``.
        """
        for arg in section.filter(recursive=False):
            typ = type(arg)
            if typ in ignore_types:
                continue
            elif typ == Template:
                self.apply_wikitext(arg.name)
                name = str(arg.name).strip()
                # The ignore list is consulted for both the raw name and the
                # name returned by to_template_name().
                if name in self.state.page_parser.ignore_templates:
                    continue
                name = self.to_template_name(name)
                if name in self.state.page_parser.ignore_templates:
                    continue
                if name in self.state.page_parser.ignore_pages_if_template:
                    return None  # ignore these pages

                root_match = self.state.page_parser.re_root_templates_full_str.match(
                    name)
                if root_match or (
                        self.state.page_parser.re_template_names and
                        self.state.page_parser.re_template_names.match(name)):
                    # Remove well-known params
                    # Iterate over a copy: matching params are removed from
                    # arg.params inside the loop.
                    for param in list(arg.params):
                        param_name = str(param.name)
                        for re_param in self.state.page_parser.re_well_known_parameters:
                            m = re_param.match(param_name)
                            if not m:
                                continue
                            # Collect the plain-text content of the value and
                            # note whether it embeds any root template.
                            extras = ''
                            has_templates = False
                            for arg2 in param.value.filter(recursive=False):
                                arg2type = type(arg2)
                                if arg2type == Text:
                                    extras += arg2.value
                                elif arg2type == Template and str(
                                        arg2.name
                                ) in self.state.page_parser.root_templates:
                                    has_templates = True
                                elif arg2type == Wikilink:
                                    # Prefer the link's display text, falling
                                    # back to its target title.
                                    extras += str(
                                        arg2.text) if arg2.text else str(
                                            arg2.title)
                                elif arg2type != Comment:
                                    raise ValueError(
                                        f"cannot parse well known param {str(param).strip()}"
                                    )
                            extras = extras.strip()
                            if has_templates and extras != '':
                                allowed_extras = self.state.page_parser.re_allowed_extras
                                if not allowed_extras or not allowed_extras.match(
                                        extras):
                                    raise ValueError(
                                        f"well known param '{str(param).strip()}' has text and templates"
                                    )
                            if has_templates:
                                self.parse_section(code, param.value)
                            elif extras:
                                self.state.add_result('_' + m.group(1),
                                                      param.value.strip())
                            arg.remove(param)
                            # The parameter is now consumed and detached from
                            # the template. Stop testing further patterns so it
                            # is not recorded twice and not removed a second
                            # time (which fails because it is no longer there).
                            break
                    if root_match:
                        self.state.add_result(name, params_to_dict(arg.params))
                        code.remove(arg)
                    else:
                        new_arg = self.apply_value(code, arg)
                        if new_arg:
                            self.parse_section(code, new_arg)

                elif not self.state.page_parser.re_ignore_template_prefixes.match(
                        name):
                    self.warn(
                        f"{self.state.header} {self.word}: Unknown template {arg}, "
                        f"consider adding it to ignore_templates")
            elif typ == Heading:
                # Keep exactly (level - 2) parent entries in the header stack:
                # pad with None when levels were skipped, otherwise truncate.
                if len(self.state.header) < arg.level - 2:
                    self.state.header += [None] * (arg.level - 2 -
                                                   len(self.state.header))
                else:
                    self.state.header = self.state.header[:arg.level - 2]
                self.apply_wikitext(arg.title)
                template = None
                templates = arg.title.filter_templates(recursive=False)
                # Only a title with exactly one top-level template can be
                # turned into a structured header entry.
                if len(templates) == 1:
                    name = str(templates[0].name).strip()
                    if name in self.state.page_parser.meaning_headers:
                        template = {name: params_to_dict(templates[0].params)}
                        code.remove(templates[0])
                # NOTE(review): these two diagnostics use print() while the
                # rest of the method reports through self.warn() - confirm
                # whether that difference is intentional.
                if templates and not template:
                    print(
                        f"{self.state.header} {self.word} unrecognized header template in {arg.title}"
                    )
                # Read the title text only after the removal attempt above.
                text = str(arg.title).strip()
                if template:
                    if text:
                        print(
                            f"{self.state.header} {self.word} has text '{text}' in addition to template {template}"
                        )
                        template['text'] = text
                    self.state.header.append(template)
                else:
                    self.state.header.append(text)
            else:
                self.warn(f"{self.state.header} {self.word}: Ha? {typ}  {arg}")
Exemplo n.º 3
0
 def apply_wikitext(self, code: Wikicode):
     """Run ``self.apply_value`` on each direct child node of ``code``.

     Nothing happens when ``code`` is falsy (for example ``None``). Only the
     top level is visited; nested nodes are not walked here.
     """
     if not code:
         return
     top_level_nodes = code.filter(recursive=False)
     for node in top_level_nodes:
         self.apply_value(code, node)