Пример #1
0
def get_license_text(license, name):
    """Build the LaTeX caption text that credits an image.

    The visible label is derived from ``name``: it is passed through
    ``escape_latex``, stripped, freed of a leading ``File:`` and then of a
    leading ``Datei:`` prefix, and finally gets a LaTeX allowbreak command
    inserted in front of every escaped underscore and every hyphen. The
    hyperlink target uses the stripped but otherwise unmodified ``name``.

    Args:
        license: Mapping providing ``"authors"`` (joined with ``", "``) and
            ``"shortname"``, or a falsy value when no licensing information
            is available.
        name: Name of the image.

    Returns:
        The caption string, or a fixed fallback caption when ``license`` is
        falsy.
    """
    raw_name = name.strip()

    label = escape_latex(name).strip()
    for prefix in ("File:", "Datei:"):
        label = remove_prefix(label, prefix)

    # Offer LaTeX additional line break positions inside long file names.
    label = label.replace("\\_", "\\allowbreak\\_").replace(
        "-", "\\allowbreak-")

    if not license:
        return "Abb. \\arabic{imagelabel}: could not get licensing information!"

    caption = "Abb. \\arabic{imagelabel}: \\protect\\href{https://commons.wikimedia.org/wiki/%s}{\\textbf{%s}} by " % (
        raw_name, label)
    authors = ", ".join(license["authors"])

    return caption + authors + " \\textit{(" + license["shortname"] + ")}"
Пример #2
0
        def transform_template(self, obj):
            """Resolve a labeled section transclusion (``#lst:`` template).

            The article named after the ``#lst:`` prefix is fetched through
            ``self.api.get_content`` and searched for the section whose name
            is given in template parameter ``"1"``. The section is delimited
            by ``<section begin=NAME />`` and ``<section end=NAME />``
            markers, with the name optionally quoted.

            Returns:
                ``{"type": "included_section", "content": ...}`` with the
                parsed section on success, or an ``{"type": "error", ...}``
                node when the markers cannot be found.

            Raises:
                NotInterested: If the template name does not start with
                    ``#lst:``.
            """
            template_name = obj["name"]

            if not template_name.startswith("#lst:"):
                raise NotInterested()

            article_name = remove_prefix(template_name, "#lst:")
            article = self.api.get_content(article_name)

            section_name = obj["params"]["1"]
            escaped_name = re.escape(section_name)

            begin = r"\<section\s+begin\=[\"']?" + escaped_name + \
                    r"[\"']?\s*\/>"
            end = r"\<section\s+end\=[\"']?" + escaped_name + \
                    r"[\"']?\s*\/\>"

            # DOTALL lets the section span several lines. The capture group
            # is greedy, so it reaches up to the last matching end marker.
            found = re.search(begin + "(.*)" + end, article, flags=re.DOTALL)

            if found is None:
                message = "section '{}' of '{}' cannot be included" \
                          .format(section_name, article_name)

                return {"type": "error", "message": message}

            section_source = found.group(1).strip()
            content = parse_content(self.api, self.title, section_source)

            return {"type": "included_section", "content": content}
Пример #3
0
        def normalize(self, obj, mode):
            """Return ``obj`` with its formula normalized for ``mode``.

            The formula is normalized through ``self.api.normalize_formula``.
            In ``"tex"`` mode a leading ``{\\begin{aligned}`` and a trailing
            ``\\end{aligned}}`` are removed from the result.

            Returns:
                ``obj`` merged with the normalized ``"formula"``, or an
                ``{"type": "error", ...}`` node if a ``ValueError`` occurred
                during normalization.
            """
            try:
                normalized = self.api.normalize_formula(obj["formula"], mode)

                if mode == "tex":
                    normalized = remove_suffix(
                        remove_prefix(normalized, "{\\begin{aligned}"),
                        "\\end{aligned}}")
            except ValueError:
                error_text = "Wrong formatted formula"
                # TODO: current_section was not set for this class
                log_parser_error(error_text, obj)
                return {"type": "error", "message": error_text}
            else:
                return merge(obj, {"formula": normalized})
Пример #4
0
def generate_sitemap_nodes(sitemap_text):
    """Yield the node specifications found in a sitemap's source code.

    Every line of `sitemap_text` is examined on its own. A headline or list
    line produces a dictionary of the form:

        { "code": code, "depth": depth, "children": [] }

    where `code` is the string describing the node and `depth` is a number
    for the node's depth: the higher it is, the deeper the node needs to be
    placed in the final tree. A line starting with one of the prefixes in
    `SITEMAP_ANNOTATIONS` additionally produces:

        { "type": "annotation", "value": value, "annotation_type": type }
    """
    # In MediaWiki the maximal depth of a headline is 6 (as in HTML).
    # This maximum is added to the depth of list elements so that they
    # always end up below a headline node.
    max_headline_depth = 6

    headline_re = re.compile(
        r"""(={1,%s}) # Equal signs of the headline
                                 (.*)      # code defining the node
                                 \1        # Repeatation of the equal signs
                              """ % max_headline_depth, re.X)

    list_re = re.compile(
        r"""([*]+) # asteriks of a list element
                             (.*)   # code defining a sitemap node
                          """, re.X)

    # Each pattern is paired with the depth offset of the nodes it produces.
    node_patterns = ((headline_re, 0), (list_re, max_headline_depth))

    for line in sitemap_text.splitlines():
        stripped_line = line.strip()

        for pattern, depth_offset in node_patterns:
            found = pattern.fullmatch(stripped_line)

            if found is None:
                continue

            marker, code = found.group(1), found.group(2)

            yield {
                "code": code.strip(),
                "depth": depth_offset + len(marker),
                "children": []
            }

        # Annotation prefixes are tested against the unstripped line.
        for an_type, an_prefix in SITEMAP_ANNOTATIONS.items():
            if not line.startswith(an_prefix):
                continue

            yield {
                "type": "annotation",
                "value": remove_prefix(line, an_prefix).strip(),
                "annotation_type": an_type
            }
Пример #5
0
        def transform_element(self, obj):
            """Convert a transclusion HTML element into a template node.

            The template description is read from the JSON stored in the
            element's ``data-mw`` attribute (first entry of ``"parts"``).

            Returns:
                * ``None`` if the element's ``about`` id was already recorded
                  in ``self._template_ids``, or if the template carries a
                  truthy ``noprint`` parameter.
                * An ``{"type": "error", ...}`` node if the first part holds
                  no ``"template"`` entry.
                * Otherwise ``{"type": "template", "name": ..., "params":
                  ...}``.
            """
            if lookup(obj, "attrs", "about") in self._template_ids:
                return None

            # NOTE(review): presumably rejects elements whose `typeof` is
            # not one of the listed values -- confirm against `check`.
            allowed_types = [
                "mw:Transclusion", "mw:Transclusion mw:Video/Thumb",
                "mw:Transclusion mw:Image"
            ]
            check(obj, "attrs", "typeof").of(allowed_types)

            first_part = json.loads(obj["attrs"]["data-mw"])["parts"][0]

            try:
                template = first_part["template"]
            except (TypeError, KeyError):
                return {
                    "type": "error",
                    "message": "Template spans over several HTML elements."
                }

            name = template["target"]["wt"].strip()

            # labeled section transclusion needs unchanged case.
            if not name.startswith("#lst:"):
                name = name.lower()

            # The id of every template except `(!` is recorded (original
            # note on this case: "Template includes a table afterwards").
            if name != "(!":
                self._template_ids.add(obj["attrs"]["about"])

            name = remove_prefix(name, ":mathe für nicht-freaks: vorlage:")

            wikitext_params = {
                key: value["wt"]
                for key, value in template["params"].items()
            }

            # Skip every parameter `key` for which a truthy `key-noprint`
            # parameter exists.
            params = {}
            for key, value in wikitext_params.items():
                if wikitext_params.get(key + "-noprint", False):
                    continue

                params[key] = self.parse_parameter_value(name, key, value)

            # TODO: Find better solution
            if params.get("noprint", False):
                return None

            return {"type": "template", "name": name, "params": params}
Пример #6
0
 def test_remove_prefix(self):
     """Check `remove_prefix` against (text, prefix, expected) triples."""
     cases = (
         ("aa", "a", "a"),  # prefix is removed only once
         ("aa", "", "aa"),  # empty prefix
         ("aa", "aaa", "aa"),  # prefix longer than the text
         ("a 4 2", "a 4", " 2"),  # remaining whitespace is kept
         ("", "a 4", ""),  # empty text
     )

     for text, prefix, expected in cases:
         self.assertEqual(remove_prefix(text, prefix), expected)
Пример #7
0
        def transform_template(self, obj):
            """Translate a parsed template node into its final node type.

            Handled cases, in this order:

            * Templates listed in ``BOXSPEC`` (triples of box name, template
              name and a mapping from result key to template parameter):
              each mapped parameter is transformed by calling ``self`` and
              the result is merged with the box type.
            * ``liste``: becomes a ``list`` node, taken either from the
              ``liste`` parameter or built from ``item_list``.
            * ``formel``: becomes an ``equation`` node wrapped in an
              ``align`` environment; anything other than exactly one
              ``inlinemath`` element is logged and returned as an error.
            * ``(!``, ``#invoke:...`` and ``noprint``: dropped (``None``).
            * ``todo``: logged and returned as an error node.
            * Everything else: logged and returned as ``notimplemented``.
            """
            for bname, tname, param_names in BOXSPEC:
                if obj["name"] != tname:
                    continue

                box_params = {
                    key: self(obj["params"].get(source, None))
                    for key, source in param_names.items()
                }

                return merge(box_params, {"type": bname})

            name = obj["name"]

            if name == "liste":
                list_params = obj["params"]

                if "liste" in list_params:
                    sublist = list_params["liste"][0]

                    assert sublist["type"] == "list"

                    items = sublist["items"]
                    ordered = sublist["ordered"]
                else:
                    items = [{
                        "type": "listitem",
                        "content": self(entry)
                    } for entry in list_params["item_list"]]
                    ordered = list_params.get("type", "") == "ol"

                return {
                    "type": "list",
                    "items": items,
                    "ordered": ordered,
                    "spacing": list_params.get("abstand", None)
                }

            if name == "formel":
                formula = obj["params"].get("1", [])

                is_single_math = len(formula) == 1 and \
                    lookup(formula, 0, "type") == "inlinemath"

                if not is_single_math:
                    message = "Wrong formatted equation"
                    details = "Equation source code must be completely contained in just one <math></math>.\n (use \\text{this is not math} macro instead)"

                    log_parser_error(message, obj, details,
                                     self.current_section)

                    return {"type": "error", "message": message}

                source = formula[0]["formula"]

                # Drop an align environment that is already present so the
                # formula is not wrapped twice below.
                if source.startswith("\\begin{align}") and \
                        source.endswith("\\end{align}"):
                    source = remove_suffix(
                        remove_prefix(source, "\\begin{align}"),
                        "\\end{align}")

                return {
                    "type": "equation",
                    "formula": "\\begin{align}" + source + "\\end{align}"
                }

            if name == "(!":
                return None

            if name.startswith("#invoke:"):
                # Template is header or footer
                return None

            if name == "noprint":
                return None

            if name == "todo":
                message = "Todo-Message in MediaWiki code."
                details = "Check if this TODO shoud be completed for a book release."
                log_parser_error(message, obj, details, self.current_section)

                return {"type": "error", "message": message}

            message = "Parsing of template `{}`".format(name)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "target": obj,
                "message": message
            }
Пример #8
0
def canonical_image_name(name):
    """Return `name` in the canonical form ``File:<image name>``.

    A leading ``./``, then a leading ``Datei:`` and then a leading ``File:``
    are removed (in this order) before ``File:`` is put in front again.
    """
    for prefix in ("./", "Datei:", "File:"):
        name = remove_prefix(name, prefix)

    return "File:" + name