Пример #1
0
        def act_on_dict(self, obj):
            """Forbid nested boxes and drop thumbnails inside boxes/questions.

            * A box template or a question that appears while a box is already
              open is replaced by an error node.
            * A thumbnail image inside a box or a question is removed
              (``None`` is returned).
            * Everything else is delegated to the parent class.

            The traversal state (``_outer_boxes`` / ``_in_question``) is
            restored in a ``finally`` clause, so an exception raised while the
            children are processed cannot leave the object in a stale
            "inside a box/question" state. ``_in_question`` is restored to its
            previous value instead of being reset to ``False``, so the state of
            an enclosing question is not lost.
            """
            obj_type = lookup(obj, "type")
            is_box = obj_type in BOX_TEMPLATES

            if is_box or obj_type == "question":
                if self._outer_boxes:
                    message = "Box {} inside {} is not allowed" \
                              .format(obj["type"], self._outer_boxes)

                    return {"type": "error", "message": message}

                if is_box:
                    self._outer_boxes.append(obj["type"])

                    try:
                        return super().act_on_dict(obj)
                    finally:
                        self._outer_boxes.pop()

                was_in_question = self._in_question
                self._in_question = True

                try:
                    return super().act_on_dict(obj)
                finally:
                    self._in_question = was_in_question

            if (self._outer_boxes or self._in_question) and \
                    obj_type == "image" and obj["thumbnail"]:
                return None

            return super().act_on_dict(obj)
Пример #2
0
def parse_sitemap(sitemap_text):
    """Parse a sitemap and return a JSON object representing it.

    Only the text after ``SITEMAP_DELIMITER`` is parsed. Annotation nodes are
    not inserted into the tree; their values are collected in a list stored
    on the most recently seen regular node under the annotation's type.

    Arguments:
        sitemap_text -- content of the sitemap (a string)
    """
    # The text before the delimiter and the delimiter itself are not used.
    _, _, body = sitemap_text.partition(SITEMAP_DELIMITER)

    root = {"children": [], "depth": 0, "code": "Mathe für Nicht-Freaks"}
    current = None

    for node in generate_sitemap_nodes(body):
        if lookup(node, "type") != "annotation":
            current = node
            insert_node(root, node)
            continue

        # An annotation must be preceded by the node it belongs to.
        assert current

        current.setdefault(node["annotation_type"], []).append(node["value"])

    return parse_sitemap_node_codes(root)
Пример #3
0
 def transform_element(self, obj):
     """Drop reference elements; decline every other element.

     Returns ``None`` for elements whose ``typeof`` attribute marks them as
     ``mw:Extension/ref`` or ``mw:Extension/references`` and raises
     ``NotInterested`` otherwise.
     """
     typeof = lookup(obj, "attrs", "typeof")

     if typeof not in ("mw:Extension/ref", "mw:Extension/references"):
         raise NotInterested()

     # TODO: Proper parsing of references
     return None
Пример #4
0
        def change_block(self, obj, i, n):
            """Apply ``change_inline`` and drop results that are a lone space.

            Returns ``None`` when the ``data`` of the transformed node is
            exactly ``" "``; otherwise returns the transformed node.
            """
            changed = self.change_inline(obj, i, n)

            return None if lookup(changed, "data") == " " else changed
Пример #5
0
    def test_lookup(self):
        """Check `lookup` for existing paths, the empty path and bad paths."""
        obj = {"a": [23, 42], "b": {"e": [74]}, "c": True}

        # Paths that exist resolve to the stored value.
        existing = [
            (("a", 0), 23),
            (("b", "e", 0), 74),
            (("c",), True),
        ]

        for path, expected in existing:
            self.assertEqual(lookup(obj, *path), expected)

        # An empty path yields the object itself; containers come back whole.
        self.assertDictEqual(lookup(obj), obj)
        self.assertListEqual(lookup(obj, "a"), [23, 42])

        # Paths that cannot be followed yield None.
        for path in [(42,), ("a", 42), ("c", "c"), ("b", "e", 0, 0)]:
            self.assertIsNone(lookup(obj, *path))
Пример #6
0
 def export_question(self, question, out):
     """Write a question into an ``mdframed`` and its answer into ``answer*``.

     The frame title is the question's ``questiontype`` when one is set and
     ``Frage`` otherwise.
     """
     if lookup(question, "questiontype"):
         frametitle = "frametitle={" + question["questiontype"] + "}"
     else:
         frametitle = "frametitle=Frage"

     mdframed_options = [
         "style=semanticbox,frametitleaboveskip=3pt,innerbottommargin=3pt",
         frametitle,
     ]

     with LatexEnvironment(out, "mdframed", mdframed_options):
         self(question["question"], out)

     with LatexEnvironment(out, "answer*"):
         self(question["answer"], out)
Пример #7
0
 def export_section(self, section, out):
     """Write a section heading followed by the section content.

     Sections whose title starts with ``"Baustelle: "`` are skipped
     entirely. The sectioning command is chosen by ``section["depth"]``
     (1 -> ``section`` ... 4 -> ``paragraph``).
     """
     first_title_text = lookup(section, "title", 0, "data")

     if first_title_text and first_title_text.startswith("Baustelle: "):
         return

     commands = ("section", "subsection", "subsubsection", "paragraph")
     command = commands[section["depth"] - 1]

     out.write("\\" + command + "{")
     self(section["title"], out)
     out.write("}\n\n")

     self(section["content"], out)
Пример #8
0
        def act_on_list(self, lst):
            """Transform the items of `lst` in inline or block mode.

            Inline mode is used when at least one item is an HTML inline
            element and no item is a ``<p>``; block mode is used otherwise.
            Items for which the chosen handler returns ``None`` are dropped.
            """
            has_inline = any(
                lookup(item, "name") in HTML_INLINE_ELEMENTS for item in lst)

            # A <p> forces block mode: the header includes a <span> because
            # of calling {{DISPLAYTITLE:...}} (which should not happen), and
            # that span would otherwise put the root content into inline mode.
            # TODO: Find a better solution
            has_paragraph = any(lookup(item, "name") == "p" for item in lst)

            if has_inline and not has_paragraph:
                handler = self.change_inline
            else:
                handler = self.change_block

            result = []

            for index, item in enumerate(lst):
                changed = handler(item, index, len(lst))

                if changed is not None:
                    result.append(changed)

            return result
Пример #9
0
        def transform_dict(self, obj):
            """Convert a ``table`` element into a ``table`` node.

            When the first child is a ``tbody``, its children are used as the
            table content instead of the element's direct children.
            """
            # NOTE(review): the comparison results are deliberately unused;
            # presumably `check` rejects a non-matching node from within its
            # `==` operator -- confirm against the definition of `check`.
            check(obj, "type") == "element"
            check(obj, "name") == "table"

            rows = obj["children"]
            wrapped_in_tbody = lookup(rows, 0, "name") == "tbody"

            if wrapped_in_tbody:
                rows = rows[0]["children"]

            return {"type": "table", "content": self(rows)}
Пример #10
0
        def change_inline(self, obj, i, n):
            """Normalize the whitespace of a text node in inline context.

            Every run of whitespace in the text is collapsed into a single
            space. Leading whitespace is stripped from the first item
            (``i == 0``) and trailing whitespace from the last item
            (``i == n - 1``) of the surrounding list.

            Arguments:
                obj -- the node to transform
                i   -- position of `obj` in the surrounding list
                n   -- length of the surrounding list

            Returns a copy of `obj` with the normalized ``data``, or ``None``
            if no text is left. Nodes that are not text nodes are passed on
            to ``self(obj)``.
            """
            if lookup(obj, "type") != "text":
                return self(obj)

            # One pass replaces the former two-step substitution; the result
            # never contains a newline, so the old debug print for "\n" was
            # unreachable and has been removed.
            data = re.sub(r"\s+", " ", obj["data"])

            if i == 0:
                data = data.lstrip()

            if i == n - 1:
                data = data.rstrip()

            if data:
                return merge(obj, {"data": data})

            return None
Пример #11
0
        def transform_element(self, obj):
            """Convert a transclusion element into a ``template`` node.

            Returns ``None`` for elements whose ``about`` id was already
            recorded and for templates with a truthy ``noprint`` parameter.
            Returns an error node when the first part of the ``data-mw``
            payload does not contain a template. Parameters ``key`` for which
            a truthy ``key-noprint`` parameter exists are left out.
            """
            about = lookup(obj, "attrs", "about")

            if about in self._template_ids:
                return None

            check(obj, "attrs", "typeof").of([
                "mw:Transclusion", "mw:Transclusion mw:Video/Thumb",
                "mw:Transclusion mw:Image"
            ])

            first_part = json.loads(obj["attrs"]["data-mw"])["parts"][0]

            try:
                template = first_part["template"]
            except (TypeError, KeyError):
                return {
                    "type": "error",
                    "message": "Template spans over several HTML elements."
                }

            name = template["target"]["wt"].strip()

            # labeled section transclusion needs unchanged case.
            name = name if name.startswith("#lst:") else name.lower()

            if name != "(!":
                # Template includes a table afterwards
                self._template_ids.add(obj["attrs"]["about"])

            name = remove_prefix(name, ":mathe für nicht-freaks: vorlage:")

            wikitext = {
                key: value["wt"]
                for key, value in template["params"].items()
            }

            params = {}

            for key, value in wikitext.items():
                if wikitext.get(key + "-noprint", False):
                    continue

                params[key] = self.parse_parameter_value(name, key, value)

            # TODO: Find better solution
            if params.get("noprint", False):
                return None

            return {"type": "template", "name": name, "params": params}
Пример #12
0
 def __init__(self, *args):
     """Store the result of ``lookup(*args)`` in ``self._res``.

     All positional arguments are forwarded unchanged to ``lookup``.
     """
     self._res = lookup(*args)
Пример #13
0
 def transform_question(self, obj):
     """Drop questions of type ``Verständnisfrage``; decline all others.

     Returns ``None`` for a matching question and raises ``NotInterested``
     for every other one.
     """
     if lookup(obj, "questiontype") != "Verständnisfrage":
         raise NotInterested()

     return None
Пример #14
0
        def transform_template(self, obj):
            """Turn a template node into its semantic representation.

            * Templates listed in ``BOXSPEC`` become nodes of the box type.
            * ``liste`` becomes a ``list`` node, ``formel`` an ``equation``
              node.
            * ``(!``, ``#invoke:...`` and ``noprint`` are dropped (``None``).
            * ``todo`` and a malformed ``formel`` are logged and become
              ``error`` nodes.
            * Any other template is logged and becomes a ``notimplemented``
              node.
            """
            name = obj["name"]

            for box_type, template_name, param_names in BOXSPEC:
                if name != template_name:
                    continue

                box = {}

                for target, source in param_names.items():
                    box[target] = self(obj["params"].get(source, None))

                return merge(box, {"type": box_type})

            if name == "liste":
                list_params = obj["params"]

                if "liste" in list_params:
                    # The list was already parsed; reuse its items.
                    sublist = list_params["liste"][0]

                    assert sublist["type"] == "list"

                    items, ordered = sublist["items"], sublist["ordered"]
                else:
                    items = [{
                        "type": "listitem",
                        "content": self(entry)
                    } for entry in list_params["item_list"]]
                    ordered = list_params.get("type", "") == "ol"

                return {
                    "type": "list",
                    "items": items,
                    "ordered": ordered,
                    "spacing": list_params.get("abstand", None)
                }

            if name == "formel":
                formula = obj["params"].get("1", [])

                if len(formula) != 1 or \
                        lookup(formula, 0, "type") != "inlinemath":
                    message = "Wrong formatted equation"
                    details = "Equation source code must be completely contained in just one <math></math>.\n (use \\text{this is not math} macro instead)"

                    log_parser_error(message, obj, details,
                                     self.current_section)

                    return {"type": "error", "message": message}

                formula = formula[0]["formula"]
                already_wrapped = (formula.startswith("\\begin{align}")
                                   and formula.endswith("\\end{align}"))

                # Unwrap first so that the result is wrapped exactly once.
                if already_wrapped:
                    formula = remove_prefix(formula, "\\begin{align}")
                    formula = remove_suffix(formula, "\\end{align}")

                formula = "\\begin{align}" + formula + "\\end{align}"

                return {"type": "equation", "formula": formula}

            # `#invoke:` templates are header or footer.
            if name == "(!" or name.startswith("#invoke:") \
                    or name == "noprint":
                return None

            if name == "todo":
                message = "Todo-Message in MediaWiki code."
                details = "Check if this TODO shoud be completed for a book release."
                log_parser_error(message, obj, details, self.current_section)

                return {"type": "error", "message": message}

            message = "Parsing of template `{}`".format(name)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "target": obj,
                "message": message
            }
Пример #15
0
        def transform_element(self, obj):
            """Convert an HTML element node into its semantic representation.

            Known tags are mapped onto node types (``paragraph``, ``i``,
            ``header``, ``href``, ...). Forbidden elements (``<br>``,
            ``<a>`` without ``href``, headings other than h2/h3, display
            space placeholders) are logged and become ``error`` nodes. Videos
            are dropped (``None``). Elements that are not handled are logged
            and become ``notimplemented`` nodes.

            The checks are made in a fixed order: tag based rules come first,
            followed by rules based on the ``typeof`` attribute.
            """
            tag = obj["name"]

            if tag == "p":
                return {"type": "paragraph", "content": self(obj["children"])}

            if tag == "br":
                message = "<br> not allowed"
                log_parser_error(message, obj, position=self.current_section)

                return {"type": "error", "message": message}

            if tag == "dfn":
                return {"type": "i", "content": self(obj["children"])}

            if tag in ("i", "b", "th", "tr", "td"):
                return {"type": tag, "content": self(obj["children"])}

            if tag in ("h2", "h3"):
                # Header begin with h2 in our project -> subtract 1
                depth = int(tag[-1]) - 1

                return {
                    "type": "header",
                    "depth": depth,
                    "content": self(obj["children"])
                }

            if tag == "a":
                url = obj["attrs"].get("href", "")

                if not url:
                    message = "<a> tag without `href` url"
                    log_parser_error(message,
                                     obj,
                                     position=self.current_section)

                    return {"type": "error", "message": message}

                if url.startswith("./"):
                    # TODO: The URL prefix should not be hardcoded here
                    url = "https://de.wikibooks.org/wiki/" + url[2:]

                assert url.startswith(("http://", "https://"))

                return {
                    "type": "href",
                    "url": url,
                    "content": self(obj["children"])
                }

            if tag == "del":
                return {
                    "type": "strikethrough",
                    "content": self(obj["children"])
                }

            if tag == "blockquote":
                return {"type": "blockquote", "content": self(obj["children"])}

            # The remaining rules depend on the `typeof` attribute.
            typeof = lookup(obj, "attrs", "typeof")

            if typeof == "mw:Video/Thumb":
                # TODO: Proper parsing of videos
                return None

            if typeof == "mw:Extension/section":
                data = json.loads(obj["attrs"]["data-mw"])

                assert data["name"] == "section"

                section_attrs = data["attrs"]

                if "begin" in section_attrs:
                    return {
                        "type": "section_start",
                        "name": section_attrs["begin"]
                    }

                if "end" in section_attrs:
                    return {
                        "type": "section_end",
                        "name": section_attrs["end"]
                    }

                return {
                    "type": "error",
                    "message": "section must be either start or end."
                }

            if tag in ("h1", "h4", "h5", "h6"):
                message = "Heading of depth {} is not allowed".format(tag[1:])
                log_parser_error(message, obj, position=self.current_section)

                return {"type": "error", "message": message}

            if typeof == "mw:Entity":
                # TODO: Are there other entities?
                return {"type": "entity", "kind": " "}

            if tag == "span" and typeof == "mw:DisplaySpace mw:Placeholder":
                msg = "Spans with type {} are not allowed".format(typeof)
                log_parser_error(msg, obj, position=self.current_section)

                return {"type": "error", "message": msg}

            message = "Parsing of HTML element `{}`".format(tag)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "message": message,
                "target": obj
            }