def act_on_dict(self, obj):
    """Reject nested boxes and drop thumbnails inside boxes or questions.

    Behaviour by object type:

    * type in ``BOX_TEMPLATES``: an error dict is returned when another
      box is already open; otherwise the type is pushed on
      ``self._outer_boxes`` while the parent implementation processes
      the object.
    * ``image`` with a truthy ``thumbnail`` entry while inside a box or a
      question: ``None`` is returned.
    * ``question``: an error dict is returned when a box is open;
      otherwise ``self._in_question`` is set while the parent
      implementation processes the object.
    * anything else is delegated to the parent implementation unchanged.

    The bookkeeping state is restored in ``finally`` blocks so that an
    exception raised by the parent implementation cannot leave a stale
    entry in ``self._outer_boxes`` or a stuck ``self._in_question`` flag.
    """
    obj_type = lookup(obj, "type")

    if obj_type in BOX_TEMPLATES:
        if self._outer_boxes:
            message = "Box {} inside {} is not allowed" \
                .format(obj["type"], self._outer_boxes)
            return {"type": "error", "message": message}

        self._outer_boxes.append(obj["type"])
        try:
            return super().act_on_dict(obj)
        finally:
            # Always undo the push, even when the parent call raises.
            self._outer_boxes.pop()

    if (self._outer_boxes or self._in_question) and \
            obj_type == "image" and obj["thumbnail"]:
        return None

    if obj_type == "question":
        if self._outer_boxes:
            message = "Box {} inside {} is not allowed" \
                .format(obj["type"], self._outer_boxes)
            return {"type": "error", "message": message}

        self._in_question = True
        try:
            return super().act_on_dict(obj)
        finally:
            # Always clear the flag, even when the parent call raises.
            self._in_question = False

    return super().act_on_dict(obj)
def parse_sitemap(sitemap_text):
    """Parse the sitemap and return a JSON object representing it.

    Only the text following the first ``SITEMAP_DELIMITER`` is parsed.
    Nodes of type ``annotation`` are not inserted into the tree; their
    ``value`` is appended to a list stored on the most recent
    non-annotation node under the key given by ``annotation_type``.

    Arguments:
    sitemap_text -- content of the sitemap (a string)

    Raises ``AssertionError`` when an annotation appears before any
    regular node.
    """
    root = {"children": [], "depth": 0, "code": "Mathe für Nicht-Freaks"}

    # The introduction and the delimiter itself are not needed.
    _, _, stripped_sitemap_text = sitemap_text.partition(SITEMAP_DELIMITER)

    last_node = None
    for node in generate_sitemap_nodes(stripped_sitemap_text):
        if lookup(node, "type") == "annotation":
            assert last_node, "annotation without a preceding sitemap node"
            annotations = last_node.setdefault(node["annotation_type"], [])
            annotations.append(node["value"])
        else:
            last_node = node
            insert_node(root, node)

    return parse_sitemap_node_codes(root)
def transform_element(self, obj):
    """Drop reference elements; decline everything else.

    Returns ``None`` when the ``typeof`` attribute of ``obj`` is
    ``mw:Extension/ref`` or ``mw:Extension/references`` and raises
    ``NotInterested`` for any other object.
    """
    reference_types = ("mw:Extension/ref", "mw:Extension/references")
    if lookup(obj, "attrs", "typeof") not in reference_types:
        raise NotInterested()

    # TODO: Proper parsing of references
    return None
def change_block(self, obj, i, n):
    """Apply ``change_inline`` and discard results that are a lone space.

    Returns ``None`` when the ``data`` entry of the inline result equals
    ``" "``; otherwise returns the inline result unchanged.
    """
    node = self.change_inline(obj, i, n)
    is_lone_space = lookup(node, "data") == " "
    return None if is_lone_space else node
def test_lookup(self):
    """Check ``lookup`` for existing paths, the empty path and misses."""
    obj = {"a": [23, 42], "b": {"e": [74]}, "c": True}

    # Paths that resolve to a scalar value.
    resolvable = [
        (("a", 0), 23),
        (("b", "e", 0), 74),
        (("c",), True),
    ]
    for path, expected in resolvable:
        self.assertEqual(lookup(obj, *path), expected)

    # Empty path yields the object itself; a partial path yields a container.
    self.assertDictEqual(lookup(obj), obj)
    self.assertListEqual(lookup(obj, "a"), [23, 42])

    # Paths that cannot be resolved yield None.
    unresolvable = [(42,), ("a", 42), ("c", "c"), ("b", "e", 0, 0)]
    for path in unresolvable:
        self.assertIsNone(lookup(obj, *path))
def export_question(self, question, out):
    """Write a question and its answer inside an ``mdframed`` environment.

    The frame title is ``Frage`` unless ``question`` carries a truthy
    ``questiontype``, in which case that value is used as the title.
    """
    if lookup(question, "questiontype"):
        frame_title = "frametitle={" + question["questiontype"] + "}"
    else:
        frame_title = "frametitle=Frage"

    mdframed_options = [
        "style=semanticbox,frametitleaboveskip=3pt,innerbottommargin=3pt",
        frame_title,
    ]

    with LatexEnvironment(out, "mdframed", mdframed_options):
        self(question["question"], out)
        # NOTE(review): the answer environment is assumed to be nested
        # inside the frame -- confirm against the original layout.
        with LatexEnvironment(out, "answer*"):
            self(question["answer"], out)
def export_section(self, section, out):
    """Write a sectioning command, the section title and its content.

    Nothing is written when the ``data`` of the first title element
    starts with ``"Baustelle: "``. The LaTeX command is selected by
    ``section["depth"] - 1`` from section, subsection, subsubsection and
    paragraph.
    """
    first_title_text = lookup(section, "title", 0, "data")
    skip_section = first_title_text and \
        first_title_text.startswith("Baustelle: ")
    if skip_section:
        return

    section_types = ["section", "subsection", "subsubsection", "paragraph"]
    command = section_types[section["depth"] - 1]

    out.write("\\" + command + "{")
    self(section["title"], out)
    out.write("}\n\n")

    self(section["content"], out)
def act_on_list(self, lst):
    """Transform every list element in inline or block mode.

    Inline mode (``self.change_inline``) is chosen when any element has a
    ``name`` contained in ``HTML_INLINE_ELEMENTS``; otherwise block mode
    (``self.change_block``) is used. The presence of any ``p`` element
    forces block mode. Each element is passed to the chosen function
    together with its index and the list length; ``None`` results are
    dropped from the returned list.
    """
    has_inline = any(lookup(x, "name") in HTML_INLINE_ELEMENTS for x in lst)
    func = self.change_inline if has_inline else self.change_block

    # Necessary because the header includes a <span> because of
    # calling {{DISPLAYTITLE:...}} which should not happen. This
    # triggers that the root content is handled as inline mode which
    # should not happen.
    # TODO: Find a better solution
    if any(lookup(x, "name") == "p" for x in lst):
        func = self.change_block

    total = len(lst)
    transformed = (func(x, i, total) for i, x in enumerate(lst))
    return [x for x in transformed if x is not None]
def transform_dict(self, obj):
    """Convert a ``table`` element into a ``table`` node.

    When the first child is a ``tbody`` element, its children are used
    as the table content instead of the direct children of ``obj``.
    """
    # The comparisons below are evaluated only for their effect.
    # NOTE(review): presumably ``check`` rejects the object when the
    # looked-up value differs -- confirm against the definition of check.
    check(obj, "type") == "element"
    check(obj, "name") == "table"

    rows = obj["children"]
    first_child_is_tbody = lookup(rows, 0, "name") == "tbody"
    if first_child_is_tbody:
        rows = rows[0]["children"]

    return {"type": "table", "content": self(rows)}
def change_inline(self, obj, i, n):
    """Normalise whitespace of a text node in inline context.

    Non-text objects are handed to ``self(obj)`` unchanged. For text
    objects every run of whitespace is collapsed to a single space;
    leading whitespace is removed for the first element (``i == 0``) and
    trailing whitespace for the last one (``i == n - 1``).

    Returns ``obj`` merged with the new ``data``, or ``None`` when no
    text is left.
    """
    if lookup(obj, "type") != "text":
        return self(obj)

    # One pass replaces each whitespace run by a single space. This is
    # equivalent to first dropping all but the last whitespace character
    # of a run and then mapping that character to a space.
    data = re.sub(r"\s+", " ", obj["data"])

    if i == 0:
        data = data.lstrip()

    if i == n - 1:
        data = data.rstrip()

    if not data:
        return None

    return merge(obj, {"data": data})
def transform_element(self, obj):
    """Turn a transclusion element into a ``template`` node.

    Returns ``None`` for elements whose ``about`` attribute was already
    recorded in ``self._template_ids`` and for templates with a truthy
    ``noprint`` parameter. Returns an error dict when the first part of
    the ``data-mw`` payload has no ``template`` entry. Otherwise returns
    a dict with the template name and its parsed parameters; parameters
    for which a truthy ``<key>-noprint`` parameter exists are omitted.
    """
    if lookup(obj, "attrs", "about") in self._template_ids:
        return None

    check(obj, "attrs", "typeof").of([
        "mw:Transclusion",
        "mw:Transclusion mw:Video/Thumb",
        "mw:Transclusion mw:Image",
    ])

    first_part = json.loads(obj["attrs"]["data-mw"])["parts"][0]

    try:
        template = first_part["template"]
    except (TypeError, KeyError):
        return {
            "type": "error",
            "message": "Template spans over several HTML elements."
        }

    name = template["target"]["wt"].strip()

    # labeled section transclusion needs unchanged case.
    if not name.startswith("#lst:"):
        name = name.lower()

    if name != "(!":
        # Template includes a table afterwards
        self._template_ids.add(obj["attrs"]["about"])

    name = remove_prefix(name, ":mathe für nicht-freaks: vorlage:")

    raw_params = {key: value["wt"]
                  for key, value in template["params"].items()}

    params = {}
    for key, value in raw_params.items():
        if raw_params.get(key + "-noprint", False):
            continue
        params[key] = self.parse_parameter_value(name, key, value)

    # TODO: Find better solution
    if params.get("noprint", False):
        return None

    return {"type": "template", "name": name, "params": params}
def __init__(self, *args):
    """Store the result of ``lookup(*args)`` in ``self._res``."""
    looked_up = lookup(*args)
    self._res = looked_up
def transform_question(self, obj):
    """Drop questions of type ``Verständnisfrage``; decline all others.

    Returns ``None`` when ``questiontype`` equals ``Verständnisfrage``
    and raises ``NotInterested`` otherwise.
    """
    if lookup(obj, "questiontype") != "Verständnisfrage":
        raise NotInterested()

    return None
def transform_template(self, obj):
    """Convert a ``template`` node into its specialised representation.

    Templates listed in ``BOXSPEC`` become box nodes, ``liste`` becomes a
    list node and ``formel`` an equation node. The templates ``(!``,
    ``noprint`` and everything starting with ``#invoke:`` are dropped by
    returning ``None``. ``todo`` templates and malformed equations are
    logged and returned as error nodes; every other template is logged
    and returned as a ``notimplemented`` node.
    """
    name = obj["name"]

    for bname, tname, param_names in BOXSPEC:
        if name != tname:
            continue
        box_params = {
            k: self(obj["params"].get(v, None))
            for k, v in param_names.items()
        }
        return merge(box_params, {"type": bname})

    if name == "liste":
        template_params = obj["params"]

        if "liste" in template_params:
            sublist = template_params["liste"][0]
            assert sublist["type"] == "list"
            items = sublist["items"]
            ordered = sublist["ordered"]
        else:
            items = [{"type": "listitem", "content": self(x)}
                     for x in template_params["item_list"]]
            ordered = template_params.get("type", "") == "ol"

        return {
            "type": "list",
            "items": items,
            "ordered": ordered,
            "spacing": template_params.get("abstand", None)
        }

    if name == "formel":
        formula = obj["params"].get("1", [])
        is_single_math = len(formula) == 1 and \
            lookup(formula, 0, "type") == "inlinemath"

        if not is_single_math:
            message = "Wrong formatted equation"
            details = "Equation source code must be completely contained in just one <math></math>.\n (use \\text{this is not math} macro instead)"
            log_parser_error(message, obj, details, self.current_section)
            return {"type": "error", "message": message}

        formula = formula[0]["formula"]
        already_wrapped = formula.startswith("\\begin{align}") and \
            formula.endswith("\\end{align}")
        if already_wrapped:
            formula = remove_prefix(formula, "\\begin{align}")
            formula = remove_suffix(formula, "\\end{align}")

        # NOTE(review): the re-wrapping is assumed to apply to every
        # formula, not only to already wrapped ones -- confirm against
        # the original layout.
        formula = "\\begin{align}" + formula + "\\end{align}"
        return {"type": "equation", "formula": formula}

    if name == "(!":
        return None

    if name.startswith("#invoke:"):
        # Template is header or footer
        return None

    if name == "noprint":
        return None

    if name == "todo":
        message = "Todo-Message in MediaWiki code."
        details = "Check if this TODO shoud be completed for a book release."
        log_parser_error(message, obj, details, self.current_section)
        return {"type": "error", "message": message}

    message = "Parsing of template `{}`".format(obj["name"])
    log_parser_error(message, obj, position=self.current_section)
    return {
        "type": "notimplemented",
        "target": obj,
        "message": message
    }
def transform_element(self, obj):
    """Convert an HTML element into the corresponding content node.

    Supported elements are mapped to dicts with a ``type`` entry
    (paragraph, i, b, th, tr, td, header, href, strikethrough,
    blockquote, section_start, section_end, entity). Elements with
    ``typeof`` ``mw:Video/Thumb`` are dropped by returning ``None``.
    Disallowed elements (``br``, ``a`` without ``href``, headings other
    than h2/h3, display-space placeholder spans) are returned as error
    nodes. Any other element is logged and returned as a
    ``notimplemented`` node.

    Raises ``AssertionError`` when a link target is neither relative
    (``./``) nor an http(s) URL, or when a section extension does not
    carry the name ``section``.
    """
    name = obj["name"]
    typeof = lookup(obj, "attrs", "typeof")

    if name == "p":
        return {"type": "paragraph", "content": self(obj["children"])}
    elif name == "br":
        message = "<br> not allowed"
        log_parser_error(message, obj, position=self.current_section)
        return {"type": "error", "message": message}
    elif name == "dfn":
        return {"type": "i", "content": self(obj["children"])}
    elif name in ("i", "b", "th", "tr", "td"):
        return {"type": name, "content": self(obj["children"])}
    elif name in ("h2", "h3"):
        return {
            "type": "header",
            # Header begin with h2 in our project -> subtract 1
            "depth": int(name[-1]) - 1,
            "content": self(obj["children"])
        }
    elif name == "a":
        url = obj["attrs"].get("href", "")

        if url:
            if url.startswith("./"):
                # TODO: The URL prefix should not be hardcoded here
                url = "https://de.wikibooks.org/wiki/" + url[2:]

            assert url.startswith(("http://", "https://"))

            return {
                "type": "href",
                "url": url,
                "content": self(obj["children"])
            }
        else:
            message = "<a> tag without `href` url"
            log_parser_error(message, obj, position=self.current_section)
            return {"type": "error", "message": message}
    elif name == "del":
        return {
            "type": "strikethrough",
            "content": self(obj["children"])
        }
    elif name == "blockquote":
        return {"type": "blockquote", "content": self(obj["children"])}
    elif typeof == "mw:Video/Thumb":
        # TODO: Proper parsing of videos
        return None
    elif typeof == "mw:Extension/section":
        data = json.loads(obj["attrs"]["data-mw"])
        assert data["name"] == "section"

        if "begin" in data["attrs"]:
            return {
                "type": "section_start",
                "name": data["attrs"]["begin"]
            }
        elif "end" in data["attrs"]:
            return {
                "type": "section_end",
                "name": data["attrs"]["end"]
            }
        else:
            return {
                "type": "error",
                "message": "section must be either start or end."
            }
    elif name in ("h1", "h4", "h5", "h6"):
        # The message is fully formatted here; it contains no further
        # placeholders, so it is returned as is.
        message = "Heading of depth {} is not allowed".format(name[1:])
        log_parser_error(message, obj, position=self.current_section)
        return {"type": "error", "message": message}
    elif typeof == "mw:Entity":
        # TODO: Are there other entities?
        return {"type": "entity", "kind": " "}
    elif name == "span" and typeof == "mw:DisplaySpace mw:Placeholder":
        msg = "Spans with type {} are not allowed".format(typeof)
        log_parser_error(msg, obj, position=self.current_section)
        return {"type": "error", "message": msg}
    else:
        message = "Parsing of HTML element `{}`".format(name)
        log_parser_error(message, obj, position=self.current_section)
        return {
            "type": "notimplemented",
            "message": message,
            "target": obj
        }