def get_license_text(license, name):
    """Build the LaTeX attribution line for an image.

    Args:
        license: Mapping with the keys ``"authors"`` (an iterable of
            strings) and ``"shortname"`` (a string), or a falsy value when
            no licensing information is available.
        name: Name of the image page; it may carry a ``File:`` or
            ``Datei:`` prefix.

    Returns:
        A LaTeX snippet that links to the image page on Wikimedia Commons
        and lists authors and license short name, or a fallback message
        when `license` is falsy.
    """
    # The stripped, otherwise untouched name is the link target.
    page_name = name.strip()

    # The visible label is run through `escape_latex`, loses its namespace
    # prefix and gets `\allowbreak` in front of escaped underscores and
    # hyphens.
    label = escape_latex(name).strip()
    for namespace in ("File:", "Datei:"):
        label = remove_prefix(label, namespace)
    label = label.replace("\\_", "\\allowbreak\\_")
    label = label.replace("-", "\\allowbreak-")

    if not license:
        return "Abb. \\arabic{imagelabel}: could not get licensing information!"

    head = ("Abb. \\arabic{imagelabel}: "
            "\\protect\\href{https://commons.wikimedia.org/wiki/%s}"
            "{\\textbf{%s}} by ") % (page_name, label)
    authors = ", ".join(license["authors"])
    return head + authors + " \\textit{(" + license["shortname"] + ")}"
def transform_template(self, obj):
    """Resolve a ``#lst:`` template into the content of the named section.

    The referenced article is fetched through ``self.api.get_content`` and
    searched for a ``<section begin=NAME />`` ... ``<section end=NAME />``
    pair, where NAME is the template's parameter ``"1"``.

    Returns:
        ``{"type": "included_section", "content": ...}`` with the parsed
        section text, or ``{"type": "error", "message": ...}`` when the
        section markers are not found in the article.

    Raises:
        NotInterested: If the template name does not start with ``#lst:``.
    """
    template_name = obj["name"]
    if not template_name.startswith("#lst:"):
        raise NotInterested()

    article_name = remove_prefix(template_name, "#lst:")
    article = self.api.get_content(article_name)
    section_name = obj["params"]["1"]

    escaped_name = re.escape(section_name)
    begin_marker = r"\<section\s+begin\=[\"']?" + escaped_name + \
                   r"[\"']?\s*\/>"
    end_marker = r"\<section\s+end\=[\"']?" + escaped_name + \
                 r"[\"']?\s*\/\>"

    # DOTALL lets the captured section text span several lines.
    found = re.search(begin_marker + "(.*)" + end_marker, article, re.DOTALL)

    if not found:
        message = "section '{}' of '{}' cannot be included" \
                  .format(section_name, article_name)
        return {"type": "error", "message": message}

    section_text = found.group(1).strip()
    return {
        "type": "included_section",
        "content": parse_content(self.api, self.title, section_text),
    }
def normalize(self, obj, mode):
    """Return `obj` with its ``"formula"`` replaced by a normalized version.

    The formula is normalized through ``self.api.normalize_formula``. In
    ``"tex"`` mode a wrapping ``{\\begin{aligned}`` ... ``\\end{aligned}}``
    is removed from the result.

    Returns:
        `obj` merged with the normalized formula, or an error node
        ``{"type": "error", "message": ...}`` if a ``ValueError`` is raised
        while normalizing.
    """
    try:
        normalized = self.api.normalize_formula(obj["formula"], mode)

        if mode == "tex":
            normalized = remove_suffix(
                remove_prefix(normalized, "{\\begin{aligned}"),
                "\\end{aligned}}")
    except ValueError:
        message = "Wrong formatted formula"
        # TODO: current_section was not set for this class
        log_parser_error(message, obj)

        return {"type": "error", "message": message}

    return merge(obj, {"formula": normalized})
def generate_sitemap_nodes(sitemap_text):
    """Yield the node and annotation specifications found in a sitemap.

    Every line of `sitemap_text` is inspected. Headline lines
    (``= ... =`` up to ``====== ... ======``) and list lines (``* ...``)
    produce node dictionaries of the form:

        { "code": code, "depth": depth, "children": [] }

    where `code` is the stripped text of the node and `depth` is the
    number of equal signs for a headline, or six plus the number of
    asterisks for a list element. A higher depth means the node sits
    deeper in the final tree.

    Lines starting with one of the prefixes in `SITEMAP_ANNOTATIONS`
    additionally produce:

        { "type": "annotation", "value": value, "annotation_type": type }
    """
    # MediaWiki headlines have at most 6 levels (as in HTML). List elements
    # are offset by this maximum so that they always end up below a
    # headline node.
    max_headline_depth = 6

    headline_re = re.compile(
        r"""(={1,%s}) # Equal signs of the headline
            (.*)      # code defining the node
            \1        # Repeatation of the equal signs
         """ % max_headline_depth, re.X)

    list_re = re.compile(
        r"""([*]+) # asteriks of a list element
            (.*)   # code defining a sitemap node
         """, re.X)

    # Each pattern is paired with the depth offset added to its marker length.
    node_patterns = ((headline_re, 0), (list_re, max_headline_depth))

    for line in sitemap_text.splitlines():
        stripped = line.strip()

        for pattern, depth_offset in node_patterns:
            found = pattern.fullmatch(stripped)
            if found is None:
                continue

            yield {
                "code": found.group(2).strip(),
                "depth": depth_offset + len(found.group(1)),
                "children": []
            }

        # Annotations are matched against the unstripped line.
        for an_type, an_prefix in SITEMAP_ANNOTATIONS.items():
            if not line.startswith(an_prefix):
                continue

            yield {
                "type": "annotation",
                "value": remove_prefix(line, an_prefix).strip(),
                "annotation_type": an_type
            }
def transform_element(self, obj):
    """Turn an HTML transclusion element into a ``template`` node.

    Returns:
        * ``None`` if the element's ``about`` id was already registered in
          ``self._template_ids`` or the template has a truthy ``noprint``
          parameter,
        * an error node if the first part of ``data-mw`` holds no
          ``"template"`` entry,
        * otherwise ``{"type": "template", "name": ..., "params": ...}``.
    """
    if lookup(obj, "attrs", "about") in self._template_ids:
        return None

    check(obj, "attrs", "typeof").of([
        "mw:Transclusion",
        "mw:Transclusion mw:Video/Thumb",
        "mw:Transclusion mw:Image"
    ])

    first_part = json.loads(obj["attrs"]["data-mw"])["parts"][0]

    try:
        template = first_part["template"]
    except (TypeError, KeyError):
        return {
            "type": "error",
            "message": "Template spans over several HTML elements."
        }

    name = template["target"]["wt"].strip()

    # labeled section transclusion needs unchanged case.
    if not name.startswith("#lst:"):
        name = name.lower()

    # The "(!" template is not registered: Template includes a table
    # afterwards
    if name != "(!":
        self._template_ids.add(obj["attrs"]["about"])

    name = remove_prefix(name, ":mathe für nicht-freaks: vorlage:")

    wikitext_params = {
        key: value["wt"] for key, value in template["params"].items()
    }

    # Drop every parameter that has a truthy "<key>-noprint" companion.
    params = {}
    for key, value in wikitext_params.items():
        if wikitext_params.get(key + "-noprint", False):
            continue
        params[key] = self.parse_parameter_value(name, key, value)

    # TODO: Find better solution
    if params.get("noprint", False):
        return None

    return {"type": "template", "name": name, "params": params}
def test_remove_prefix(self):
    """Check `remove_prefix` on matching, empty and over-long prefixes."""
    # (text, prefix, expected result)
    cases = (
        ("aa", "a", "a"),
        ("aa", "", "aa"),
        ("aa", "aaa", "aa"),
        ("a 4 2", "a 4", " 2"),
        ("", "a 4", ""),
    )

    for text, prefix, expected in cases:
        self.assertEqual(remove_prefix(text, prefix), expected)
def transform_template(self, obj):
    """Convert a ``template`` node into its specialised node type.

    Handled in this order:

    * templates listed in `BOXSPEC` become a node of the box type with the
      mapped parameters transformed by calling ``self``,
    * ``liste`` becomes a ``list`` node,
    * ``formel`` becomes an ``equation`` node wrapped in an ``align``
      environment, or an error node if its parameter ``"1"`` is not
      exactly one ``inlinemath`` element,
    * ``(!``, ``#invoke:...`` and ``noprint`` are dropped (``None``),
    * ``todo`` is logged and becomes an error node,
    * everything else is logged and becomes a ``notimplemented`` node.
    """
    name = obj["name"]

    for bname, tname, param_names in BOXSPEC:
        if name != tname:
            continue

        box_params = {
            key: self(obj["params"].get(source, None))
            for key, source in param_names.items()
        }
        return merge(box_params, {"type": bname})

    if name == "liste":
        list_params = obj["params"]

        if "liste" in list_params:
            # An already parsed list is passed as parameter "liste".
            sublist = list_params["liste"][0]
            assert sublist["type"] == "list"

            items = sublist["items"]
            ordered = sublist["ordered"]
        else:
            items = [{
                "type": "listitem",
                "content": self(entry)
            } for entry in list_params["item_list"]]
            ordered = list_params.get("type", "") == "ol"

        return {
            "type": "list",
            "items": items,
            "ordered": ordered,
            "spacing": list_params.get("abstand", None)
        }

    if name == "formel":
        content = obj["params"].get("1", [])
        is_single_math = len(content) == 1 and \
            lookup(content, 0, "type") == "inlinemath"

        if not is_single_math:
            message = "Wrong formatted equation"
            details = "Equation source code must be completely contained in just one <math></math>.\n (use \\text{this is not math} macro instead)"
            log_parser_error(message, obj, details, self.current_section)

            return {"type": "error", "message": message}

        formula = content[0]["formula"]

        # Remove an existing align environment first so that the formula
        # is wrapped exactly once below.
        if formula.startswith("\\begin{align}") and \
                formula.endswith("\\end{align}"):
            formula = remove_prefix(formula, "\\begin{align}")
            formula = remove_suffix(formula, "\\end{align}")

        formula = "\\begin{align}" + formula + "\\end{align}"

        return {"type": "equation", "formula": formula}

    if name == "(!":
        return None

    if name.startswith("#invoke:"):
        # Template is header or footer
        return None

    if name == "noprint":
        return None

    if name == "todo":
        message = "Todo-Message in MediaWiki code."
        details = "Check if this TODO shoud be completed for a book release."
        log_parser_error(message, obj, details, self.current_section)

        return {"type": "error", "message": message}

    message = "Parsing of template `{}`".format(name)
    log_parser_error(message, obj, position=self.current_section)

    return {
        "type": "notimplemented",
        "target": obj,
        "message": message
    }
def canonical_image_name(name):
    """Return `name` in the canonical ``File:<name>`` form.

    A leading ``./``, ``Datei:`` and ``File:`` are removed, in that order,
    via `remove_prefix`, before ``File:`` is prepended.
    """
    for prefix in ("./", "Datei:", "File:"):
        name = remove_prefix(name, prefix)

    return "File:" + name